diff --git a/src/skill_seekers/benchmark/__init__.py b/src/skill_seekers/benchmark/__init__.py index cdd4861..c842885 100644 --- a/src/skill_seekers/benchmark/__init__.py +++ b/src/skill_seekers/benchmark/__init__.py @@ -33,9 +33,9 @@ from .runner import BenchmarkRunner from .models import BenchmarkReport, Metric __all__ = [ - 'Benchmark', - 'BenchmarkResult', - 'BenchmarkRunner', - 'BenchmarkReport', - 'Metric', + "Benchmark", + "BenchmarkResult", + "BenchmarkRunner", + "BenchmarkReport", + "Metric", ] diff --git a/src/skill_seekers/benchmark/framework.py b/src/skill_seekers/benchmark/framework.py index ba1fb89..8086841 100644 --- a/src/skill_seekers/benchmark/framework.py +++ b/src/skill_seekers/benchmark/framework.py @@ -11,12 +11,7 @@ from typing import Any from collections.abc import Callable from pathlib import Path -from .models import ( - Metric, - TimingResult, - MemoryUsage, - BenchmarkReport -) +from .models import Metric, TimingResult, MemoryUsage, BenchmarkReport class BenchmarkResult: @@ -97,7 +92,7 @@ class BenchmarkResult: memory=self.memory, metrics=self.metrics, system_info=self.system_info, - recommendations=self.recommendations + recommendations=self.recommendations, ) @@ -161,7 +156,7 @@ class Benchmark: operation=operation, duration=duration, iterations=iterations, - avg_duration=duration / iterations if iterations > 1 else duration + avg_duration=duration / iterations if iterations > 1 else duration, ) self.result.add_timing(timing) @@ -201,7 +196,7 @@ class Benchmark: before_mb=mem_before, after_mb=mem_after, peak_mb=peak_memory, - allocated_mb=mem_after - mem_before + allocated_mb=mem_after - mem_before, ) self.result.add_memory(usage) @@ -212,7 +207,7 @@ class Benchmark: *args, operation: str | None = None, track_memory: bool = False, - **kwargs + **kwargs, ) -> Any: """ Measure function execution. @@ -260,17 +255,16 @@ class Benchmark: def load_config(path): return json.load(open(path)) """ + def decorator(func: Callable) -> Callable: @functools.wraps(func) def wrapper(*args, **kwargs): return self.measure( - func, - *args, - operation=operation, - track_memory=track_memory, - **kwargs + func, *args, operation=operation, track_memory=track_memory, **kwargs ) + return wrapper + return decorator def metric(self, name: str, value: float, unit: str): @@ -285,11 +279,7 @@ class Benchmark: Examples: benchmark.metric("pages_per_sec", 12.5, "pages/sec") """ - metric = Metric( - name=name, - value=value, - unit=unit - ) + metric = Metric(name=name, value=value, unit=unit) self.result.add_metric(metric) def recommend(self, text: str): @@ -328,7 +318,7 @@ class Benchmark: path.parent.mkdir(parents=True, exist_ok=True) - with open(path, 'w') as f: + with open(path, "w") as f: f.write(report.model_dump_json(indent=2)) def analyze(self): @@ -339,11 +329,7 @@ class Benchmark: """ # Analyze timing bottlenecks if self.result.timings: - sorted_timings = sorted( - self.result.timings, - key=lambda t: t.duration, - reverse=True - ) + sorted_timings = sorted(self.result.timings, key=lambda t: t.duration, reverse=True) slowest = sorted_timings[0] total_time = sum(t.duration for t in self.result.timings) @@ -351,7 +337,7 @@ class Benchmark: if slowest.duration > total_time * 0.5: self.recommend( f"Bottleneck: '{slowest.operation}' takes " - f"{slowest.duration:.1f}s ({slowest.duration/total_time*100:.0f}% of total)" + f"{slowest.duration:.1f}s ({slowest.duration / total_time * 100:.0f}% of total)" ) # Analyze memory usage @@ -360,8 +346,7 @@ class Benchmark: if peak > 1000: # >1GB self.recommend( - f"High memory usage: {peak:.0f}MB peak. " - "Consider processing in batches." + f"High memory usage: {peak:.0f}MB peak. Consider processing in batches." ) # Check for memory leaks diff --git a/src/skill_seekers/benchmark/models.py b/src/skill_seekers/benchmark/models.py index 107d100..f8da8a7 100644 --- a/src/skill_seekers/benchmark/models.py +++ b/src/skill_seekers/benchmark/models.py @@ -14,8 +14,7 @@ class Metric(BaseModel): value: float = Field(..., description="Metric value") unit: str = Field(..., description="Unit (seconds, bytes, pages/sec, etc.)") timestamp: datetime = Field( - default_factory=datetime.utcnow, - description="When metric was recorded" + default_factory=datetime.utcnow, description="When metric was recorded" ) @@ -48,26 +47,13 @@ class BenchmarkReport(BaseModel): finished_at: datetime = Field(..., description="Finish time") total_duration: float = Field(..., description="Total duration in seconds") - timings: list[TimingResult] = Field( - default_factory=list, - description="Timing results" - ) - memory: list[MemoryUsage] = Field( - default_factory=list, - description="Memory usage results" - ) - metrics: list[Metric] = Field( - default_factory=list, - description="Additional metrics" - ) + timings: list[TimingResult] = Field(default_factory=list, description="Timing results") + memory: list[MemoryUsage] = Field(default_factory=list, description="Memory usage results") + metrics: list[Metric] = Field(default_factory=list, description="Additional metrics") - system_info: dict[str, Any] = Field( - default_factory=dict, - description="System information" - ) + system_info: dict[str, Any] = Field(default_factory=dict, description="System information") recommendations: list[str] = Field( - default_factory=list, - description="Optimization recommendations" + default_factory=list, description="Optimization recommendations" ) @property @@ -89,14 +75,8 @@ class ComparisonReport(BaseModel): baseline: BenchmarkReport = Field(..., description="Baseline benchmark") current: BenchmarkReport = Field(..., description="Current benchmark") - improvements: list[str] = Field( - default_factory=list, - description="Performance improvements" - ) - regressions: list[str] = Field( - default_factory=list, - description="Performance regressions" - ) + improvements: list[str] = Field(default_factory=list, description="Performance improvements") + regressions: list[str] = Field(default_factory=list, description="Performance regressions") speedup_factor: float = Field(..., description="Overall speedup factor") memory_change_mb: float = Field(..., description="Memory usage change (MB)") diff --git a/src/skill_seekers/benchmark/runner.py b/src/skill_seekers/benchmark/runner.py index 1233453..32d62e2 100644 --- a/src/skill_seekers/benchmark/runner.py +++ b/src/skill_seekers/benchmark/runner.py @@ -46,10 +46,7 @@ class BenchmarkRunner: self.output_dir.mkdir(parents=True, exist_ok=True) def run( - self, - name: str, - benchmark_func: Callable[[Benchmark], None], - save: bool = True + self, name: str, benchmark_func: Callable[[Benchmark], None], save: bool = True ) -> BenchmarkReport: """ Run single benchmark. @@ -83,7 +80,7 @@ class BenchmarkRunner: filename = f"{name}_{timestamp}.json" path = self.output_dir / filename - with open(path, 'w') as f: + with open(path, "w") as f: f.write(report.model_dump_json(indent=2)) print(f"📊 Saved benchmark: {path}") @@ -91,9 +88,7 @@ class BenchmarkRunner: return report def run_suite( - self, - benchmarks: dict[str, Callable[[Benchmark], None]], - save: bool = True + self, benchmarks: dict[str, Callable[[Benchmark], None]], save: bool = True ) -> dict[str, BenchmarkReport]: """ Run multiple benchmarks. @@ -122,11 +117,7 @@ class BenchmarkRunner: return reports - def compare( - self, - baseline_path: Path, - current_path: Path - ) -> ComparisonReport: + def compare(self, baseline_path: Path, current_path: Path) -> ComparisonReport: """ Compare two benchmark reports. @@ -215,7 +206,7 @@ class BenchmarkRunner: improvements=improvements, regressions=regressions, speedup_factor=speedup_factor, - memory_change_mb=memory_change_mb + memory_change_mb=memory_change_mb, ) def list_benchmarks(self) -> list[dict[str, Any]]: @@ -237,13 +228,15 @@ class BenchmarkRunner: with open(path) as f: data = json.load(f) - benchmarks.append({ - "name": data["name"], - "path": str(path), - "started_at": data["started_at"], - "duration": data["total_duration"], - "operations": len(data.get("timings", [])) - }) + benchmarks.append( + { + "name": data["name"], + "path": str(path), + "started_at": data["started_at"], + "duration": data["total_duration"], + "operations": len(data.get("timings", [])), + } + ) except Exception: # Skip invalid files continue diff --git a/src/skill_seekers/cli/adaptors/base.py b/src/skill_seekers/cli/adaptors/base.py index 4b8246e..ca02c30 100644 --- a/src/skill_seekers/cli/adaptors/base.py +++ b/src/skill_seekers/cli/adaptors/base.py @@ -74,7 +74,7 @@ class SkillAdaptor(ABC): output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, - preserve_code_blocks: bool = True + preserve_code_blocks: bool = True, ) -> Path: """ Package skill for platform (ZIP, tar.gz, etc.). @@ -282,7 +282,7 @@ class SkillAdaptor(ABC): enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True, - source_file: str = None + source_file: str = None, ) -> list[tuple[str, dict]]: """ Optionally chunk content for RAG platforms. @@ -326,33 +326,31 @@ class SkillAdaptor(ABC): chunk_overlap=max(50, chunk_max_tokens // 10), # 10% overlap preserve_code_blocks=preserve_code_blocks, preserve_paragraphs=True, - min_chunk_size=100 # 100 tokens minimum + min_chunk_size=100, # 100 tokens minimum ) # Chunk the document chunks = chunker.chunk_document( text=content, metadata=metadata, - source_file=source_file or metadata.get('file', 'unknown') + source_file=source_file or metadata.get("file", "unknown"), ) # Convert RAGChunker output format to (text, metadata) tuples result = [] for chunk_dict in chunks: - chunk_text = chunk_dict['page_content'] + chunk_text = chunk_dict["page_content"] chunk_meta = { **metadata, # Base metadata - **chunk_dict['metadata'], # RAGChunker metadata (chunk_index, etc.) - 'is_chunked': True, - 'chunk_id': chunk_dict['chunk_id'] + **chunk_dict["metadata"], # RAGChunker metadata (chunk_index, etc.) + "is_chunked": True, + "chunk_id": chunk_dict["chunk_id"], } result.append((chunk_text, chunk_meta)) return result - def _format_output_path( - self, skill_dir: Path, output_path: Path, suffix: str - ) -> Path: + def _format_output_path(self, skill_dir: Path, output_path: Path, suffix: str) -> Path: """ Generate standardized output path with intelligent format handling. @@ -379,11 +377,13 @@ class SkillAdaptor(ABC): output_str = str(output_path) # Extract the file extension from suffix (e.g., ".json" from "-langchain.json") - correct_ext = suffix.split('.')[-1] if '.' in suffix else '' + correct_ext = suffix.split(".")[-1] if "." in suffix else "" if correct_ext and not output_str.endswith(f".{correct_ext}"): # Replace common incorrect extensions - output_str = output_str.replace(".zip", f".{correct_ext}").replace(".tar.gz", f".{correct_ext}") + output_str = output_str.replace(".zip", f".{correct_ext}").replace( + ".tar.gz", f".{correct_ext}" + ) # Ensure platform suffix is present if not output_str.endswith(suffix): @@ -395,9 +395,7 @@ class SkillAdaptor(ABC): return Path(output_str) - def _generate_deterministic_id( - self, content: str, metadata: dict, format: str = "hex" - ) -> str: + def _generate_deterministic_id(self, content: str, metadata: dict, format: str = "hex") -> str: """ Generate deterministic ID from content and metadata. diff --git a/src/skill_seekers/cli/adaptors/chroma.py b/src/skill_seekers/cli/adaptors/chroma.py index a6b876c..836d937 100644 --- a/src/skill_seekers/cli/adaptors/chroma.py +++ b/src/skill_seekers/cli/adaptors/chroma.py @@ -43,11 +43,7 @@ class ChromaAdaptor(SkillAdaptor): return self._generate_deterministic_id(content, metadata, format="hex") def format_skill_md( - self, - skill_dir: Path, - metadata: SkillMetadata, - enable_chunking: bool = False, - **kwargs + self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs ) -> str: """ Format skill as JSON for Chroma ingestion. @@ -90,9 +86,9 @@ class ChromaAdaptor(SkillAdaptor): content, doc_metadata, enable_chunking=enable_chunking, - chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), - preserve_code_blocks=kwargs.get('preserve_code_blocks', True), - source_file="SKILL.md" + chunk_max_tokens=kwargs.get("chunk_max_tokens", 512), + preserve_code_blocks=kwargs.get("preserve_code_blocks", True), + source_file="SKILL.md", ) # Add all chunks to parallel arrays @@ -120,9 +116,9 @@ class ChromaAdaptor(SkillAdaptor): ref_content, doc_metadata, enable_chunking=enable_chunking, - chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), - preserve_code_blocks=kwargs.get('preserve_code_blocks', True), - source_file=ref_file.name + chunk_max_tokens=kwargs.get("chunk_max_tokens", 512), + preserve_code_blocks=kwargs.get("preserve_code_blocks", True), + source_file=ref_file.name, ) # Add all chunks to parallel arrays @@ -149,7 +145,7 @@ class ChromaAdaptor(SkillAdaptor): output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, - preserve_code_blocks: bool = True + preserve_code_blocks: bool = True, ) -> Path: """ Package skill into JSON file for Chroma. @@ -183,7 +179,7 @@ class ChromaAdaptor(SkillAdaptor): metadata, enable_chunking=enable_chunking, chunk_max_tokens=chunk_max_tokens, - preserve_code_blocks=preserve_code_blocks + preserve_code_blocks=preserve_code_blocks, ) # Write to file @@ -233,7 +229,7 @@ class ChromaAdaptor(SkillAdaptor): except ImportError: return { "success": False, - "message": "chromadb not installed. Run: pip install chromadb" + "message": "chromadb not installed. Run: pip install chromadb", } # Load package @@ -241,8 +237,8 @@ class ChromaAdaptor(SkillAdaptor): data = json.load(f) # Determine client type and configuration - persist_directory = kwargs.get('persist_directory') - chroma_url = kwargs.get('chroma_url') + persist_directory = kwargs.get("persist_directory") + chroma_url = kwargs.get("chroma_url") try: if persist_directory: @@ -253,15 +249,15 @@ class ChromaAdaptor(SkillAdaptor): # Remote HTTP client print(f"🌐 Connecting to ChromaDB at: {chroma_url}") # Parse URL - if '://' in chroma_url: - parts = chroma_url.split('://') + if "://" in chroma_url: + parts = chroma_url.split("://") parts[0] host_port = parts[1] else: host_port = chroma_url - if ':' in host_port: - host, port = host_port.rsplit(':', 1) + if ":" in host_port: + host, port = host_port.rsplit(":", 1) port = int(port) else: host = host_port @@ -276,12 +272,12 @@ class ChromaAdaptor(SkillAdaptor): except Exception as e: return { "success": False, - "message": f"Failed to connect to ChromaDB: {e}\n\nTry:\n pip install chromadb\n chroma run # Start local server" + "message": f"Failed to connect to ChromaDB: {e}\n\nTry:\n pip install chromadb\n chroma run # Start local server", } # Get or create collection - collection_name = kwargs.get('collection_name', data.get('collection_name', 'skill_docs')) - distance_function = kwargs.get('distance_function', 'cosine') + collection_name = kwargs.get("collection_name", data.get("collection_name", "skill_docs")) + distance_function = kwargs.get("distance_function", "cosine") try: # Try to get existing collection @@ -291,62 +287,57 @@ class ChromaAdaptor(SkillAdaptor): try: # Create new collection metadata = {"hnsw:space": distance_function} - collection = client.create_collection( - name=collection_name, - metadata=metadata - ) + collection = client.create_collection(name=collection_name, metadata=metadata) print(f"✅ Created collection: {collection_name} (distance: {distance_function})") except Exception as e: return { "success": False, - "message": f"Failed to create collection '{collection_name}': {e}" + "message": f"Failed to create collection '{collection_name}': {e}", } # Handle embeddings - embedding_function = kwargs.get('embedding_function') + embedding_function = kwargs.get("embedding_function") try: - if embedding_function == 'openai': + if embedding_function == "openai": # Generate embeddings with OpenAI print("🔄 Generating OpenAI embeddings...") embeddings = self._generate_openai_embeddings( - data['documents'], - api_key=kwargs.get('openai_api_key') + data["documents"], api_key=kwargs.get("openai_api_key") ) collection.add( - documents=data['documents'], - metadatas=data['metadatas'], - ids=data['ids'], - embeddings=embeddings + documents=data["documents"], + metadatas=data["metadatas"], + ids=data["ids"], + embeddings=embeddings, ) - elif embedding_function == 'sentence-transformers': + elif embedding_function == "sentence-transformers": # Use sentence-transformers print("🔄 Generating sentence-transformer embeddings...") try: from chromadb.utils import embedding_functions + ef = embedding_functions.SentenceTransformerEmbeddingFunction() - embeddings = [ef([doc])[0] for doc in data['documents']] + embeddings = [ef([doc])[0] for doc in data["documents"]] collection.add( - documents=data['documents'], - metadatas=data['metadatas'], - ids=data['ids'], - embeddings=embeddings + documents=data["documents"], + metadatas=data["metadatas"], + ids=data["ids"], + embeddings=embeddings, ) except ImportError: return { "success": False, - "message": "sentence-transformers not installed. Run: pip install sentence-transformers" + "message": "sentence-transformers not installed. Run: pip install sentence-transformers", } else: # No embeddings - Chroma will auto-generate print("🔄 Using Chroma's default embedding function...") collection.add( - documents=data['documents'], - metadatas=data['metadatas'], - ids=data['ids'] + documents=data["documents"], metadatas=data["metadatas"], ids=data["ids"] ) - count = len(data['documents']) + count = len(data["documents"]) print(f"✅ Uploaded {count} documents to ChromaDB") print(f"📊 Collection '{collection_name}' now has {collection.count()} total documents") @@ -355,19 +346,14 @@ class ChromaAdaptor(SkillAdaptor): "message": f"Uploaded {count} documents to ChromaDB collection '{collection_name}'", "collection": collection_name, "count": count, - "url": f"{chroma_url}/collections/{collection_name}" if chroma_url else None + "url": f"{chroma_url}/collections/{collection_name}" if chroma_url else None, } except Exception as e: - return { - "success": False, - "message": f"Upload failed: {e}" - } + return {"success": False, "message": f"Upload failed: {e}"} def _generate_openai_embeddings( - self, - documents: list[str], - api_key: str = None + self, documents: list[str], api_key: str = None ) -> list[list[float]]: """ Generate embeddings using OpenAI API. @@ -380,12 +366,13 @@ class ChromaAdaptor(SkillAdaptor): List of embedding vectors """ import os + try: from openai import OpenAI except ImportError: raise ImportError("openai not installed. Run: pip install openai") from None - api_key = api_key or os.getenv('OPENAI_API_KEY') + api_key = api_key or os.getenv("OPENAI_API_KEY") if not api_key: raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key") @@ -398,14 +385,14 @@ class ChromaAdaptor(SkillAdaptor): print(f" Generating embeddings for {len(documents)} documents...") for i in range(0, len(documents), batch_size): - batch = documents[i:i+batch_size] + batch = documents[i : i + batch_size] try: response = client.embeddings.create( input=batch, - model="text-embedding-3-small" # Cheapest, fastest + model="text-embedding-3-small", # Cheapest, fastest ) embeddings.extend([item.embedding for item in response.data]) - print(f" ✓ Processed {min(i+batch_size, len(documents))}/{len(documents)}") + print(f" ✓ Processed {min(i + batch_size, len(documents))}/{len(documents)}") except Exception as e: raise Exception(f"OpenAI embedding generation failed: {e}") from e diff --git a/src/skill_seekers/cli/adaptors/claude.py b/src/skill_seekers/cli/adaptors/claude.py index 82ec1bc..503ca1d 100644 --- a/src/skill_seekers/cli/adaptors/claude.py +++ b/src/skill_seekers/cli/adaptors/claude.py @@ -81,7 +81,14 @@ version: {metadata.version} {content_body} """ - def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path: + def package( + self, + skill_dir: Path, + output_path: Path, + enable_chunking: bool = False, + chunk_max_tokens: int = 512, + preserve_code_blocks: bool = True, + ) -> Path: """ Package skill into ZIP file for Claude. diff --git a/src/skill_seekers/cli/adaptors/faiss_helpers.py b/src/skill_seekers/cli/adaptors/faiss_helpers.py index 2a4480a..62c8539 100644 --- a/src/skill_seekers/cli/adaptors/faiss_helpers.py +++ b/src/skill_seekers/cli/adaptors/faiss_helpers.py @@ -46,11 +46,7 @@ class FAISSHelpers(SkillAdaptor): return self._generate_deterministic_id(content, metadata, format="hex") def format_skill_md( - self, - skill_dir: Path, - metadata: SkillMetadata, - enable_chunking: bool = False, - **kwargs + self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs ) -> str: """ Format skill as JSON for FAISS ingestion. @@ -92,9 +88,9 @@ class FAISSHelpers(SkillAdaptor): content, doc_metadata, enable_chunking=enable_chunking, - chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), - preserve_code_blocks=kwargs.get('preserve_code_blocks', True), - source_file="SKILL.md" + chunk_max_tokens=kwargs.get("chunk_max_tokens", 512), + preserve_code_blocks=kwargs.get("preserve_code_blocks", True), + source_file="SKILL.md", ) # Add all chunks to parallel arrays @@ -121,9 +117,9 @@ class FAISSHelpers(SkillAdaptor): ref_content, doc_metadata, enable_chunking=enable_chunking, - chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), - preserve_code_blocks=kwargs.get('preserve_code_blocks', True), - source_file=ref_file.name + chunk_max_tokens=kwargs.get("chunk_max_tokens", 512), + preserve_code_blocks=kwargs.get("preserve_code_blocks", True), + source_file=ref_file.name, ) # Add all chunks to parallel arrays @@ -160,7 +156,7 @@ class FAISSHelpers(SkillAdaptor): output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, - preserve_code_blocks: bool = True + preserve_code_blocks: bool = True, ) -> Path: """ Package skill into JSON file for FAISS. @@ -193,7 +189,7 @@ class FAISSHelpers(SkillAdaptor): metadata, enable_chunking=enable_chunking, chunk_max_tokens=chunk_max_tokens, - preserve_code_blocks=preserve_code_blocks + preserve_code_blocks=preserve_code_blocks, ) # Write to file diff --git a/src/skill_seekers/cli/adaptors/gemini.py b/src/skill_seekers/cli/adaptors/gemini.py index 692480f..af74a8a 100644 --- a/src/skill_seekers/cli/adaptors/gemini.py +++ b/src/skill_seekers/cli/adaptors/gemini.py @@ -86,7 +86,14 @@ See the references directory for complete documentation with examples and best p # Return plain markdown (NO frontmatter) return content_body - def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path: + def package( + self, + skill_dir: Path, + output_path: Path, + enable_chunking: bool = False, + chunk_max_tokens: int = 512, + preserve_code_blocks: bool = True, + ) -> Path: """ Package skill into tar.gz file for Gemini. diff --git a/src/skill_seekers/cli/adaptors/haystack.py b/src/skill_seekers/cli/adaptors/haystack.py index eb9541f..7876ccc 100644 --- a/src/skill_seekers/cli/adaptors/haystack.py +++ b/src/skill_seekers/cli/adaptors/haystack.py @@ -29,11 +29,7 @@ class HaystackAdaptor(SkillAdaptor): DEFAULT_API_ENDPOINT = None # No upload endpoint def format_skill_md( - self, - skill_dir: Path, - metadata: SkillMetadata, - enable_chunking: bool = False, - **kwargs + self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs ) -> str: """ Format skill as JSON array of Haystack Documents. @@ -73,17 +69,19 @@ class HaystackAdaptor(SkillAdaptor): content, doc_meta, enable_chunking=enable_chunking, - chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), - preserve_code_blocks=kwargs.get('preserve_code_blocks', True), - source_file="SKILL.md" + chunk_max_tokens=kwargs.get("chunk_max_tokens", 512), + preserve_code_blocks=kwargs.get("preserve_code_blocks", True), + source_file="SKILL.md", ) # Add all chunks as documents for chunk_text, chunk_meta in chunks: - documents.append({ - "content": chunk_text, - "meta": chunk_meta, - }) + documents.append( + { + "content": chunk_text, + "meta": chunk_meta, + } + ) # Convert all reference files using base helper method for ref_file, ref_content in self._iterate_references(skill_dir): @@ -104,17 +102,19 @@ class HaystackAdaptor(SkillAdaptor): ref_content, doc_meta, enable_chunking=enable_chunking, - chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), - preserve_code_blocks=kwargs.get('preserve_code_blocks', True), - source_file=ref_file.name + chunk_max_tokens=kwargs.get("chunk_max_tokens", 512), + preserve_code_blocks=kwargs.get("preserve_code_blocks", True), + source_file=ref_file.name, ) # Add all chunks as documents for chunk_text, chunk_meta in chunks: - documents.append({ - "content": chunk_text, - "meta": chunk_meta, - }) + documents.append( + { + "content": chunk_text, + "meta": chunk_meta, + } + ) # Return as formatted JSON return json.dumps(documents, indent=2, ensure_ascii=False) @@ -125,7 +125,7 @@ class HaystackAdaptor(SkillAdaptor): output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, - preserve_code_blocks: bool = True + preserve_code_blocks: bool = True, ) -> Path: """ Package skill into JSON file for Haystack. @@ -159,7 +159,7 @@ class HaystackAdaptor(SkillAdaptor): metadata, enable_chunking=enable_chunking, chunk_max_tokens=chunk_max_tokens, - preserve_code_blocks=preserve_code_blocks + preserve_code_blocks=preserve_code_blocks, ) # Write to file diff --git a/src/skill_seekers/cli/adaptors/langchain.py b/src/skill_seekers/cli/adaptors/langchain.py index 4481384..d937290 100644 --- a/src/skill_seekers/cli/adaptors/langchain.py +++ b/src/skill_seekers/cli/adaptors/langchain.py @@ -29,11 +29,7 @@ class LangChainAdaptor(SkillAdaptor): DEFAULT_API_ENDPOINT = None # No upload endpoint def format_skill_md( - self, - skill_dir: Path, - metadata: SkillMetadata, - enable_chunking: bool = False, - **kwargs + self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs ) -> str: """ Format skill as JSON array of LangChain Documents. @@ -73,17 +69,14 @@ class LangChainAdaptor(SkillAdaptor): content, doc_metadata, enable_chunking=enable_chunking, - chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), - preserve_code_blocks=kwargs.get('preserve_code_blocks', True), - source_file="SKILL.md" + chunk_max_tokens=kwargs.get("chunk_max_tokens", 512), + preserve_code_blocks=kwargs.get("preserve_code_blocks", True), + source_file="SKILL.md", ) # Add all chunks to documents for chunk_text, chunk_meta in chunks: - documents.append({ - "page_content": chunk_text, - "metadata": chunk_meta - }) + documents.append({"page_content": chunk_text, "metadata": chunk_meta}) # Convert all reference files using base helper method for ref_file, ref_content in self._iterate_references(skill_dir): @@ -104,17 +97,14 @@ class LangChainAdaptor(SkillAdaptor): ref_content, doc_metadata, enable_chunking=enable_chunking, - chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), - preserve_code_blocks=kwargs.get('preserve_code_blocks', True), - source_file=ref_file.name + chunk_max_tokens=kwargs.get("chunk_max_tokens", 512), + preserve_code_blocks=kwargs.get("preserve_code_blocks", True), + source_file=ref_file.name, ) # Add all chunks to documents for chunk_text, chunk_meta in chunks: - documents.append({ - "page_content": chunk_text, - "metadata": chunk_meta - }) + documents.append({"page_content": chunk_text, "metadata": chunk_meta}) # Return as formatted JSON return json.dumps(documents, indent=2, ensure_ascii=False) @@ -125,7 +115,7 @@ class LangChainAdaptor(SkillAdaptor): output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, - preserve_code_blocks: bool = True + preserve_code_blocks: bool = True, ) -> Path: """ Package skill into JSON file for LangChain. @@ -162,7 +152,7 @@ class LangChainAdaptor(SkillAdaptor): metadata, enable_chunking=enable_chunking, chunk_max_tokens=chunk_max_tokens, - preserve_code_blocks=preserve_code_blocks + preserve_code_blocks=preserve_code_blocks, ) # Write to file diff --git a/src/skill_seekers/cli/adaptors/llama_index.py b/src/skill_seekers/cli/adaptors/llama_index.py index f4a0637..7ea6ed9 100644 --- a/src/skill_seekers/cli/adaptors/llama_index.py +++ b/src/skill_seekers/cli/adaptors/llama_index.py @@ -42,11 +42,7 @@ class LlamaIndexAdaptor(SkillAdaptor): return self._generate_deterministic_id(content, metadata, format="hex") def format_skill_md( - self, - skill_dir: Path, - metadata: SkillMetadata, - enable_chunking: bool = False, - **kwargs + self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs ) -> str: """ Format skill as JSON array of LlamaIndex Nodes. @@ -88,19 +84,21 @@ class LlamaIndexAdaptor(SkillAdaptor): content, node_metadata, enable_chunking=enable_chunking, - chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), - preserve_code_blocks=kwargs.get('preserve_code_blocks', True), - source_file="SKILL.md" + chunk_max_tokens=kwargs.get("chunk_max_tokens", 512), + preserve_code_blocks=kwargs.get("preserve_code_blocks", True), + source_file="SKILL.md", ) # Add all chunks as nodes for chunk_text, chunk_meta in chunks: - nodes.append({ - "text": chunk_text, - "metadata": chunk_meta, - "id_": self._generate_node_id(chunk_text, chunk_meta), - "embedding": None, - }) + nodes.append( + { + "text": chunk_text, + "metadata": chunk_meta, + "id_": self._generate_node_id(chunk_text, chunk_meta), + "embedding": None, + } + ) # Convert all reference files using base helper method for ref_file, ref_content in self._iterate_references(skill_dir): @@ -121,19 +119,21 @@ class LlamaIndexAdaptor(SkillAdaptor): ref_content, node_metadata, enable_chunking=enable_chunking, - chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), - preserve_code_blocks=kwargs.get('preserve_code_blocks', True), - source_file=ref_file.name + chunk_max_tokens=kwargs.get("chunk_max_tokens", 512), + preserve_code_blocks=kwargs.get("preserve_code_blocks", True), + source_file=ref_file.name, ) # Add all chunks as nodes for chunk_text, chunk_meta in chunks: - nodes.append({ - "text": chunk_text, - "metadata": chunk_meta, - "id_": self._generate_node_id(chunk_text, chunk_meta), - "embedding": None, - }) + nodes.append( + { + "text": chunk_text, + "metadata": chunk_meta, + "id_": self._generate_node_id(chunk_text, chunk_meta), + "embedding": None, + } + ) # Return as formatted JSON return json.dumps(nodes, indent=2, ensure_ascii=False) @@ -144,7 +144,7 @@ class LlamaIndexAdaptor(SkillAdaptor): output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, - preserve_code_blocks: bool = True + preserve_code_blocks: bool = True, ) -> Path: """ Package skill into JSON file for LlamaIndex. @@ -178,7 +178,7 @@ class LlamaIndexAdaptor(SkillAdaptor): metadata, enable_chunking=enable_chunking, chunk_max_tokens=chunk_max_tokens, - preserve_code_blocks=preserve_code_blocks + preserve_code_blocks=preserve_code_blocks, ) # Write to file diff --git a/src/skill_seekers/cli/adaptors/markdown.py b/src/skill_seekers/cli/adaptors/markdown.py index 057f662..5d60033 100644 --- a/src/skill_seekers/cli/adaptors/markdown.py +++ b/src/skill_seekers/cli/adaptors/markdown.py @@ -81,7 +81,14 @@ Browse the reference files for detailed information on each topic. All files are # Return pure markdown (no frontmatter, no special formatting) return content_body - def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path: + def package( + self, + skill_dir: Path, + output_path: Path, + enable_chunking: bool = False, + chunk_max_tokens: int = 512, + preserve_code_blocks: bool = True, + ) -> Path: """ Package skill into ZIP file with markdown documentation. diff --git a/src/skill_seekers/cli/adaptors/openai.py b/src/skill_seekers/cli/adaptors/openai.py index 5384238..e6437af 100644 --- a/src/skill_seekers/cli/adaptors/openai.py +++ b/src/skill_seekers/cli/adaptors/openai.py @@ -103,7 +103,14 @@ Always prioritize accuracy by consulting the attached documentation files before # Return plain text instructions (NO frontmatter) return content_body - def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path: + def package( + self, + skill_dir: Path, + output_path: Path, + enable_chunking: bool = False, + chunk_max_tokens: int = 512, + preserve_code_blocks: bool = True, + ) -> Path: """ Package skill into ZIP file for OpenAI Assistants. diff --git a/src/skill_seekers/cli/adaptors/qdrant.py b/src/skill_seekers/cli/adaptors/qdrant.py index b74815e..d201510 100644 --- a/src/skill_seekers/cli/adaptors/qdrant.py +++ b/src/skill_seekers/cli/adaptors/qdrant.py @@ -44,11 +44,7 @@ class QdrantAdaptor(SkillAdaptor): return self._generate_deterministic_id(content, metadata, format="uuid5") def format_skill_md( - self, - skill_dir: Path, - metadata: SkillMetadata, - enable_chunking: bool = False, - **kwargs + self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs ) -> str: """ Format skill as Qdrant collection JSON. @@ -87,30 +83,35 @@ class QdrantAdaptor(SkillAdaptor): content, payload_meta, enable_chunking=enable_chunking, - chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), - preserve_code_blocks=kwargs.get('preserve_code_blocks', True), - source_file="SKILL.md" + chunk_max_tokens=kwargs.get("chunk_max_tokens", 512), + preserve_code_blocks=kwargs.get("preserve_code_blocks", True), + source_file="SKILL.md", ) # Add all chunks as points for chunk_text, chunk_meta in chunks: - point_id = self._generate_point_id(chunk_text, { - "source": chunk_meta.get("source", metadata.name), - "file": chunk_meta.get("file", "SKILL.md") - }) - - points.append({ - "id": point_id, - "vector": None, # User will generate embeddings - "payload": { - "content": chunk_text, + point_id = self._generate_point_id( + chunk_text, + { "source": chunk_meta.get("source", metadata.name), - "category": chunk_meta.get("category", "overview"), "file": chunk_meta.get("file", "SKILL.md"), - "type": chunk_meta.get("type", "documentation"), - "version": chunk_meta.get("version", metadata.version), + }, + ) + + points.append( + { + "id": point_id, + "vector": None, # User will generate embeddings + "payload": { + "content": chunk_text, + "source": chunk_meta.get("source", metadata.name), + "category": chunk_meta.get("category", "overview"), + "file": chunk_meta.get("file", "SKILL.md"), + "type": chunk_meta.get("type", "documentation"), + "version": chunk_meta.get("version", metadata.version), + }, } - }) + ) # Convert all reference files using base helper method for ref_file, ref_content in self._iterate_references(skill_dir): @@ -130,30 +131,35 @@ class QdrantAdaptor(SkillAdaptor): ref_content, payload_meta, enable_chunking=enable_chunking, - chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), - preserve_code_blocks=kwargs.get('preserve_code_blocks', True), - source_file=ref_file.name + chunk_max_tokens=kwargs.get("chunk_max_tokens", 512), + preserve_code_blocks=kwargs.get("preserve_code_blocks", True), + source_file=ref_file.name, ) # Add all chunks as points for chunk_text, chunk_meta in chunks: - point_id = self._generate_point_id(chunk_text, { - "source": chunk_meta.get("source", metadata.name), - "file": chunk_meta.get("file", ref_file.name) - }) - - points.append({ - "id": point_id, - "vector": None, # User will generate embeddings - "payload": { - "content": chunk_text, + point_id = self._generate_point_id( + chunk_text, + { "source": chunk_meta.get("source", metadata.name), - "category": chunk_meta.get("category", category), "file": chunk_meta.get("file", ref_file.name), - "type": chunk_meta.get("type", "reference"), - "version": chunk_meta.get("version", metadata.version), + }, + ) + + points.append( + { + "id": point_id, + "vector": None, # User will generate embeddings + "payload": { + "content": chunk_text, + "source": chunk_meta.get("source", metadata.name), + "category": chunk_meta.get("category", category), + "file": chunk_meta.get("file", ref_file.name), + "type": chunk_meta.get("type", "reference"), + "version": chunk_meta.get("version", metadata.version), + }, } - }) + ) # Qdrant configuration config = { @@ -184,7 +190,7 @@ class QdrantAdaptor(SkillAdaptor): output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, - preserve_code_blocks: bool = True + preserve_code_blocks: bool = True, ) -> Path: """ Package skill into JSON file for Qdrant. @@ -217,7 +223,7 @@ class QdrantAdaptor(SkillAdaptor): metadata, enable_chunking=enable_chunking, chunk_max_tokens=chunk_max_tokens, - preserve_code_blocks=preserve_code_blocks + preserve_code_blocks=preserve_code_blocks, ) # Write to file diff --git a/src/skill_seekers/cli/adaptors/streaming_adaptor.py b/src/skill_seekers/cli/adaptors/streaming_adaptor.py index 164a3ed..09319e2 100644 --- a/src/skill_seekers/cli/adaptors/streaming_adaptor.py +++ b/src/skill_seekers/cli/adaptors/streaming_adaptor.py @@ -36,7 +36,7 @@ class StreamingAdaptorMixin: chunk_size: int = 4000, chunk_overlap: int = 200, batch_size: int = 100, - progress_callback: callable | None = None + progress_callback: callable | None = None, ) -> Path: """ Package skill using streaming ingestion. @@ -60,9 +60,7 @@ class StreamingAdaptorMixin: # Initialize streaming ingester ingester = StreamingIngester( - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - batch_size=batch_size + chunk_size=chunk_size, chunk_overlap=chunk_overlap, batch_size=batch_size ) print(f"\n📊 Streaming ingestion starting...") @@ -77,9 +75,11 @@ class StreamingAdaptorMixin: nonlocal last_update # Update every 10 chunks if progress.processed_chunks - last_update >= 10: - print(f" {progress.progress_percent:.1f}% - " - f"{progress.processed_chunks}/{progress.total_chunks} chunks " - f"({progress.chunks_per_second:.1f} chunks/sec)") + print( + f" {progress.progress_percent:.1f}% - " + f"{progress.processed_chunks}/{progress.total_chunks} chunks " + f"({progress.chunks_per_second:.1f} chunks/sec)" + ) last_update = progress.processed_chunks if progress_callback: @@ -97,10 +97,7 @@ class StreamingAdaptorMixin: # Convert chunks to platform format print(f"\n📦 Converting to {self.PLATFORM_NAME} format...") - package_data = self._convert_chunks_to_platform_format( - all_chunks, - skill_dir.name - ) + package_data = self._convert_chunks_to_platform_format(all_chunks, skill_dir.name) # Determine output filename if output_path.is_dir() or str(output_path).endswith("/"): @@ -114,8 +111,7 @@ class StreamingAdaptorMixin: # Write output output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text( - json.dumps(package_data, indent=2, ensure_ascii=False), - encoding="utf-8" + json.dumps(package_data, indent=2, ensure_ascii=False), encoding="utf-8" ) print(f"✅ Package created: {output_path}") @@ -124,9 +120,7 @@ class StreamingAdaptorMixin: return output_path def _convert_chunks_to_platform_format( - self, - chunks: list[tuple[str, dict]], - skill_name: str + self, chunks: list[tuple[str, dict]], skill_name: str ) -> dict: """ Convert chunks to platform-specific format. @@ -156,14 +150,11 @@ class StreamingAdaptorMixin: "metadatas": metadatas, "ids": ids, "total_chunks": len(chunks), - "streaming": True + "streaming": True, } def estimate_chunks( - self, - skill_dir: Path, - chunk_size: int = 4000, - chunk_overlap: int = 200 + self, skill_dir: Path, chunk_size: int = 4000, chunk_overlap: int = 200 ) -> dict[str, Any]: """ Estimate chunking for a skill directory. @@ -179,10 +170,7 @@ class StreamingAdaptorMixin: Estimation statistics """ skill_dir = Path(skill_dir) - StreamingIngester( - chunk_size=chunk_size, - chunk_overlap=chunk_overlap - ) + StreamingIngester(chunk_size=chunk_size, chunk_overlap=chunk_overlap) # Count files and estimate chunks total_docs = 0 @@ -201,11 +189,9 @@ class StreamingAdaptorMixin: total_chars += char_count estimated_chunks += chunk_count - file_stats.append({ - "file": "SKILL.md", - "chars": char_count, - "estimated_chunks": chunk_count - }) + file_stats.append( + {"file": "SKILL.md", "chars": char_count, "estimated_chunks": chunk_count} + ) # Reference files refs_dir = skill_dir / "references" @@ -214,17 +200,21 @@ class StreamingAdaptorMixin: if ref_file.is_file() and not ref_file.name.startswith("."): content = ref_file.read_text(encoding="utf-8") char_count = len(content) - chunk_count = max(1, (char_count - chunk_overlap) // (chunk_size - chunk_overlap) + 1) + chunk_count = max( + 1, (char_count - chunk_overlap) // (chunk_size - chunk_overlap) + 1 + ) total_docs += 1 total_chars += char_count estimated_chunks += chunk_count - file_stats.append({ - "file": ref_file.name, - "chars": char_count, - "estimated_chunks": chunk_count - }) + file_stats.append( + { + "file": ref_file.name, + "chars": char_count, + "estimated_chunks": chunk_count, + } + ) return { "skill_name": skill_dir.name, @@ -235,7 +225,7 @@ class StreamingAdaptorMixin: "chunk_overlap": chunk_overlap, "file_stats": file_stats, "estimated_memory_mb": (total_chars * 2) / (1024 * 1024), # UTF-8 estimate - "recommended_streaming": total_chars > 1_000_000 or total_docs > 100 + "recommended_streaming": total_chars > 1_000_000 or total_docs > 100, } @@ -251,25 +241,27 @@ class StreamingLangChainAdaptor(StreamingAdaptorMixin): documents = [] for chunk_text, chunk_meta in chunks: - documents.append({ - "page_content": chunk_text, - "metadata": { - "source": chunk_meta["source"], - "category": chunk_meta["category"], - "file": chunk_meta["file"], - "chunk_id": chunk_meta["chunk_id"], - "chunk_index": chunk_meta["chunk_index"], - "total_chunks": chunk_meta["total_chunks"], - "type": chunk_meta.get("type", "documentation"), - "version": chunk_meta.get("version", "1.0.0"), + documents.append( + { + "page_content": chunk_text, + "metadata": { + "source": chunk_meta["source"], + "category": chunk_meta["category"], + "file": chunk_meta["file"], + "chunk_id": chunk_meta["chunk_id"], + "chunk_index": chunk_meta["chunk_index"], + "total_chunks": chunk_meta["total_chunks"], + "type": chunk_meta.get("type", "documentation"), + "version": chunk_meta.get("version", "1.0.0"), + }, } - }) + ) return { "documents": documents, "total_chunks": len(chunks), "streaming": True, - "format": "LangChain Document" + "format": "LangChain Document", } @@ -287,14 +279,16 @@ class StreamingChromaAdaptor(StreamingAdaptorMixin): for chunk_text, chunk_meta in chunks: documents.append(chunk_text) - metadatas.append({ - "source": chunk_meta["source"], - "category": chunk_meta["category"], - "file": chunk_meta["file"], - "chunk_index": chunk_meta["chunk_index"], - "total_chunks": chunk_meta["total_chunks"], - "type": chunk_meta.get("type", "documentation"), - }) + metadatas.append( + { + "source": chunk_meta["source"], + "category": chunk_meta["category"], + "file": chunk_meta["file"], + "chunk_index": chunk_meta["chunk_index"], + "total_chunks": chunk_meta["total_chunks"], + "type": chunk_meta.get("type", "documentation"), + } + ) ids.append(chunk_meta["chunk_id"]) return { @@ -303,7 +297,7 @@ class StreamingChromaAdaptor(StreamingAdaptorMixin): "ids": ids, "collection_name": skill_name.replace("_", "-"), "total_chunks": len(chunks), - "streaming": True + "streaming": True, } @@ -339,11 +333,7 @@ def demo_streaming(): print("=" * 60) output = adaptor.package_streaming( - skill_dir, - Path("output"), - chunk_size=2000, - chunk_overlap=100, - batch_size=50 + skill_dir, Path("output"), chunk_size=2000, chunk_overlap=100, batch_size=50 ) print(f"\n✅ Complete! Output: {output}") diff --git a/src/skill_seekers/cli/adaptors/weaviate.py b/src/skill_seekers/cli/adaptors/weaviate.py index 5159dcd..c06081c 100644 --- a/src/skill_seekers/cli/adaptors/weaviate.py +++ b/src/skill_seekers/cli/adaptors/weaviate.py @@ -104,11 +104,7 @@ class WeaviateAdaptor(SkillAdaptor): } def format_skill_md( - self, - skill_dir: Path, - metadata: SkillMetadata, - enable_chunking: bool = False, - **kwargs + self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs ) -> str: """ Format skill as JSON for Weaviate ingestion. @@ -148,24 +144,26 @@ class WeaviateAdaptor(SkillAdaptor): content, obj_metadata, enable_chunking=enable_chunking, - chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), - preserve_code_blocks=kwargs.get('preserve_code_blocks', True), - source_file="SKILL.md" + chunk_max_tokens=kwargs.get("chunk_max_tokens", 512), + preserve_code_blocks=kwargs.get("preserve_code_blocks", True), + source_file="SKILL.md", ) # Add all chunks as objects for chunk_text, chunk_meta in chunks: - objects.append({ - "id": self._generate_uuid(chunk_text, chunk_meta), - "properties": { - "content": chunk_text, - "source": chunk_meta.get("source", metadata.name), - "category": chunk_meta.get("category", "overview"), - "file": chunk_meta.get("file", "SKILL.md"), - "type": chunk_meta.get("type", "documentation"), - "version": chunk_meta.get("version", metadata.version), - }, - }) + objects.append( + { + "id": self._generate_uuid(chunk_text, chunk_meta), + "properties": { + "content": chunk_text, + "source": chunk_meta.get("source", metadata.name), + "category": chunk_meta.get("category", "overview"), + "file": chunk_meta.get("file", "SKILL.md"), + "type": chunk_meta.get("type", "documentation"), + "version": chunk_meta.get("version", metadata.version), + }, + } + ) # Convert all reference files using base helper method for ref_file, ref_content in self._iterate_references(skill_dir): @@ -186,24 +184,26 @@ class WeaviateAdaptor(SkillAdaptor): ref_content, obj_metadata, enable_chunking=enable_chunking, - chunk_max_tokens=kwargs.get('chunk_max_tokens', 512), - preserve_code_blocks=kwargs.get('preserve_code_blocks', True), - source_file=ref_file.name + chunk_max_tokens=kwargs.get("chunk_max_tokens", 512), + preserve_code_blocks=kwargs.get("preserve_code_blocks", True), + source_file=ref_file.name, ) # Add all chunks as objects for chunk_text, chunk_meta in chunks: - objects.append({ - "id": self._generate_uuid(chunk_text, chunk_meta), - "properties": { - "content": chunk_text, - "source": chunk_meta.get("source", metadata.name), - "category": chunk_meta.get("category", category), - "file": chunk_meta.get("file", ref_file.name), - "type": chunk_meta.get("type", "reference"), - "version": chunk_meta.get("version", metadata.version), - }, - }) + objects.append( + { + "id": self._generate_uuid(chunk_text, chunk_meta), + "properties": { + "content": chunk_text, + "source": chunk_meta.get("source", metadata.name), + "category": chunk_meta.get("category", category), + "file": chunk_meta.get("file", ref_file.name), + "type": chunk_meta.get("type", "reference"), + "version": chunk_meta.get("version", metadata.version), + }, + } + ) # Generate schema class_name = "".join(word.capitalize() for word in metadata.name.split("_")) @@ -222,7 +222,7 @@ class WeaviateAdaptor(SkillAdaptor): output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, - preserve_code_blocks: bool = True + preserve_code_blocks: bool = True, ) -> Path: """ Package skill into JSON file for Weaviate. @@ -258,7 +258,7 @@ class WeaviateAdaptor(SkillAdaptor): metadata, enable_chunking=enable_chunking, chunk_max_tokens=chunk_max_tokens, - preserve_code_blocks=preserve_code_blocks + preserve_code_blocks=preserve_code_blocks, ) # Write to file @@ -310,7 +310,7 @@ class WeaviateAdaptor(SkillAdaptor): except ImportError: return { "success": False, - "message": "weaviate-client not installed. Run: pip install weaviate-client" + "message": "weaviate-client not installed. Run: pip install weaviate-client", } # Load package @@ -319,16 +319,16 @@ class WeaviateAdaptor(SkillAdaptor): # Connect to Weaviate try: - if kwargs.get('use_cloud') and api_key: + if kwargs.get("use_cloud") and api_key: # Weaviate Cloud print(f"🌐 Connecting to Weaviate Cloud: {kwargs.get('cluster_url')}") client = weaviate.Client( - url=kwargs.get('cluster_url'), - auth_client_secret=weaviate.AuthApiKey(api_key=api_key) + url=kwargs.get("cluster_url"), + auth_client_secret=weaviate.AuthApiKey(api_key=api_key), ) else: # Local Weaviate instance - weaviate_url = kwargs.get('weaviate_url', 'http://localhost:8080') + weaviate_url = kwargs.get("weaviate_url", "http://localhost:8080") print(f"🌐 Connecting to Weaviate at: {weaviate_url}") client = weaviate.Client(url=weaviate_url) @@ -336,69 +336,67 @@ class WeaviateAdaptor(SkillAdaptor): if not client.is_ready(): return { "success": False, - "message": "Weaviate server not ready. Make sure Weaviate is running:\n docker run -p 8080:8080 semitechnologies/weaviate:latest" + "message": "Weaviate server not ready. Make sure Weaviate is running:\n docker run -p 8080:8080 semitechnologies/weaviate:latest", } except Exception as e: return { "success": False, - "message": f"Failed to connect to Weaviate: {e}\n\nMake sure Weaviate is running or provide correct credentials." + "message": f"Failed to connect to Weaviate: {e}\n\nMake sure Weaviate is running or provide correct credentials.", } # Create schema try: - client.schema.create_class(data['schema']) + client.schema.create_class(data["schema"]) print(f"✅ Created schema: {data['class_name']}") except Exception as e: if "already exists" in str(e).lower(): print(f"ℹ️ Schema already exists: {data['class_name']}") else: - return { - "success": False, - "message": f"Schema creation failed: {e}" - } + return {"success": False, "message": f"Schema creation failed: {e}"} # Handle embeddings - embedding_function = kwargs.get('embedding_function') + embedding_function = kwargs.get("embedding_function") try: with client.batch as batch: batch.batch_size = 100 - if embedding_function == 'openai': + if embedding_function == "openai": # Generate embeddings with OpenAI print("🔄 Generating OpenAI embeddings and uploading...") embeddings = self._generate_openai_embeddings( - [obj['properties']['content'] for obj in data['objects']], - api_key=kwargs.get('openai_api_key') + [obj["properties"]["content"] for obj in data["objects"]], + api_key=kwargs.get("openai_api_key"), ) - for i, obj in enumerate(data['objects']): + for i, obj in enumerate(data["objects"]): batch.add_data_object( - data_object=obj['properties'], - class_name=data['class_name'], - uuid=obj['id'], - vector=embeddings[i] + data_object=obj["properties"], + class_name=data["class_name"], + uuid=obj["id"], + vector=embeddings[i], ) if (i + 1) % 100 == 0: print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects") - elif embedding_function == 'sentence-transformers': + elif embedding_function == "sentence-transformers": # Use sentence-transformers print("🔄 Generating sentence-transformer embeddings and uploading...") try: from sentence_transformers import SentenceTransformer - model = SentenceTransformer('all-MiniLM-L6-v2') - contents = [obj['properties']['content'] for obj in data['objects']] + + model = SentenceTransformer("all-MiniLM-L6-v2") + contents = [obj["properties"]["content"] for obj in data["objects"]] embeddings = model.encode(contents, show_progress_bar=True).tolist() - for i, obj in enumerate(data['objects']): + for i, obj in enumerate(data["objects"]): batch.add_data_object( - data_object=obj['properties'], - class_name=data['class_name'], - uuid=obj['id'], - vector=embeddings[i] + data_object=obj["properties"], + class_name=data["class_name"], + uuid=obj["id"], + vector=embeddings[i], ) if (i + 1) % 100 == 0: @@ -407,42 +405,37 @@ class WeaviateAdaptor(SkillAdaptor): except ImportError: return { "success": False, - "message": "sentence-transformers not installed. Run: pip install sentence-transformers" + "message": "sentence-transformers not installed. Run: pip install sentence-transformers", } else: # No embeddings - Weaviate will use its configured vectorizer print("🔄 Uploading objects (Weaviate will generate embeddings)...") - for i, obj in enumerate(data['objects']): + for i, obj in enumerate(data["objects"]): batch.add_data_object( - data_object=obj['properties'], - class_name=data['class_name'], - uuid=obj['id'] + data_object=obj["properties"], + class_name=data["class_name"], + uuid=obj["id"], ) if (i + 1) % 100 == 0: print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects") - count = len(data['objects']) + count = len(data["objects"]) print(f"✅ Upload complete! {count} objects added to Weaviate") return { "success": True, "message": f"Uploaded {count} objects to Weaviate class '{data['class_name']}'", - "class_name": data['class_name'], - "count": count + "class_name": data["class_name"], + "count": count, } except Exception as e: - return { - "success": False, - "message": f"Upload failed: {e}" - } + return {"success": False, "message": f"Upload failed: {e}"} def _generate_openai_embeddings( - self, - documents: list[str], - api_key: str = None + self, documents: list[str], api_key: str = None ) -> list[list[float]]: """ Generate embeddings using OpenAI API. @@ -455,12 +448,13 @@ class WeaviateAdaptor(SkillAdaptor): List of embedding vectors """ import os + try: from openai import OpenAI except ImportError: raise ImportError("openai not installed. Run: pip install openai") from None - api_key = api_key or os.getenv('OPENAI_API_KEY') + api_key = api_key or os.getenv("OPENAI_API_KEY") if not api_key: raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key") @@ -473,14 +467,16 @@ class WeaviateAdaptor(SkillAdaptor): print(f" Generating embeddings for {len(documents)} documents...") for i in range(0, len(documents), batch_size): - batch = documents[i:i+batch_size] + batch = documents[i : i + batch_size] try: response = client.embeddings.create( input=batch, - model="text-embedding-3-small" # Cheapest, fastest + model="text-embedding-3-small", # Cheapest, fastest ) embeddings.extend([item.embedding for item in response.data]) - print(f" ✓ Generated {min(i+batch_size, len(documents))}/{len(documents)} embeddings") + print( + f" ✓ Generated {min(i + batch_size, len(documents))}/{len(documents)} embeddings" + ) except Exception as e: raise Exception(f"OpenAI embedding generation failed: {e}") from e diff --git a/src/skill_seekers/cli/architectural_pattern_detector.py b/src/skill_seekers/cli/architectural_pattern_detector.py index 66b0a25..e4d86af 100644 --- a/src/skill_seekers/cli/architectural_pattern_detector.py +++ b/src/skill_seekers/cli/architectural_pattern_detector.py @@ -101,10 +101,38 @@ class ArchitecturalPatternDetector: # Web Frameworks "Django": ["django", "manage.py", "settings.py", "urls.py"], "Flask": ["flask", "app.py", "wsgi.py"], - "Spring": ["springframework", "org.springframework", "@Controller", "@Service", "@Repository"], - "ASP.NET": ["Microsoft.AspNetCore", "System.Web", "Controllers", "Models", "Views", ".cshtml", "Startup.cs"], - "Rails": ["rails", "action", "app/models", "app/views", "app/controllers", "config/routes.rb"], - "Angular": ["@angular", "angular", "app.module.ts", "@Component", "@Injectable", "angular.json"], + "Spring": [ + "springframework", + "org.springframework", + "@Controller", + "@Service", + "@Repository", + ], + "ASP.NET": [ + "Microsoft.AspNetCore", + "System.Web", + "Controllers", + "Models", + "Views", + ".cshtml", + "Startup.cs", + ], + "Rails": [ + "rails", + "action", + "app/models", + "app/views", + "app/controllers", + "config/routes.rb", + ], + "Angular": [ + "@angular", + "angular", + "app.module.ts", + "@Component", + "@Injectable", + "angular.json", + ], "React": ["react", "package.json", "components"], "Vue.js": ["vue", ".vue", "components"], "Express": ["express", "app.js", "routes"], @@ -208,7 +236,9 @@ class ArchitecturalPatternDetector: # Create searchable import string import_content = " ".join(all_imports) - logger.debug(f"Collected {len(all_imports)} imports from {len([f for f in files if f.get('imports')])} files for framework detection") + logger.debug( + f"Collected {len(all_imports)} imports from {len([f for f in files if f.get('imports')])} files for framework detection" + ) # Also check actual directory structure for game engine markers # (project.godot, .unity, .uproject are config files, not in analyzed files) @@ -245,7 +275,9 @@ class ArchitecturalPatternDetector: # Check in file paths, directory structure, AND imports path_matches = sum(1 for marker in markers if marker.lower() in all_content.lower()) dir_matches = sum(1 for marker in markers if marker.lower() in dir_content.lower()) - import_matches = sum(1 for marker in markers if marker.lower() in import_content.lower()) + import_matches = sum( + 1 for marker in markers if marker.lower() in import_content.lower() + ) # Strategy: Prioritize import-based detection (more accurate) # If we have import matches, they're strong signals - use them alone @@ -257,7 +289,9 @@ class ArchitecturalPatternDetector: elif (path_matches + dir_matches) >= 2: # Path/directory-based detection (requires 2+ matches) detected.append(framework) - logger.info(f" 📦 Detected framework: {framework} (path:{path_matches} dir:{dir_matches})") + logger.info( + f" 📦 Detected framework: {framework} (path:{path_matches} dir:{dir_matches})" + ) return detected diff --git a/src/skill_seekers/cli/benchmark_cli.py b/src/skill_seekers/cli/benchmark_cli.py index 59927fd..c9fefaa 100644 --- a/src/skill_seekers/cli/benchmark_cli.py +++ b/src/skill_seekers/cli/benchmark_cli.py @@ -77,7 +77,9 @@ def run_embedding_benchmark(runner, config): with bench.timer("batch_embedding"), bench.memory("batch_embedding"): embeddings = generator.generate_batch(texts, model=model) - bench.metric("embeddings_per_sec", len(embeddings) / bench.result.timings[-1].duration, "emb/sec") + bench.metric( + "embeddings_per_sec", len(embeddings) / bench.result.timings[-1].duration, "emb/sec" + ) name = config.get("name", "embedding-benchmark") report = runner.run(name, benchmark_func) @@ -97,7 +99,7 @@ def run_storage_benchmark(runner, config): storage = get_storage_adaptor(provider, bucket=bucket) # Create test file - with NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f: + with NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f: f.write("Test data" * 1000) test_file = Path(f.name) @@ -128,10 +130,7 @@ def compare_command(args): """Compare two benchmarks.""" runner = BenchmarkRunner() - comparison = runner.compare( - baseline_path=Path(args.baseline), - current_path=Path(args.current) - ) + comparison = runner.compare(baseline_path=Path(args.baseline), current_path=Path(args.current)) print(f"\n📊 Comparison: {comparison.name}\n") print(f"Overall: {comparison.overall_improvement}\n") @@ -213,7 +212,7 @@ def cleanup_command(args): def main(): """Main entry point.""" parser = argparse.ArgumentParser( - description='Performance benchmarking suite', + description="Performance benchmarking suite", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: @@ -233,54 +232,46 @@ Examples: # Cleanup old benchmarks skill-seekers-benchmark cleanup --keep 5 - """ + """, ) - subparsers = parser.add_subparsers(dest='command', help='Command to execute') + subparsers = parser.add_subparsers(dest="command", help="Command to execute") # Run command - run_parser = subparsers.add_parser('run', help='Run benchmark') - run_parser.add_argument('--config', required=True, help='Benchmark config file') + run_parser = subparsers.add_parser("run", help="Run benchmark") + run_parser.add_argument("--config", required=True, help="Benchmark config file") run_parser.add_argument( - '--output-dir', '-o', - default='benchmarks', - help='Output directory (default: benchmarks)' + "--output-dir", "-o", default="benchmarks", help="Output directory (default: benchmarks)" ) # Compare command - compare_parser = subparsers.add_parser('compare', help='Compare two benchmarks') - compare_parser.add_argument('--baseline', required=True, help='Baseline benchmark') - compare_parser.add_argument('--current', required=True, help='Current benchmark') + compare_parser = subparsers.add_parser("compare", help="Compare two benchmarks") + compare_parser.add_argument("--baseline", required=True, help="Baseline benchmark") + compare_parser.add_argument("--current", required=True, help="Current benchmark") compare_parser.add_argument( - '--fail-on-regression', - action='store_true', - help='Exit with error if regressions detected' + "--fail-on-regression", action="store_true", help="Exit with error if regressions detected" ) # List command - list_parser = subparsers.add_parser('list', help='List saved benchmarks') + list_parser = subparsers.add_parser("list", help="List saved benchmarks") list_parser.add_argument( - '--output-dir', '-o', - default='benchmarks', - help='Benchmark directory (default: benchmarks)' + "--output-dir", "-o", default="benchmarks", help="Benchmark directory (default: benchmarks)" ) # Show command - show_parser = subparsers.add_parser('show', help='Show benchmark details') - show_parser.add_argument('path', help='Path to benchmark file') + show_parser = subparsers.add_parser("show", help="Show benchmark details") + show_parser.add_argument("path", help="Path to benchmark file") # Cleanup command - cleanup_parser = subparsers.add_parser('cleanup', help='Cleanup old benchmarks') + cleanup_parser = subparsers.add_parser("cleanup", help="Cleanup old benchmarks") cleanup_parser.add_argument( - '--output-dir', '-o', - default='benchmarks', - help='Benchmark directory (default: benchmarks)' + "--output-dir", "-o", default="benchmarks", help="Benchmark directory (default: benchmarks)" ) cleanup_parser.add_argument( - '--keep', + "--keep", type=int, default=5, - help='Number of latest benchmarks to keep per name (default: 5)' + help="Number of latest benchmarks to keep per name (default: 5)", ) args = parser.parse_args() @@ -290,20 +281,20 @@ Examples: sys.exit(1) try: - if args.command == 'run': + if args.command == "run": run_command(args) - elif args.command == 'compare': + elif args.command == "compare": compare_command(args) - elif args.command == 'list': + elif args.command == "list": list_command(args) - elif args.command == 'show': + elif args.command == "show": show_command(args) - elif args.command == 'cleanup': + elif args.command == "cleanup": cleanup_command(args) except Exception as e: print(f"\n❌ Error: {e}", file=sys.stderr) sys.exit(1) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/skill_seekers/cli/cloud_storage_cli.py b/src/skill_seekers/cli/cloud_storage_cli.py index 8f25cb3..f8fa950 100644 --- a/src/skill_seekers/cli/cloud_storage_cli.py +++ b/src/skill_seekers/cli/cloud_storage_cli.py @@ -15,18 +15,13 @@ from .storage import get_storage_adaptor def upload_command(args): """Handle upload subcommand.""" adaptor = get_storage_adaptor( - args.provider, - bucket=args.bucket, - container=args.container, - **parse_extra_args(args.extra) + args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra) ) if Path(args.local_path).is_dir(): print(f"📁 Uploading directory: {args.local_path}") uploaded_files = adaptor.upload_directory( - args.local_path, - args.remote_path, - exclude_patterns=args.exclude + args.local_path, args.remote_path, exclude_patterns=args.exclude ) print(f"✅ Uploaded {len(uploaded_files)} files") if args.verbose: @@ -41,19 +36,13 @@ def upload_command(args): def download_command(args): """Handle download subcommand.""" adaptor = get_storage_adaptor( - args.provider, - bucket=args.bucket, - container=args.container, - **parse_extra_args(args.extra) + args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra) ) # Check if remote path is a directory (ends with /) - if args.remote_path.endswith('/'): + if args.remote_path.endswith("/"): print(f"📁 Downloading directory: {args.remote_path}") - downloaded_files = adaptor.download_directory( - args.remote_path, - args.local_path - ) + downloaded_files = adaptor.download_directory(args.remote_path, args.local_path) print(f"✅ Downloaded {len(downloaded_files)} files") if args.verbose: for file_path in downloaded_files: @@ -67,10 +56,7 @@ def download_command(args): def list_command(args): """Handle list subcommand.""" adaptor = get_storage_adaptor( - args.provider, - bucket=args.bucket, - container=args.container, - **parse_extra_args(args.extra) + args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra) ) print(f"📋 Listing files: {args.prefix or '(root)'}") @@ -99,15 +85,12 @@ def list_command(args): def delete_command(args): """Handle delete subcommand.""" adaptor = get_storage_adaptor( - args.provider, - bucket=args.bucket, - container=args.container, - **parse_extra_args(args.extra) + args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra) ) if not args.force: response = input(f"⚠️ Delete {args.remote_path}? [y/N]: ") - if response.lower() != 'y': + if response.lower() != "y": print("❌ Deletion cancelled") return @@ -119,10 +102,7 @@ def delete_command(args): def url_command(args): """Handle url subcommand.""" adaptor = get_storage_adaptor( - args.provider, - bucket=args.bucket, - container=args.container, - **parse_extra_args(args.extra) + args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra) ) print(f"🔗 Generating signed URL: {args.remote_path}") @@ -134,10 +114,7 @@ def url_command(args): def copy_command(args): """Handle copy subcommand.""" adaptor = get_storage_adaptor( - args.provider, - bucket=args.bucket, - container=args.container, - **parse_extra_args(args.extra) + args.provider, bucket=args.bucket, container=args.container, **parse_extra_args(args.extra) ) print(f"📋 Copying: {args.source_path} → {args.dest_path}") @@ -147,7 +124,7 @@ def copy_command(args): def format_size(size_bytes: int) -> str: """Format file size in human-readable format.""" - for unit in ['B', 'KB', 'MB', 'GB', 'TB']: + for unit in ["B", "KB", "MB", "GB", "TB"]: if size_bytes < 1024.0: return f"{size_bytes:.1f}{unit}" size_bytes /= 1024.0 @@ -161,11 +138,11 @@ def parse_extra_args(extra: list | None) -> dict: result = {} for arg in extra: - if '=' in arg: - key, value = arg.split('=', 1) - result[key.lstrip('-')] = value + if "=" in arg: + key, value = arg.split("=", 1) + result[key.lstrip("-")] = value else: - result[arg.lstrip('-')] = True + result[arg.lstrip("-")] = True return result @@ -173,7 +150,7 @@ def parse_extra_args(extra: list | None) -> dict: def main(): """Main entry point.""" parser = argparse.ArgumentParser( - description='Cloud storage operations for Skill Seekers', + description="Cloud storage operations for Skill Seekers", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: @@ -197,114 +174,66 @@ Provider-specific options: S3: --region=us-west-2 --endpoint-url=https://... GCS: --project=my-project --credentials-path=/path/to/creds.json Azure: --account-name=myaccount --account-key=... - """ + """, ) # Global arguments parser.add_argument( - '--provider', - choices=['s3', 'gcs', 'azure'], - required=True, - help='Cloud storage provider' - ) - parser.add_argument( - '--bucket', - help='S3/GCS bucket name (for S3/GCS)' - ) - parser.add_argument( - '--container', - help='Azure container name (for Azure)' - ) - parser.add_argument( - '--verbose', '-v', - action='store_true', - help='Verbose output' + "--provider", choices=["s3", "gcs", "azure"], required=True, help="Cloud storage provider" ) + parser.add_argument("--bucket", help="S3/GCS bucket name (for S3/GCS)") + parser.add_argument("--container", help="Azure container name (for Azure)") + parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output") - subparsers = parser.add_subparsers(dest='command', help='Command to execute') + subparsers = parser.add_subparsers(dest="command", help="Command to execute") # Upload command - upload_parser = subparsers.add_parser('upload', help='Upload file or directory') - upload_parser.add_argument('local_path', help='Local file or directory path') - upload_parser.add_argument('remote_path', help='Remote path in cloud storage') + upload_parser = subparsers.add_parser("upload", help="Upload file or directory") + upload_parser.add_argument("local_path", help="Local file or directory path") + upload_parser.add_argument("remote_path", help="Remote path in cloud storage") upload_parser.add_argument( - '--exclude', - action='append', - help='Glob patterns to exclude (for directories)' - ) - upload_parser.add_argument( - 'extra', - nargs='*', - help='Provider-specific options (--key=value)' + "--exclude", action="append", help="Glob patterns to exclude (for directories)" ) + upload_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)") # Download command - download_parser = subparsers.add_parser('download', help='Download file or directory') - download_parser.add_argument('remote_path', help='Remote path in cloud storage') - download_parser.add_argument('local_path', help='Local destination path') - download_parser.add_argument( - 'extra', - nargs='*', - help='Provider-specific options (--key=value)' - ) + download_parser = subparsers.add_parser("download", help="Download file or directory") + download_parser.add_argument("remote_path", help="Remote path in cloud storage") + download_parser.add_argument("local_path", help="Local destination path") + download_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)") # List command - list_parser = subparsers.add_parser('list', help='List files in cloud storage') + list_parser = subparsers.add_parser("list", help="List files in cloud storage") + list_parser.add_argument("--prefix", default="", help="Prefix to filter files") list_parser.add_argument( - '--prefix', - default='', - help='Prefix to filter files' - ) - list_parser.add_argument( - '--max-results', - type=int, - default=1000, - help='Maximum number of results' - ) - list_parser.add_argument( - 'extra', - nargs='*', - help='Provider-specific options (--key=value)' + "--max-results", type=int, default=1000, help="Maximum number of results" ) + list_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)") # Delete command - delete_parser = subparsers.add_parser('delete', help='Delete file from cloud storage') - delete_parser.add_argument('remote_path', help='Remote path in cloud storage') + delete_parser = subparsers.add_parser("delete", help="Delete file from cloud storage") + delete_parser.add_argument("remote_path", help="Remote path in cloud storage") delete_parser.add_argument( - '--force', '-f', - action='store_true', - help='Skip confirmation prompt' - ) - delete_parser.add_argument( - 'extra', - nargs='*', - help='Provider-specific options (--key=value)' + "--force", "-f", action="store_true", help="Skip confirmation prompt" ) + delete_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)") # URL command - url_parser = subparsers.add_parser('url', help='Generate signed URL') - url_parser.add_argument('remote_path', help='Remote path in cloud storage') + url_parser = subparsers.add_parser("url", help="Generate signed URL") + url_parser.add_argument("remote_path", help="Remote path in cloud storage") url_parser.add_argument( - '--expires-in', + "--expires-in", type=int, default=3600, - help='URL expiration time in seconds (default: 3600)' - ) - url_parser.add_argument( - 'extra', - nargs='*', - help='Provider-specific options (--key=value)' + help="URL expiration time in seconds (default: 3600)", ) + url_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)") # Copy command - copy_parser = subparsers.add_parser('copy', help='Copy file within cloud storage') - copy_parser.add_argument('source_path', help='Source path') - copy_parser.add_argument('dest_path', help='Destination path') - copy_parser.add_argument( - 'extra', - nargs='*', - help='Provider-specific options (--key=value)' - ) + copy_parser = subparsers.add_parser("copy", help="Copy file within cloud storage") + copy_parser.add_argument("source_path", help="Source path") + copy_parser.add_argument("dest_path", help="Destination path") + copy_parser.add_argument("extra", nargs="*", help="Provider-specific options (--key=value)") args = parser.parse_args() @@ -313,26 +242,26 @@ Provider-specific options: sys.exit(1) # Validate bucket/container based on provider - if args.provider in ['s3', 'gcs'] and not args.bucket: + if args.provider in ["s3", "gcs"] and not args.bucket: print(f"❌ Error: --bucket is required for {args.provider.upper()}", file=sys.stderr) sys.exit(1) - elif args.provider == 'azure' and not args.container: + elif args.provider == "azure" and not args.container: print("❌ Error: --container is required for Azure", file=sys.stderr) sys.exit(1) try: # Execute command - if args.command == 'upload': + if args.command == "upload": upload_command(args) - elif args.command == 'download': + elif args.command == "download": download_command(args) - elif args.command == 'list': + elif args.command == "list": list_command(args) - elif args.command == 'delete': + elif args.command == "delete": delete_command(args) - elif args.command == 'url': + elif args.command == "url": url_command(args) - elif args.command == 'copy': + elif args.command == "copy": copy_command(args) except FileNotFoundError as e: @@ -342,9 +271,10 @@ Provider-specific options: print(f"❌ Error: {e}", file=sys.stderr) if args.verbose: import traceback + traceback.print_exc() sys.exit(1) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/skill_seekers/cli/code_analyzer.py b/src/skill_seekers/cli/code_analyzer.py index 273161d..95b288e 100644 --- a/src/skill_seekers/cli/code_analyzer.py +++ b/src/skill_seekers/cli/code_analyzer.py @@ -376,8 +376,8 @@ class CodeAnalyzer: for match in re.finditer(pattern, content): module = match.group(1) # Extract package name (before first /) - package = module.split('/')[0] - if package and not package.startswith('.'): # Skip relative imports + package = module.split("/")[0] + if package and not package.startswith("."): # Skip relative imports imports.append(package) return { @@ -694,11 +694,11 @@ class CodeAnalyzer: for match in re.finditer(using_pattern, content): namespace = match.group(1).strip() # Skip using aliases (using Foo = Bar.Baz) - if '=' not in namespace: + if "=" not in namespace: # Extract base namespace (first 1-2 segments) - parts = namespace.split('.') + parts = namespace.split(".") if len(parts) >= 2: - base_ns = '.'.join(parts[:2]) + base_ns = ".".join(parts[:2]) imports.append(base_ns) elif len(parts) == 1: imports.append(parts[0]) @@ -1130,10 +1130,10 @@ class CodeAnalyzer: for match in re.finditer(import_pattern, content): import_path = match.group(1).strip() # Extract package name (first 2-3 segments for framework detection) - parts = import_path.split('.') + parts = import_path.split(".") if len(parts) >= 2: # Get base package (e.g., "org.springframework" from "org.springframework.boot.SpringApplication") - package = '.'.join(parts[:2]) + package = ".".join(parts[:2]) imports.append(package) return { @@ -1303,7 +1303,7 @@ class CodeAnalyzer: for match in re.finditer(require_pattern, content): module = match.group(1) # Extract gem name (before first /) - gem = module.split('/')[0] + gem = module.split("/")[0] imports.append(gem) return { @@ -1443,7 +1443,7 @@ class CodeAnalyzer: for match in re.finditer(use_pattern, content): namespace = match.group(1).strip() # Extract vendor name (first segment) - parts = namespace.split('\\') + parts = namespace.split("\\") if parts: vendor = parts[0] imports.append(vendor.lower()) diff --git a/src/skill_seekers/cli/codebase_scraper.py b/src/skill_seekers/cli/codebase_scraper.py index 08a40e2..7696d59 100644 --- a/src/skill_seekers/cli/codebase_scraper.py +++ b/src/skill_seekers/cli/codebase_scraper.py @@ -1036,11 +1036,15 @@ def analyze_codebase( # Save summary statistics summary_json = pattern_output / "summary.json" with open(summary_json, "w", encoding="utf-8") as f: - json.dump({ - "statistics": stats, - "thresholds": multi_level["thresholds"], - "files_analyzed": len(pattern_results), - }, f, indent=2) + json.dump( + { + "statistics": stats, + "thresholds": multi_level["thresholds"], + "files_analyzed": len(pattern_results), + }, + f, + indent=2, + ) # Log results with breakdown by confidence logger.info(f"✅ Detected {stats['total']} patterns in {len(pattern_results)} files") @@ -1931,21 +1935,15 @@ def _check_deprecated_flags(args): "⚠️ DEPRECATED: --ai-mode local → use --enhance-level without API key instead" ) elif args.ai_mode == "none": - warnings.append( - "⚠️ DEPRECATED: --ai-mode none → use --enhance-level 0 instead" - ) + warnings.append("⚠️ DEPRECATED: --ai-mode none → use --enhance-level 0 instead") # Deprecated: --quick flag if hasattr(args, "quick") and args.quick: - warnings.append( - "⚠️ DEPRECATED: --quick → use --preset quick instead" - ) + warnings.append("⚠️ DEPRECATED: --quick → use --preset quick instead") # Deprecated: --comprehensive flag if hasattr(args, "comprehensive") and args.comprehensive: - warnings.append( - "⚠️ DEPRECATED: --comprehensive → use --preset comprehensive instead" - ) + warnings.append("⚠️ DEPRECATED: --comprehensive → use --preset comprehensive instead") # Show warnings if any found if warnings: @@ -2000,24 +1998,22 @@ Examples: parser.add_argument( "--preset", choices=["quick", "standard", "comprehensive"], - help="Analysis preset: quick (1-2 min), standard (5-10 min, DEFAULT), comprehensive (20-60 min)" + help="Analysis preset: quick (1-2 min), standard (5-10 min, DEFAULT), comprehensive (20-60 min)", ) parser.add_argument( - "--preset-list", - action="store_true", - help="Show available presets and exit" + "--preset-list", action="store_true", help="Show available presets and exit" ) # Legacy preset flags (kept for backward compatibility) parser.add_argument( "--quick", action="store_true", - help="[DEPRECATED] Quick analysis - use '--preset quick' instead" + help="[DEPRECATED] Quick analysis - use '--preset quick' instead", ) parser.add_argument( "--comprehensive", action="store_true", - help="[DEPRECATED] Comprehensive analysis - use '--preset comprehensive' instead" + help="[DEPRECATED] Comprehensive analysis - use '--preset comprehensive' instead", ) parser.add_argument( @@ -2129,6 +2125,7 @@ Examples: # Handle --preset-list flag BEFORE parse_args() to avoid required --directory validation if "--preset-list" in sys.argv: from skill_seekers.cli.presets import PresetManager + print(PresetManager.format_preset_help()) return 0 @@ -2155,6 +2152,7 @@ Examples: # Apply preset using PresetManager if preset_name: from skill_seekers.cli.presets import PresetManager + try: preset_args = PresetManager.apply_preset(preset_name, vars(args)) # Update args with preset values @@ -2162,9 +2160,7 @@ Examples: setattr(args, key, value) preset = PresetManager.get_preset(preset_name) - logger.info( - f"{preset.icon} {preset.name} analysis mode: {preset.description}" - ) + logger.info(f"{preset.icon} {preset.name} analysis mode: {preset.description}") except ValueError as e: logger.error(f"❌ {e}") return 1 diff --git a/src/skill_seekers/cli/embedding_pipeline.py b/src/skill_seekers/cli/embedding_pipeline.py index ef7a200..6650a86 100644 --- a/src/skill_seekers/cli/embedding_pipeline.py +++ b/src/skill_seekers/cli/embedding_pipeline.py @@ -19,6 +19,7 @@ import numpy as np @dataclass class EmbeddingConfig: """Configuration for embedding generation.""" + provider: str # 'openai', 'cohere', 'huggingface', 'local' model: str dimension: int @@ -31,6 +32,7 @@ class EmbeddingConfig: @dataclass class EmbeddingResult: """Result of embedding generation.""" + embeddings: list[list[float]] metadata: dict[str, Any] = field(default_factory=dict) cached_count: int = 0 @@ -42,6 +44,7 @@ class EmbeddingResult: @dataclass class CostTracker: """Track embedding generation costs.""" + total_tokens: int = 0 total_requests: int = 0 cache_hits: int = 0 @@ -64,12 +67,12 @@ class CostTracker: cache_rate = (self.cache_hits / self.total_requests * 100) if self.total_requests > 0 else 0 return { - 'total_requests': self.total_requests, - 'total_tokens': self.total_tokens, - 'cache_hits': self.cache_hits, - 'cache_misses': self.cache_misses, - 'cache_rate': f"{cache_rate:.1f}%", - 'estimated_cost': f"${self.estimated_cost:.4f}" + "total_requests": self.total_requests, + "total_tokens": self.total_tokens, + "cache_hits": self.cache_hits, + "cache_misses": self.cache_misses, + "cache_rate": f"{cache_rate:.1f}%", + "estimated_cost": f"${self.estimated_cost:.4f}", } @@ -97,18 +100,18 @@ class OpenAIEmbeddingProvider(EmbeddingProvider): # Pricing per 1M tokens (as of 2026) PRICING = { - 'text-embedding-ada-002': 0.10, - 'text-embedding-3-small': 0.02, - 'text-embedding-3-large': 0.13, + "text-embedding-ada-002": 0.10, + "text-embedding-3-small": 0.02, + "text-embedding-3-large": 0.13, } DIMENSIONS = { - 'text-embedding-ada-002': 1536, - 'text-embedding-3-small': 1536, - 'text-embedding-3-large': 3072, + "text-embedding-ada-002": 1536, + "text-embedding-3-small": 1536, + "text-embedding-3-large": 3072, } - def __init__(self, model: str = 'text-embedding-ada-002', api_key: str | None = None): + def __init__(self, model: str = "text-embedding-ada-002", api_key: str | None = None): """Initialize OpenAI provider.""" self.model = model self.api_key = api_key @@ -119,9 +122,12 @@ class OpenAIEmbeddingProvider(EmbeddingProvider): if self._client is None: try: from openai import OpenAI + self._client = OpenAI(api_key=self.api_key) except ImportError: - raise ImportError("OpenAI package not installed. Install with: pip install openai") from None + raise ImportError( + "OpenAI package not installed. Install with: pip install openai" + ) from None return self._client def generate_embeddings(self, texts: list[str]) -> list[list[float]]: @@ -130,10 +136,7 @@ class OpenAIEmbeddingProvider(EmbeddingProvider): embeddings = [] for text in texts: - response = client.embeddings.create( - model=self.model, - input=text - ) + response = client.embeddings.create(model=self.model, input=text) embeddings.append(response.data[0].embedding) return embeddings @@ -207,7 +210,7 @@ class EmbeddingCache: if cache_file.exists(): try: data = json.loads(cache_file.read_text()) - embedding = data['embedding'] + embedding = data["embedding"] self._memory_cache[cache_key] = embedding return embedding except Exception: @@ -226,12 +229,16 @@ class EmbeddingCache: if self.cache_dir: cache_file = self.cache_dir / f"{cache_key}.json" try: - cache_file.write_text(json.dumps({ - 'text_hash': cache_key, - 'model': model, - 'embedding': embedding, - 'timestamp': time.time() - })) + cache_file.write_text( + json.dumps( + { + "text_hash": cache_key, + "model": model, + "embedding": embedding, + "timestamp": time.time(), + } + ) + ) except Exception as e: print(f"⚠️ Warning: Failed to write cache: {e}") @@ -252,9 +259,9 @@ class EmbeddingPipeline: def _create_provider(self) -> EmbeddingProvider: """Create provider based on config.""" - if self.config.provider == 'openai': + if self.config.provider == "openai": return OpenAIEmbeddingProvider(self.config.model) - elif self.config.provider == 'local': + elif self.config.provider == "local": return LocalEmbeddingProvider(self.config.dimension) else: raise ValueError(f"Unknown provider: {self.config.provider}") @@ -264,11 +271,7 @@ class EmbeddingPipeline: # Rough estimate: 1 token ≈ 4 characters return len(text) // 4 - def generate_batch( - self, - texts: list[str], - show_progress: bool = True - ) -> EmbeddingResult: + def generate_batch(self, texts: list[str], show_progress: bool = True) -> EmbeddingResult: """ Generate embeddings for batch of texts. @@ -293,7 +296,7 @@ class EmbeddingPipeline: # Process in batches for i in range(0, len(texts), self.config.batch_size): - batch = texts[i:i + self.config.batch_size] + batch = texts[i : i + self.config.batch_size] batch_embeddings = [] to_generate = [] to_generate_indices = [] @@ -331,7 +334,7 @@ class EmbeddingPipeline: if show_progress and len(texts) > self.config.batch_size: progress = min(i + self.config.batch_size, len(texts)) - print(f" Progress: {progress}/{len(texts)} ({progress/len(texts)*100:.1f}%)") + print(f" Progress: {progress}/{len(texts)} ({progress / len(texts) * 100:.1f}%)") total_time = time.time() - start_time @@ -342,21 +345,21 @@ class EmbeddingPipeline: print(f" Generated: {generated_count}") print(f" Time: {total_time:.2f}s") - if self.config.provider != 'local': + if self.config.provider != "local": stats = self.cost_tracker.get_stats() print(f" Cost: {stats['estimated_cost']}") return EmbeddingResult( embeddings=embeddings, metadata={ - 'provider': self.config.provider, - 'model': self.config.model, - 'dimension': self.provider.get_dimension() + "provider": self.config.provider, + "model": self.config.model, + "dimension": self.provider.get_dimension(), }, cached_count=cached_count, generated_count=generated_count, total_time=total_time, - cost_estimate=self.cost_tracker.estimated_cost + cost_estimate=self.cost_tracker.estimated_cost, ) def validate_dimensions(self, embeddings: list[list[float]]) -> bool: @@ -373,8 +376,10 @@ class EmbeddingPipeline: for i, embedding in enumerate(embeddings): if len(embedding) != expected_dim: - print(f"❌ Dimension mismatch at index {i}: " - f"expected {expected_dim}, got {len(embedding)}") + print( + f"❌ Dimension mismatch at index {i}: " + f"expected {expected_dim}, got {len(embedding)}" + ) return False return True @@ -390,11 +395,11 @@ def example_usage(): # Configure pipeline config = EmbeddingConfig( - provider='local', # Use 'openai' for production - model='text-embedding-ada-002', + provider="local", # Use 'openai' for production + model="text-embedding-ada-002", dimension=384, batch_size=50, - cache_dir=Path("output/.embeddings_cache") + cache_dir=Path("output/.embeddings_cache"), ) # Initialize pipeline diff --git a/src/skill_seekers/cli/enhance_skill_local.py b/src/skill_seekers/cli/enhance_skill_local.py index 6950b9d..bc2c680 100644 --- a/src/skill_seekers/cli/enhance_skill_local.py +++ b/src/skill_seekers/cli/enhance_skill_local.py @@ -175,8 +175,7 @@ class LocalSkillEnhancer: dangerous_chars = [";", "&", "|", "$", "`", "\n", "\r"] if any(char in cmd_template for char in dangerous_chars): raise ValueError( - "Custom command contains dangerous shell characters. " - f"Command: {cmd_template}" + f"Custom command contains dangerous shell characters. Command: {cmd_template}" ) try: @@ -888,9 +887,7 @@ rm {prompt_file} print("❌ SKILL.md not found after enhancement") return False else: - print( - f"❌ {self.agent_display} returned error (exit code: {result.returncode})" - ) + print(f"❌ {self.agent_display} returned error (exit code: {result.returncode})") if result.stderr: print(f" Error: {result.stderr[:200]}") return False diff --git a/src/skill_seekers/cli/incremental_updater.py b/src/skill_seekers/cli/incremental_updater.py index a6b3ff3..a2e2f36 100644 --- a/src/skill_seekers/cli/incremental_updater.py +++ b/src/skill_seekers/cli/incremental_updater.py @@ -16,6 +16,7 @@ from datetime import datetime @dataclass class DocumentVersion: """Version information for a document.""" + file_path: str content_hash: str size_bytes: int @@ -26,6 +27,7 @@ class DocumentVersion: @dataclass class ChangeSet: """Set of changes detected.""" + added: list[DocumentVersion] modified: list[DocumentVersion] deleted: list[str] @@ -45,6 +47,7 @@ class ChangeSet: @dataclass class UpdateMetadata: """Metadata for an incremental update.""" + timestamp: str previous_version: str new_version: str @@ -86,7 +89,7 @@ class IncrementalUpdater: sha256 = hashlib.sha256() try: - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: while chunk := f.read(8192): sha256.update(chunk) return sha256.hexdigest() @@ -111,7 +114,7 @@ class IncrementalUpdater: content_hash=self._compute_file_hash(skill_md), size_bytes=skill_md.stat().st_size, last_modified=skill_md.stat().st_mtime, - version=1 + version=1, ) # Scan references @@ -125,7 +128,7 @@ class IncrementalUpdater: content_hash=self._compute_file_hash(ref_file), size_bytes=ref_file.stat().st_size, last_modified=ref_file.stat().st_mtime, - version=1 + version=1, ) return versions @@ -157,9 +160,8 @@ class IncrementalUpdater: "timestamp": datetime.now().isoformat(), "version": "1.0.0", "documents": { - file_path: asdict(version) - for file_path, version in self.current_versions.items() - } + file_path: asdict(version) for file_path, version in self.current_versions.items() + }, } self.version_file.write_text(json.dumps(data, indent=2)) @@ -180,10 +182,7 @@ class IncrementalUpdater: if not has_previous: # First time - all files are "added" return ChangeSet( - added=list(self.current_versions.values()), - modified=[], - deleted=[], - unchanged=[] + added=list(self.current_versions.values()), modified=[], deleted=[], unchanged=[] ) # Detect changes @@ -215,18 +214,10 @@ class IncrementalUpdater: else: unchanged.append(current) - return ChangeSet( - added=added, - modified=modified, - deleted=deleted, - unchanged=unchanged - ) + return ChangeSet(added=added, modified=modified, deleted=deleted, unchanged=unchanged) def generate_update_package( - self, - change_set: ChangeSet, - output_path: Path, - include_content: bool = True + self, change_set: ChangeSet, output_path: Path, include_content: bool = True ) -> Path: """ Generate incremental update package. @@ -250,11 +241,11 @@ class IncrementalUpdater: "added": len(change_set.added), "modified": len(change_set.modified), "deleted": len(change_set.deleted), - "unchanged": len(change_set.unchanged) + "unchanged": len(change_set.unchanged), }, - "total_changes": change_set.total_changes + "total_changes": change_set.total_changes, }, - "changes": {} + "changes": {}, } # Include changed documents @@ -267,7 +258,7 @@ class IncrementalUpdater: "version": doc.version, "content": file_path.read_text(encoding="utf-8"), "hash": doc.content_hash, - "size": doc.size_bytes + "size": doc.size_bytes, } # Modified documents @@ -278,14 +269,12 @@ class IncrementalUpdater: "version": doc.version, "content": file_path.read_text(encoding="utf-8"), "hash": doc.content_hash, - "size": doc.size_bytes + "size": doc.size_bytes, } # Deleted documents for file_path in change_set.deleted: - update_data["changes"][file_path] = { - "action": "delete" - } + update_data["changes"][file_path] = {"action": "delete"} # Write package output_path.parent.mkdir(parents=True, exist_ok=True) @@ -332,7 +321,9 @@ class IncrementalUpdater: if prev: size_diff = doc.size_bytes - prev.size_bytes size_str = f"{size_diff:+,} bytes" if size_diff != 0 else "same size" - lines.append(f" ~ {doc.file_path} (v{prev.version} → v{doc.version}, {size_str})") + lines.append( + f" ~ {doc.file_path} (v{prev.version} → v{doc.version}, {size_str})" + ) else: lines.append(f" ~ {doc.file_path} (v{doc.version})") lines.append("") @@ -473,4 +464,5 @@ def main(): if __name__ == "__main__": import sys + sys.exit(main()) diff --git a/src/skill_seekers/cli/language_detector.py b/src/skill_seekers/cli/language_detector.py index 4582ca7..e6fa971 100644 --- a/src/skill_seekers/cli/language_detector.py +++ b/src/skill_seekers/cli/language_detector.py @@ -369,8 +369,6 @@ LANGUAGE_PATTERNS: dict[str, list[tuple[str, int]]] = { (r"\$[0-9]+", 4), (r"->", 3), ], - - # ===== Markup/Config Languages ===== "html": [ (r"", 5), diff --git a/src/skill_seekers/cli/main.py b/src/skill_seekers/cli/main.py index 1bcdecb..4b26948 100644 --- a/src/skill_seekers/cli/main.py +++ b/src/skill_seekers/cli/main.py @@ -42,25 +42,25 @@ from skill_seekers.cli import __version__ # Command module mapping (command name -> module path) COMMAND_MODULES = { - 'config': 'skill_seekers.cli.config_command', - 'scrape': 'skill_seekers.cli.doc_scraper', - 'github': 'skill_seekers.cli.github_scraper', - 'pdf': 'skill_seekers.cli.pdf_scraper', - 'unified': 'skill_seekers.cli.unified_scraper', - 'enhance': 'skill_seekers.cli.enhance_skill_local', - 'enhance-status': 'skill_seekers.cli.enhance_status', - 'package': 'skill_seekers.cli.package_skill', - 'upload': 'skill_seekers.cli.upload_skill', - 'estimate': 'skill_seekers.cli.estimate_pages', - 'extract-test-examples': 'skill_seekers.cli.test_example_extractor', - 'install-agent': 'skill_seekers.cli.install_agent', - 'analyze': 'skill_seekers.cli.codebase_scraper', - 'install': 'skill_seekers.cli.install_skill', - 'resume': 'skill_seekers.cli.resume_command', - 'stream': 'skill_seekers.cli.streaming_ingest', - 'update': 'skill_seekers.cli.incremental_updater', - 'multilang': 'skill_seekers.cli.multilang_support', - 'quality': 'skill_seekers.cli.quality_metrics', + "config": "skill_seekers.cli.config_command", + "scrape": "skill_seekers.cli.doc_scraper", + "github": "skill_seekers.cli.github_scraper", + "pdf": "skill_seekers.cli.pdf_scraper", + "unified": "skill_seekers.cli.unified_scraper", + "enhance": "skill_seekers.cli.enhance_skill_local", + "enhance-status": "skill_seekers.cli.enhance_status", + "package": "skill_seekers.cli.package_skill", + "upload": "skill_seekers.cli.upload_skill", + "estimate": "skill_seekers.cli.estimate_pages", + "extract-test-examples": "skill_seekers.cli.test_example_extractor", + "install-agent": "skill_seekers.cli.install_agent", + "analyze": "skill_seekers.cli.codebase_scraper", + "install": "skill_seekers.cli.install_skill", + "resume": "skill_seekers.cli.resume_command", + "stream": "skill_seekers.cli.streaming_ingest", + "update": "skill_seekers.cli.incremental_updater", + "multilang": "skill_seekers.cli.multilang_support", + "quality": "skill_seekers.cli.quality_metrics", } @@ -124,12 +124,21 @@ def _reconstruct_argv(command: str, args: argparse.Namespace) -> list[str]: # Convert args to sys.argv format for key, value in vars(args).items(): - if key == 'command': + if key == "command": continue # Handle positional arguments (no -- prefix) - if key in ['url', 'directory', 'file', 'job_id', 'skill_directory', 'zip_file', 'config', 'input_file']: - if value is not None and value != '': + if key in [ + "url", + "directory", + "file", + "job_id", + "skill_directory", + "zip_file", + "config", + "input_file", + ]: + if value is not None and value != "": argv.append(str(value)) continue @@ -172,7 +181,7 @@ def main(argv: list[str] | None = None) -> int: return 1 # Special handling for 'analyze' command (has post-processing) - if args.command == 'analyze': + if args.command == "analyze": return _handle_analyze_command(args) # Standard delegation for all other commands @@ -200,6 +209,7 @@ def main(argv: list[str] | None = None) -> int: # Show traceback in verbose mode import traceback + if hasattr(args, "verbose") and getattr(args, "verbose", False): traceback.print_exc() @@ -226,13 +236,16 @@ def _handle_analyze_command(args: argparse.Namespace) -> int: # Handle preset flags (depth and features) if args.quick: - sys.argv.extend([ - "--depth", "surface", - "--skip-patterns", - "--skip-test-examples", - "--skip-how-to-guides", - "--skip-config-patterns", - ]) + sys.argv.extend( + [ + "--depth", + "surface", + "--skip-patterns", + "--skip-test-examples", + "--skip-how-to-guides", + "--skip-config-patterns", + ] + ) elif args.comprehensive: sys.argv.extend(["--depth", "full"]) elif args.depth: @@ -246,6 +259,7 @@ def _handle_analyze_command(args: argparse.Namespace) -> int: elif args.enhance: try: from skill_seekers.cli.config_manager import get_config_manager + config = get_config_manager() enhance_level = config.get_default_enhance_level() except Exception: diff --git a/src/skill_seekers/cli/multilang_support.py b/src/skill_seekers/cli/multilang_support.py index 8074ef4..e456742 100644 --- a/src/skill_seekers/cli/multilang_support.py +++ b/src/skill_seekers/cli/multilang_support.py @@ -15,6 +15,7 @@ import json @dataclass class LanguageInfo: """Language information for a document.""" + code: str # ISO 639-1 code (e.g., 'en', 'es', 'zh') name: str # Full name (e.g., 'English', 'Spanish', 'Chinese') confidence: float # Detection confidence (0.0-1.0) @@ -24,6 +25,7 @@ class LanguageInfo: @dataclass class TranslationStatus: """Translation status for a document.""" + source_language: str target_languages: list[str] translated_languages: set[str] @@ -40,74 +42,81 @@ class LanguageDetector: # Common word patterns by language LANGUAGE_PATTERNS = { - 'en': [ - r'\b(the|and|is|are|in|to|of|for|with|on|at|by|from)\b', - r'\b(this|that|these|those|what|which|who|where|when)\b', + "en": [ + r"\b(the|and|is|are|in|to|of|for|with|on|at|by|from)\b", + r"\b(this|that|these|those|what|which|who|where|when)\b", ], - 'es': [ - r'\b(el|la|los|las|de|en|y|a|es|por|para|con|su)\b', - r'\b(que|no|un|una|como|más|pero|muy|todo|ya)\b', + "es": [ + r"\b(el|la|los|las|de|en|y|a|es|por|para|con|su)\b", + r"\b(que|no|un|una|como|más|pero|muy|todo|ya)\b", ], - 'fr': [ - r'\b(le|la|les|de|et|en|un|une|pour|dans|que|sur|avec)\b', - r'\b(est|sont|ce|qui|plus|ne|pas|nous|vous|tout)\b', + "fr": [ + r"\b(le|la|les|de|et|en|un|une|pour|dans|que|sur|avec)\b", + r"\b(est|sont|ce|qui|plus|ne|pas|nous|vous|tout)\b", ], - 'de': [ - r'\b(der|die|das|und|in|zu|den|von|ist|mit|für|auf)\b', - r'\b(ein|eine|nicht|sich|auch|werden|an|als|ich|sie)\b', + "de": [ + r"\b(der|die|das|und|in|zu|den|von|ist|mit|für|auf)\b", + r"\b(ein|eine|nicht|sich|auch|werden|an|als|ich|sie)\b", ], - 'zh': [ - r'[\u4e00-\u9fff]', # Chinese characters - r'(的|了|和|是|在|有|我|他|不|这)', + "zh": [ + r"[\u4e00-\u9fff]", # Chinese characters + r"(的|了|和|是|在|有|我|他|不|这)", ], - 'ja': [ - r'[\u3040-\u309f]', # Hiragana - r'[\u30a0-\u30ff]', # Katakana - r'[\u4e00-\u9faf]', # Kanji + "ja": [ + r"[\u3040-\u309f]", # Hiragana + r"[\u30a0-\u30ff]", # Katakana + r"[\u4e00-\u9faf]", # Kanji ], - 'ko': [ - r'[\uac00-\ud7af]', # Hangul - r'(의|가|이|은|들|는|좀|잘|께|을)', + "ko": [ + r"[\uac00-\ud7af]", # Hangul + r"(의|가|이|은|들|는|좀|잘|께|을)", ], - 'ru': [ - r'[\u0400-\u04ff]', # Cyrillic - r'\b(и|в|не|на|с|что|он|по|а|как|это|все)\b', + "ru": [ + r"[\u0400-\u04ff]", # Cyrillic + r"\b(и|в|не|на|с|что|он|по|а|как|это|все)\b", ], - 'pt': [ - r'\b(o|a|de|e|do|da|em|um|para|é|com|não|os|as)\b', - r'\b(que|se|mais|por|dos|das|como|mas|uma|ou)\b', + "pt": [ + r"\b(o|a|de|e|do|da|em|um|para|é|com|não|os|as)\b", + r"\b(que|se|mais|por|dos|das|como|mas|uma|ou)\b", ], - 'it': [ - r'\b(il|la|di|e|a|da|in|che|per|un|una|non|del)\b', - r'\b(con|alla|della|al|nel|sono|come|più|ma|dei)\b', + "it": [ + r"\b(il|la|di|e|a|da|in|che|per|un|una|non|del)\b", + r"\b(con|alla|della|al|nel|sono|come|più|ma|dei)\b", ], - 'ar': [ - r'[\u0600-\u06ff]', # Arabic - r'(في|من|على|إلى|هذا|ما|أن|كان|هو|التي)', + "ar": [ + r"[\u0600-\u06ff]", # Arabic + r"(في|من|على|إلى|هذا|ما|أن|كان|هو|التي)", ], } # Language names LANGUAGE_NAMES = { - 'en': 'English', - 'es': 'Spanish', - 'fr': 'French', - 'de': 'German', - 'zh': 'Chinese', - 'ja': 'Japanese', - 'ko': 'Korean', - 'ru': 'Russian', - 'pt': 'Portuguese', - 'it': 'Italian', - 'ar': 'Arabic', + "en": "English", + "es": "Spanish", + "fr": "French", + "de": "German", + "zh": "Chinese", + "ja": "Japanese", + "ko": "Korean", + "ru": "Russian", + "pt": "Portuguese", + "it": "Italian", + "ar": "Arabic", } # Script types SCRIPTS = { - 'en': 'Latin', 'es': 'Latin', 'fr': 'Latin', 'de': 'Latin', - 'pt': 'Latin', 'it': 'Latin', - 'zh': 'Han', 'ja': 'Japanese', 'ko': 'Hangul', - 'ru': 'Cyrillic', 'ar': 'Arabic', + "en": "Latin", + "es": "Latin", + "fr": "Latin", + "de": "Latin", + "pt": "Latin", + "it": "Latin", + "zh": "Han", + "ja": "Japanese", + "ko": "Hangul", + "ru": "Cyrillic", + "ar": "Arabic", } def detect(self, text: str, sample_size: int = 2000) -> LanguageInfo: @@ -122,7 +131,7 @@ class LanguageDetector: LanguageInfo with detected language """ if not text.strip(): - return LanguageInfo('en', 'English', 0.0) + return LanguageInfo("en", "English", 0.0) # Sample text for efficiency sample = text[:sample_size].lower() @@ -140,7 +149,7 @@ class LanguageDetector: # Find best match if not scores or max(scores.values()) == 0: # Default to English - return LanguageInfo('en', 'English', 0.1) + return LanguageInfo("en", "English", 0.1) best_lang = max(scores, key=scores.get) total_score = sum(scores.values()) @@ -150,7 +159,7 @@ class LanguageDetector: code=best_lang, name=self.LANGUAGE_NAMES.get(best_lang, best_lang.upper()), confidence=min(confidence, 1.0), - script=self.SCRIPTS.get(best_lang) + script=self.SCRIPTS.get(best_lang), ) def detect_from_filename(self, filename: str) -> str | None: @@ -170,12 +179,12 @@ class LanguageDetector: ISO 639-1 language code or None """ # Pattern: file.en.md - match = re.search(r'\.([a-z]{2})\.md$', filename) + match = re.search(r"\.([a-z]{2})\.md$", filename) if match and match.group(1) in self.LANGUAGE_NAMES: return match.group(1) # Pattern: file_en.md or file-en.md - match = re.search(r'[_-]([a-z]{2})\.md$', filename) + match = re.search(r"[_-]([a-z]{2})\.md$", filename) if match and match.group(1) in self.LANGUAGE_NAMES: return match.group(1) @@ -200,7 +209,7 @@ class MultiLanguageManager: file_path: str, content: str, metadata: dict | None = None, - force_language: str | None = None + force_language: str | None = None, ) -> None: """ Add document with language detection. @@ -218,7 +227,7 @@ class MultiLanguageManager: code=lang_code, name=self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()), confidence=1.0, - script=self.detector.SCRIPTS.get(lang_code) + script=self.detector.SCRIPTS.get(lang_code), ) else: # Try filename pattern first @@ -229,7 +238,7 @@ class MultiLanguageManager: code=lang_code, name=self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()), confidence=0.95, - script=self.detector.SCRIPTS.get(lang_code) + script=self.detector.SCRIPTS.get(lang_code), ) else: # Detect from content @@ -245,13 +254,13 @@ class MultiLanguageManager: self.documents[lang_code] = [] doc = { - 'file_path': file_path, - 'content': content, - 'language': lang_info.code, - 'language_name': lang_info.name, - 'confidence': lang_info.confidence, - 'script': lang_info.script, - 'metadata': metadata or {} + "file_path": file_path, + "content": content, + "language": lang_info.code, + "language_name": lang_info.name, + "confidence": lang_info.confidence, + "script": lang_info.script, + "metadata": metadata or {}, } self.documents[lang_code].append(doc) @@ -284,7 +293,7 @@ class MultiLanguageManager: Returns: Translation status summary """ - base_lang = base_language or self.primary_language or 'en' + base_lang = base_language or self.primary_language or "en" all_languages = set(self.documents.keys()) base_count = self.get_document_count(base_lang) @@ -295,7 +304,7 @@ class MultiLanguageManager: target_languages=[], translated_languages=set(), missing_languages=set(), - completeness=0.0 + completeness=0.0, ) # Check which languages have translations @@ -305,7 +314,7 @@ class MultiLanguageManager: translated.add(lang) # Commonly expected languages for completeness - expected_languages = {'en', 'es', 'fr', 'de', 'zh', 'ja'} + expected_languages = {"en", "es", "fr", "de", "zh", "ja"} missing = expected_languages - all_languages completeness = len(all_languages) / len(expected_languages) @@ -315,7 +324,7 @@ class MultiLanguageManager: target_languages=list(all_languages - {base_lang}), translated_languages=translated, missing_languages=missing, - completeness=min(completeness, 1.0) + completeness=min(completeness, 1.0), ) def export_by_language(self, output_dir: Path) -> dict[str, Path]: @@ -337,10 +346,10 @@ class MultiLanguageManager: lang_file = output_dir / f"documents_{lang_code}.json" export_data = { - 'language': lang_code, - 'language_name': self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()), - 'document_count': len(docs), - 'documents': docs + "language": lang_code, + "language_name": self.detector.LANGUAGE_NAMES.get(lang_code, lang_code.upper()), + "document_count": len(docs), + "documents": docs, } lang_file.write_text(json.dumps(export_data, indent=2, ensure_ascii=False)) @@ -419,9 +428,7 @@ def main(): skill_md = skill_dir / "SKILL.md" if skill_md.exists(): manager.add_document( - "SKILL.md", - skill_md.read_text(encoding="utf-8"), - {"category": "overview"} + "SKILL.md", skill_md.read_text(encoding="utf-8"), {"category": "overview"} ) # Load reference files @@ -429,9 +436,7 @@ def main(): if refs_dir.exists(): for ref_file in refs_dir.glob("*.md"): manager.add_document( - ref_file.name, - ref_file.read_text(encoding="utf-8"), - {"category": ref_file.stem} + ref_file.name, ref_file.read_text(encoding="utf-8"), {"category": ref_file.stem} ) # Detect languages @@ -460,4 +465,5 @@ def main(): if __name__ == "__main__": import sys + sys.exit(main()) diff --git a/src/skill_seekers/cli/package_skill.py b/src/skill_seekers/cli/package_skill.py index 9be22b1..6a31744 100644 --- a/src/skill_seekers/cli/package_skill.py +++ b/src/skill_seekers/cli/package_skill.py @@ -113,7 +113,15 @@ def package_skill( output_dir = skill_path.parent # Auto-enable chunking for RAG platforms - RAG_PLATFORMS = ['langchain', 'llama-index', 'haystack', 'weaviate', 'chroma', 'faiss', 'qdrant'] + RAG_PLATFORMS = [ + "langchain", + "llama-index", + "haystack", + "weaviate", + "chroma", + "faiss", + "qdrant", + ] if target in RAG_PLATFORMS and not enable_chunking: print(f"ℹ️ Auto-enabling chunking for {target} platform") @@ -126,17 +134,19 @@ def package_skill( if streaming: print(f" Mode: Streaming (chunk_size={chunk_size}, overlap={chunk_overlap})") elif enable_chunking: - print(f" Chunking: Enabled (max_tokens={chunk_max_tokens}, preserve_code={preserve_code_blocks})") + print( + f" Chunking: Enabled (max_tokens={chunk_max_tokens}, preserve_code={preserve_code_blocks})" + ) try: # Use streaming if requested and supported - if streaming and hasattr(adaptor, 'package_streaming'): + if streaming and hasattr(adaptor, "package_streaming"): package_path = adaptor.package_streaming( skill_path, output_dir, chunk_size=chunk_size, chunk_overlap=chunk_overlap, - batch_size=batch_size + batch_size=batch_size, ) elif streaming: print("⚠️ Streaming not supported for this platform, using standard packaging") @@ -145,7 +155,7 @@ def package_skill( output_dir, enable_chunking=enable_chunking, chunk_max_tokens=chunk_max_tokens, - preserve_code_blocks=preserve_code_blocks + preserve_code_blocks=preserve_code_blocks, ) else: package_path = adaptor.package( @@ -153,7 +163,7 @@ def package_skill( output_dir, enable_chunking=enable_chunking, chunk_max_tokens=chunk_max_tokens, - preserve_code_blocks=preserve_code_blocks + preserve_code_blocks=preserve_code_blocks, ) print(f" Output: {package_path}") @@ -212,7 +222,19 @@ Examples: parser.add_argument( "--target", - choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "haystack", "weaviate", "chroma", "faiss", "qdrant"], + choices=[ + "claude", + "gemini", + "openai", + "markdown", + "langchain", + "llama-index", + "haystack", + "weaviate", + "chroma", + "faiss", + "qdrant", + ], default="claude", help="Target LLM platform (default: claude)", ) diff --git a/src/skill_seekers/cli/parsers/__init__.py b/src/skill_seekers/cli/parsers/__init__.py index f6c59a8..0db900a 100644 --- a/src/skill_seekers/cli/parsers/__init__.py +++ b/src/skill_seekers/cli/parsers/__init__.py @@ -3,6 +3,7 @@ This module registers all subcommand parsers and provides a factory function to create them. """ + from .base import SubcommandParser # Import all parser classes diff --git a/src/skill_seekers/cli/parsers/analyze_parser.py b/src/skill_seekers/cli/parsers/analyze_parser.py index e6c792e..34e1d1c 100644 --- a/src/skill_seekers/cli/parsers/analyze_parser.py +++ b/src/skill_seekers/cli/parsers/analyze_parser.py @@ -1,4 +1,5 @@ """Analyze subcommand parser.""" + from .base import SubcommandParser @@ -21,26 +22,26 @@ class AnalyzeParser(SubcommandParser): """Add analyze-specific arguments.""" parser.add_argument("--directory", required=True, help="Directory to analyze") parser.add_argument( - "--output", default="output/codebase/", help="Output directory (default: output/codebase/)" + "--output", + default="output/codebase/", + help="Output directory (default: output/codebase/)", ) # Preset selection (NEW - recommended way) parser.add_argument( "--preset", choices=["quick", "standard", "comprehensive"], - help="Analysis preset: quick (1-2 min), standard (5-10 min, DEFAULT), comprehensive (20-60 min)" + help="Analysis preset: quick (1-2 min), standard (5-10 min, DEFAULT), comprehensive (20-60 min)", ) parser.add_argument( - "--preset-list", - action="store_true", - help="Show available presets and exit" + "--preset-list", action="store_true", help="Show available presets and exit" ) # Legacy preset flags (kept for backward compatibility) parser.add_argument( "--quick", action="store_true", - help="[DEPRECATED] Quick analysis - use '--preset quick' instead" + help="[DEPRECATED] Quick analysis - use '--preset quick' instead", ) parser.add_argument( "--comprehensive", @@ -71,15 +72,9 @@ class AnalyzeParser(SubcommandParser): help="AI enhancement level: 0=off, 1=SKILL.md only (default), 2=+Architecture+Config, 3=full", ) parser.add_argument("--skip-api-reference", action="store_true", help="Skip API docs") - parser.add_argument( - "--skip-dependency-graph", action="store_true", help="Skip dep graph" - ) - parser.add_argument( - "--skip-patterns", action="store_true", help="Skip pattern detection" - ) - parser.add_argument( - "--skip-test-examples", action="store_true", help="Skip test examples" - ) + parser.add_argument("--skip-dependency-graph", action="store_true", help="Skip dep graph") + parser.add_argument("--skip-patterns", action="store_true", help="Skip pattern detection") + parser.add_argument("--skip-test-examples", action="store_true", help="Skip test examples") parser.add_argument("--skip-how-to-guides", action="store_true", help="Skip guides") parser.add_argument("--skip-config-patterns", action="store_true", help="Skip config") parser.add_argument( diff --git a/src/skill_seekers/cli/parsers/base.py b/src/skill_seekers/cli/parsers/base.py index 80b535a..765cf01 100644 --- a/src/skill_seekers/cli/parsers/base.py +++ b/src/skill_seekers/cli/parsers/base.py @@ -1,4 +1,5 @@ """Base parser class for subcommands.""" + from abc import ABC, abstractmethod import argparse @@ -48,10 +49,6 @@ class SubcommandParser(ABC): Returns: Configured ArgumentParser for this subcommand """ - parser = subparsers.add_parser( - self.name, - help=self.help, - description=self.description - ) + parser = subparsers.add_parser(self.name, help=self.help, description=self.description) self.add_arguments(parser) return parser diff --git a/src/skill_seekers/cli/parsers/config_parser.py b/src/skill_seekers/cli/parsers/config_parser.py index 7d288ba..f78c36e 100644 --- a/src/skill_seekers/cli/parsers/config_parser.py +++ b/src/skill_seekers/cli/parsers/config_parser.py @@ -1,4 +1,5 @@ """Config subcommand parser.""" + from .base import SubcommandParser @@ -22,9 +23,7 @@ class ConfigParser(SubcommandParser): parser.add_argument( "--github", action="store_true", help="Go directly to GitHub token setup" ) - parser.add_argument( - "--api-keys", action="store_true", help="Go directly to API keys setup" - ) + parser.add_argument("--api-keys", action="store_true", help="Go directly to API keys setup") parser.add_argument( "--show", action="store_true", help="Show current configuration and exit" ) diff --git a/src/skill_seekers/cli/parsers/enhance_parser.py b/src/skill_seekers/cli/parsers/enhance_parser.py index 5bc6d4f..a8c0da6 100644 --- a/src/skill_seekers/cli/parsers/enhance_parser.py +++ b/src/skill_seekers/cli/parsers/enhance_parser.py @@ -1,4 +1,5 @@ """Enhance subcommand parser.""" + from .base import SubcommandParser diff --git a/src/skill_seekers/cli/parsers/enhance_status_parser.py b/src/skill_seekers/cli/parsers/enhance_status_parser.py index 299a4ac..229098c 100644 --- a/src/skill_seekers/cli/parsers/enhance_status_parser.py +++ b/src/skill_seekers/cli/parsers/enhance_status_parser.py @@ -1,4 +1,5 @@ """Enhance-status subcommand parser.""" + from .base import SubcommandParser @@ -20,10 +21,6 @@ class EnhanceStatusParser(SubcommandParser): def add_arguments(self, parser): """Add enhance-status-specific arguments.""" parser.add_argument("skill_directory", help="Skill directory path") - parser.add_argument( - "--watch", "-w", action="store_true", help="Watch in real-time" - ) + parser.add_argument("--watch", "-w", action="store_true", help="Watch in real-time") parser.add_argument("--json", action="store_true", help="JSON output") - parser.add_argument( - "--interval", type=int, default=2, help="Watch interval in seconds" - ) + parser.add_argument("--interval", type=int, default=2, help="Watch interval in seconds") diff --git a/src/skill_seekers/cli/parsers/estimate_parser.py b/src/skill_seekers/cli/parsers/estimate_parser.py index f4c2a14..5a21bdc 100644 --- a/src/skill_seekers/cli/parsers/estimate_parser.py +++ b/src/skill_seekers/cli/parsers/estimate_parser.py @@ -1,4 +1,5 @@ """Estimate subcommand parser.""" + from .base import SubcommandParser diff --git a/src/skill_seekers/cli/parsers/github_parser.py b/src/skill_seekers/cli/parsers/github_parser.py index 1d9801c..ef93342 100644 --- a/src/skill_seekers/cli/parsers/github_parser.py +++ b/src/skill_seekers/cli/parsers/github_parser.py @@ -1,4 +1,5 @@ """GitHub subcommand parser.""" + from .base import SubcommandParser @@ -24,9 +25,7 @@ class GitHubParser(SubcommandParser): parser.add_argument("--name", help="Skill name") parser.add_argument("--description", help="Skill description") parser.add_argument("--enhance", action="store_true", help="AI enhancement (API)") - parser.add_argument( - "--enhance-local", action="store_true", help="AI enhancement (local)" - ) + parser.add_argument("--enhance-local", action="store_true", help="AI enhancement (local)") parser.add_argument("--api-key", type=str, help="Anthropic API key for --enhance") parser.add_argument( "--non-interactive", diff --git a/src/skill_seekers/cli/parsers/install_agent_parser.py b/src/skill_seekers/cli/parsers/install_agent_parser.py index c61714f..884d56e 100644 --- a/src/skill_seekers/cli/parsers/install_agent_parser.py +++ b/src/skill_seekers/cli/parsers/install_agent_parser.py @@ -1,4 +1,5 @@ """Install-agent subcommand parser.""" + from .base import SubcommandParser @@ -19,9 +20,7 @@ class InstallAgentParser(SubcommandParser): def add_arguments(self, parser): """Add install-agent-specific arguments.""" - parser.add_argument( - "skill_directory", help="Skill directory path (e.g., output/react/)" - ) + parser.add_argument("skill_directory", help="Skill directory path (e.g., output/react/)") parser.add_argument( "--agent", required=True, diff --git a/src/skill_seekers/cli/parsers/install_parser.py b/src/skill_seekers/cli/parsers/install_parser.py index f0f58ce..3d48e6d 100644 --- a/src/skill_seekers/cli/parsers/install_parser.py +++ b/src/skill_seekers/cli/parsers/install_parser.py @@ -1,4 +1,5 @@ """Install subcommand parser.""" + from .base import SubcommandParser diff --git a/src/skill_seekers/cli/parsers/multilang_parser.py b/src/skill_seekers/cli/parsers/multilang_parser.py index e92958b..68bdb55 100644 --- a/src/skill_seekers/cli/parsers/multilang_parser.py +++ b/src/skill_seekers/cli/parsers/multilang_parser.py @@ -1,4 +1,5 @@ """Multilang subcommand parser.""" + from .base import SubcommandParser diff --git a/src/skill_seekers/cli/parsers/package_parser.py b/src/skill_seekers/cli/parsers/package_parser.py index 4296437..9c82541 100644 --- a/src/skill_seekers/cli/parsers/package_parser.py +++ b/src/skill_seekers/cli/parsers/package_parser.py @@ -1,4 +1,5 @@ """Package subcommand parser.""" + from .base import SubcommandParser @@ -20,27 +21,72 @@ class PackageParser(SubcommandParser): def add_arguments(self, parser): """Add package-specific arguments.""" parser.add_argument("skill_directory", help="Skill directory path (e.g., output/react/)") - parser.add_argument("--no-open", action="store_true", help="Don't open output folder after packaging") - parser.add_argument("--skip-quality-check", action="store_true", help="Skip quality checks before packaging") + parser.add_argument( + "--no-open", action="store_true", help="Don't open output folder after packaging" + ) + parser.add_argument( + "--skip-quality-check", action="store_true", help="Skip quality checks before packaging" + ) parser.add_argument( "--target", choices=[ - "claude", "gemini", "openai", "markdown", - "langchain", "llama-index", "haystack", - "weaviate", "chroma", "faiss", "qdrant" + "claude", + "gemini", + "openai", + "markdown", + "langchain", + "llama-index", + "haystack", + "weaviate", + "chroma", + "faiss", + "qdrant", ], default="claude", help="Target LLM platform (default: claude)", ) - parser.add_argument("--upload", action="store_true", help="Automatically upload after packaging (requires platform API key)") + parser.add_argument( + "--upload", + action="store_true", + help="Automatically upload after packaging (requires platform API key)", + ) # Streaming options - parser.add_argument("--streaming", action="store_true", help="Use streaming ingestion for large docs (memory-efficient)") - parser.add_argument("--chunk-size", type=int, default=4000, help="Maximum characters per chunk (streaming mode, default: 4000)") - parser.add_argument("--chunk-overlap", type=int, default=200, help="Overlap between chunks (streaming mode, default: 200)") - parser.add_argument("--batch-size", type=int, default=100, help="Number of chunks per batch (streaming mode, default: 100)") + parser.add_argument( + "--streaming", + action="store_true", + help="Use streaming ingestion for large docs (memory-efficient)", + ) + parser.add_argument( + "--chunk-size", + type=int, + default=4000, + help="Maximum characters per chunk (streaming mode, default: 4000)", + ) + parser.add_argument( + "--chunk-overlap", + type=int, + default=200, + help="Overlap between chunks (streaming mode, default: 200)", + ) + parser.add_argument( + "--batch-size", + type=int, + default=100, + help="Number of chunks per batch (streaming mode, default: 100)", + ) # RAG chunking options - parser.add_argument("--chunk", action="store_true", help="Enable intelligent chunking for RAG platforms (auto-enabled for RAG adaptors)") - parser.add_argument("--chunk-tokens", type=int, default=512, help="Maximum tokens per chunk (default: 512)") - parser.add_argument("--no-preserve-code", action="store_true", help="Allow code block splitting (default: code blocks preserved)") + parser.add_argument( + "--chunk", + action="store_true", + help="Enable intelligent chunking for RAG platforms (auto-enabled for RAG adaptors)", + ) + parser.add_argument( + "--chunk-tokens", type=int, default=512, help="Maximum tokens per chunk (default: 512)" + ) + parser.add_argument( + "--no-preserve-code", + action="store_true", + help="Allow code block splitting (default: code blocks preserved)", + ) diff --git a/src/skill_seekers/cli/parsers/pdf_parser.py b/src/skill_seekers/cli/parsers/pdf_parser.py index e54242b..6ce91ee 100644 --- a/src/skill_seekers/cli/parsers/pdf_parser.py +++ b/src/skill_seekers/cli/parsers/pdf_parser.py @@ -1,4 +1,5 @@ """PDF subcommand parser.""" + from .base import SubcommandParser diff --git a/src/skill_seekers/cli/parsers/quality_parser.py b/src/skill_seekers/cli/parsers/quality_parser.py index f9750b7..69803fe 100644 --- a/src/skill_seekers/cli/parsers/quality_parser.py +++ b/src/skill_seekers/cli/parsers/quality_parser.py @@ -1,4 +1,5 @@ """Quality subcommand parser.""" + from .base import SubcommandParser diff --git a/src/skill_seekers/cli/parsers/resume_parser.py b/src/skill_seekers/cli/parsers/resume_parser.py index ad8bc2f..9bb5d07 100644 --- a/src/skill_seekers/cli/parsers/resume_parser.py +++ b/src/skill_seekers/cli/parsers/resume_parser.py @@ -1,4 +1,5 @@ """Resume subcommand parser.""" + from .base import SubcommandParser diff --git a/src/skill_seekers/cli/parsers/scrape_parser.py b/src/skill_seekers/cli/parsers/scrape_parser.py index 68fed31..7184802 100644 --- a/src/skill_seekers/cli/parsers/scrape_parser.py +++ b/src/skill_seekers/cli/parsers/scrape_parser.py @@ -1,4 +1,5 @@ """Scrape subcommand parser.""" + from .base import SubcommandParser @@ -24,15 +25,16 @@ class ScrapeParser(SubcommandParser): parser.add_argument("--name", help="Skill name") parser.add_argument("--description", help="Skill description") parser.add_argument( - "--max-pages", type=int, dest="max_pages", help="Maximum pages to scrape (override config)" + "--max-pages", + type=int, + dest="max_pages", + help="Maximum pages to scrape (override config)", ) parser.add_argument( "--skip-scrape", action="store_true", help="Skip scraping, use cached data" ) parser.add_argument("--enhance", action="store_true", help="AI enhancement (API)") - parser.add_argument( - "--enhance-local", action="store_true", help="AI enhancement (local)" - ) + parser.add_argument("--enhance-local", action="store_true", help="AI enhancement (local)") parser.add_argument("--dry-run", action="store_true", help="Dry run mode") parser.add_argument( "--async", dest="async_mode", action="store_true", help="Use async scraping" diff --git a/src/skill_seekers/cli/parsers/stream_parser.py b/src/skill_seekers/cli/parsers/stream_parser.py index 0834a49..6ee513a 100644 --- a/src/skill_seekers/cli/parsers/stream_parser.py +++ b/src/skill_seekers/cli/parsers/stream_parser.py @@ -1,4 +1,5 @@ """Stream subcommand parser.""" + from .base import SubcommandParser diff --git a/src/skill_seekers/cli/parsers/test_examples_parser.py b/src/skill_seekers/cli/parsers/test_examples_parser.py index 76971ff..da2bde9 100644 --- a/src/skill_seekers/cli/parsers/test_examples_parser.py +++ b/src/skill_seekers/cli/parsers/test_examples_parser.py @@ -1,4 +1,5 @@ """Extract-test-examples subcommand parser.""" + from .base import SubcommandParser @@ -19,9 +20,7 @@ class TestExamplesParser(SubcommandParser): def add_arguments(self, parser): """Add extract-test-examples-specific arguments.""" - parser.add_argument( - "directory", nargs="?", help="Directory containing test files" - ) + parser.add_argument("directory", nargs="?", help="Directory containing test files") parser.add_argument("--file", help="Single test file to analyze") parser.add_argument( "--language", help="Filter by programming language (python, javascript, etc.)" @@ -36,6 +35,4 @@ class TestExamplesParser(SubcommandParser): "--max-per-file", type=int, default=10, help="Maximum examples per file (default: 10)" ) parser.add_argument("--json", action="store_true", help="Output JSON format") - parser.add_argument( - "--markdown", action="store_true", help="Output Markdown format" - ) + parser.add_argument("--markdown", action="store_true", help="Output Markdown format") diff --git a/src/skill_seekers/cli/parsers/unified_parser.py b/src/skill_seekers/cli/parsers/unified_parser.py index 8b1d5f2..97b9377 100644 --- a/src/skill_seekers/cli/parsers/unified_parser.py +++ b/src/skill_seekers/cli/parsers/unified_parser.py @@ -1,4 +1,5 @@ """Unified subcommand parser.""" + from .base import SubcommandParser diff --git a/src/skill_seekers/cli/parsers/update_parser.py b/src/skill_seekers/cli/parsers/update_parser.py index 1a90425..eaa321a 100644 --- a/src/skill_seekers/cli/parsers/update_parser.py +++ b/src/skill_seekers/cli/parsers/update_parser.py @@ -1,4 +1,5 @@ """Update subcommand parser.""" + from .base import SubcommandParser diff --git a/src/skill_seekers/cli/parsers/upload_parser.py b/src/skill_seekers/cli/parsers/upload_parser.py index b9c0793..d807b62 100644 --- a/src/skill_seekers/cli/parsers/upload_parser.py +++ b/src/skill_seekers/cli/parsers/upload_parser.py @@ -1,4 +1,5 @@ """Upload subcommand parser.""" + from .base import SubcommandParser @@ -19,7 +20,9 @@ class UploadParser(SubcommandParser): def add_arguments(self, parser): """Add upload-specific arguments.""" - parser.add_argument("package_file", help="Path to skill package file (e.g., output/react.zip)") + parser.add_argument( + "package_file", help="Path to skill package file (e.g., output/react.zip)" + ) parser.add_argument( "--target", @@ -33,22 +36,34 @@ class UploadParser(SubcommandParser): # ChromaDB upload options parser.add_argument( "--chroma-url", - help="ChromaDB URL (default: http://localhost:8000 for HTTP, or use --persist-directory for local)" + help="ChromaDB URL (default: http://localhost:8000 for HTTP, or use --persist-directory for local)", ) parser.add_argument( "--persist-directory", - help="Local directory for persistent ChromaDB storage (default: ./chroma_db)" + help="Local directory for persistent ChromaDB storage (default: ./chroma_db)", ) # Embedding options parser.add_argument( "--embedding-function", choices=["openai", "sentence-transformers", "none"], - help="Embedding function for ChromaDB/Weaviate (default: platform default)" + help="Embedding function for ChromaDB/Weaviate (default: platform default)", + ) + parser.add_argument( + "--openai-api-key", help="OpenAI API key for embeddings (or set OPENAI_API_KEY env var)" ) - parser.add_argument("--openai-api-key", help="OpenAI API key for embeddings (or set OPENAI_API_KEY env var)") # Weaviate upload options - parser.add_argument("--weaviate-url", default="http://localhost:8080", help="Weaviate URL (default: http://localhost:8080)") - parser.add_argument("--use-cloud", action="store_true", help="Use Weaviate Cloud (requires --api-key and --cluster-url)") - parser.add_argument("--cluster-url", help="Weaviate Cloud cluster URL (e.g., https://xxx.weaviate.network)") + parser.add_argument( + "--weaviate-url", + default="http://localhost:8080", + help="Weaviate URL (default: http://localhost:8080)", + ) + parser.add_argument( + "--use-cloud", + action="store_true", + help="Use Weaviate Cloud (requires --api-key and --cluster-url)", + ) + parser.add_argument( + "--cluster-url", help="Weaviate Cloud cluster URL (e.g., https://xxx.weaviate.network)" + ) diff --git a/src/skill_seekers/cli/pattern_recognizer.py b/src/skill_seekers/cli/pattern_recognizer.py index 5664abd..aed5d88 100644 --- a/src/skill_seekers/cli/pattern_recognizer.py +++ b/src/skill_seekers/cli/pattern_recognizer.py @@ -30,14 +30,14 @@ logger = logging.getLogger(__name__) # Confidence thresholds for pattern filtering (Issue #240) CONFIDENCE_THRESHOLDS = { - 'critical': 0.80, # High-confidence patterns for ARCHITECTURE.md - 'high': 0.70, # Include in detailed analysis - 'medium': 0.60, # Include with warning/context - 'low': 0.50, # Minimum detection threshold + "critical": 0.80, # High-confidence patterns for ARCHITECTURE.md + "high": 0.70, # Include in detailed analysis + "medium": 0.60, # Include with warning/context + "low": 0.50, # Minimum detection threshold } # Default minimum confidence for pattern detection -DEFAULT_MIN_CONFIDENCE = CONFIDENCE_THRESHOLDS['low'] +DEFAULT_MIN_CONFIDENCE = CONFIDENCE_THRESHOLDS["low"] @dataclass @@ -1697,9 +1697,11 @@ def create_multi_level_report(pattern_results: list[dict]) -> dict: all_patterns_sorted = sorted(all_patterns, key=lambda p: p.get("confidence", 0.0), reverse=True) # Filter by confidence levels - critical = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS['critical']) - high_confidence = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS['high']) - medium = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS['medium']) + critical = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS["critical"]) + high_confidence = filter_patterns_by_confidence( + all_patterns_sorted, CONFIDENCE_THRESHOLDS["high"] + ) + medium = filter_patterns_by_confidence(all_patterns_sorted, CONFIDENCE_THRESHOLDS["medium"]) return { "all_patterns": all_patterns_sorted, diff --git a/src/skill_seekers/cli/presets.py b/src/skill_seekers/cli/presets.py index 402d31c..55b573c 100644 --- a/src/skill_seekers/cli/presets.py +++ b/src/skill_seekers/cli/presets.py @@ -3,6 +3,7 @@ Provides predefined analysis configurations with clear trade-offs between speed and comprehensiveness. """ + from dataclasses import dataclass @@ -13,6 +14,7 @@ class AnalysisPreset: Defines a complete analysis configuration including depth, feature flags, and AI enhancement level. """ + name: str description: str depth: str # surface, deep, full @@ -29,54 +31,52 @@ PRESETS = { description="Fast basic analysis (1-2 min, essential features only)", depth="surface", features={ - "api_reference": True, # ON - Essential for API docs + "api_reference": True, # ON - Essential for API docs "dependency_graph": False, # OFF - Slow, not critical for quick - "patterns": False, # OFF - Slow pattern detection - "test_examples": False, # OFF - Time-consuming extraction - "how_to_guides": False, # OFF - Requires AI enhancement - "config_patterns": False, # OFF - Not critical for quick scan - "docs": True, # ON - README/docs are essential + "patterns": False, # OFF - Slow pattern detection + "test_examples": False, # OFF - Time-consuming extraction + "how_to_guides": False, # OFF - Requires AI enhancement + "config_patterns": False, # OFF - Not critical for quick scan + "docs": True, # ON - README/docs are essential }, enhance_level=0, # No AI enhancement (fast) estimated_time="1-2 minutes", - icon="⚡" + icon="⚡", ), - "standard": AnalysisPreset( name="Standard", description="Balanced analysis (5-10 min, core features, DEFAULT)", depth="deep", features={ - "api_reference": True, # ON - Core feature - "dependency_graph": True, # ON - Valuable insights - "patterns": True, # ON - Design pattern detection - "test_examples": True, # ON - Real usage examples - "how_to_guides": False, # OFF - Requires AI (slow) - "config_patterns": True, # ON - Configuration docs - "docs": True, # ON - Project documentation + "api_reference": True, # ON - Core feature + "dependency_graph": True, # ON - Valuable insights + "patterns": True, # ON - Design pattern detection + "test_examples": True, # ON - Real usage examples + "how_to_guides": False, # OFF - Requires AI (slow) + "config_patterns": True, # ON - Configuration docs + "docs": True, # ON - Project documentation }, enhance_level=1, # SKILL.md enhancement only estimated_time="5-10 minutes", - icon="🎯" + icon="🎯", ), - "comprehensive": AnalysisPreset( name="Comprehensive", description="Full analysis (20-60 min, all features + AI)", depth="full", features={ - "api_reference": True, # ON - Complete API docs - "dependency_graph": True, # ON - Full dependency analysis - "patterns": True, # ON - All design patterns - "test_examples": True, # ON - All test examples - "how_to_guides": True, # ON - AI-generated guides - "config_patterns": True, # ON - All configuration patterns - "docs": True, # ON - All project docs + "api_reference": True, # ON - Complete API docs + "dependency_graph": True, # ON - Full dependency analysis + "patterns": True, # ON - All design patterns + "test_examples": True, # ON - All test examples + "how_to_guides": True, # ON - AI-generated guides + "config_patterns": True, # ON - All configuration patterns + "docs": True, # ON - All project docs }, enhance_level=3, # Full AI enhancement (all features) estimated_time="20-60 minutes", - icon="🚀" - ) + icon="🚀", + ), } @@ -142,10 +142,7 @@ class PresetManager: raise ValueError(f"Unknown preset: {preset_name}") # Start with preset defaults - updated_args = { - 'depth': preset.depth, - 'enhance_level': preset.enhance_level - } + updated_args = {"depth": preset.depth, "enhance_level": preset.enhance_level} # Convert feature flags to skip_* arguments # feature=False → skip_feature=True (disabled) diff --git a/src/skill_seekers/cli/quality_metrics.py b/src/skill_seekers/cli/quality_metrics.py index 915ff47..c8f8d0d 100644 --- a/src/skill_seekers/cli/quality_metrics.py +++ b/src/skill_seekers/cli/quality_metrics.py @@ -16,6 +16,7 @@ from enum import Enum class MetricLevel(Enum): """Metric severity level.""" + INFO = "info" WARNING = "warning" ERROR = "error" @@ -25,6 +26,7 @@ class MetricLevel(Enum): @dataclass class QualityMetric: """Individual quality metric.""" + name: str value: float # 0.0-1.0 (or 0-100 percentage) level: MetricLevel @@ -35,6 +37,7 @@ class QualityMetric: @dataclass class QualityScore: """Overall quality score.""" + total_score: float # 0-100 completeness: float # 0-100 accuracy: float # 0-100 @@ -46,6 +49,7 @@ class QualityScore: @dataclass class QualityReport: """Complete quality report.""" + timestamp: str skill_name: str overall_score: QualityScore @@ -64,10 +68,17 @@ class QualityAnalyzer: # Thresholds for quality grades GRADE_THRESHOLDS = { - 'A+': 95, 'A': 90, 'A-': 85, - 'B+': 80, 'B': 75, 'B-': 70, - 'C+': 65, 'C': 60, 'C-': 55, - 'D': 50, 'F': 0 + "A+": 95, + "A": 90, + "A-": 85, + "B+": 80, + "B": 75, + "B-": 70, + "C+": 65, + "C": 60, + "C-": 55, + "D": 50, + "F": 0, } def __init__(self, skill_dir: Path): @@ -102,7 +113,7 @@ class QualityAnalyzer: score += 10 # Has sections (10 points) - if content.count('#') >= 5: + if content.count("#") >= 5: score += 10 # References directory (20 points) @@ -134,13 +145,15 @@ class QualityAnalyzer: if len(suggestions) == 0: suggestions.append("Expand documentation coverage") - self.metrics.append(QualityMetric( - name="Completeness", - value=completeness, - level=level, - description=f"Documentation completeness: {completeness:.1f}%", - suggestions=suggestions - )) + self.metrics.append( + QualityMetric( + name="Completeness", + value=completeness, + level=level, + description=f"Documentation completeness: {completeness:.1f}%", + suggestions=suggestions, + ) + ) return completeness @@ -166,14 +179,14 @@ class QualityAnalyzer: content = skill_md.read_text(encoding="utf-8") # Check for TODO markers (deduct 5 points each, max 20) - todo_count = content.lower().count('todo') + todo_count = content.lower().count("todo") if todo_count > 0: deduction = min(todo_count * 5, 20) score -= deduction issues.append(f"Found {todo_count} TODO markers") # Check for placeholder text (deduct 10) - placeholders = ['lorem ipsum', 'placeholder', 'coming soon'] + placeholders = ["lorem ipsum", "placeholder", "coming soon"] for placeholder in placeholders: if placeholder in content.lower(): score -= 10 @@ -195,13 +208,15 @@ class QualityAnalyzer: if accuracy < 100 and issues: suggestions.extend(issues[:3]) # Top 3 issues - self.metrics.append(QualityMetric( - name="Accuracy", - value=accuracy, - level=level, - description=f"Documentation accuracy: {accuracy:.1f}%", - suggestions=suggestions - )) + self.metrics.append( + QualityMetric( + name="Accuracy", + value=accuracy, + level=level, + description=f"Documentation accuracy: {accuracy:.1f}%", + suggestions=suggestions, + ) + ) return accuracy @@ -234,13 +249,13 @@ class QualityAnalyzer: # Check for specific types (20 points each) ref_names = [f.stem.lower() for f in ref_files] - if any('getting' in name or 'start' in name for name in ref_names): + if any("getting" in name or "start" in name for name in ref_names): score += 20 - if any('api' in name or 'reference' in name for name in ref_names): + if any("api" in name or "reference" in name for name in ref_names): score += 20 - if any('example' in name or 'tutorial' in name for name in ref_names): + if any("example" in name or "tutorial" in name for name in ref_names): score += 20 # Has diverse content (10 points) @@ -258,13 +273,15 @@ class QualityAnalyzer: suggestions.append("Add API reference documentation") suggestions.append("Expand documentation coverage") - self.metrics.append(QualityMetric( - name="Coverage", - value=coverage, - level=level, - description=f"Documentation coverage: {coverage:.1f}%", - suggestions=suggestions - )) + self.metrics.append( + QualityMetric( + name="Coverage", + value=coverage, + level=level, + description=f"Documentation coverage: {coverage:.1f}%", + suggestions=suggestions, + ) + ) return coverage @@ -308,56 +325,54 @@ class QualityAnalyzer: if health < 100: suggestions.extend(issues[:3]) - self.metrics.append(QualityMetric( - name="Health", - value=health, - level=level, - description=f"Skill health: {health:.1f}%", - suggestions=suggestions - )) + self.metrics.append( + QualityMetric( + name="Health", + value=health, + level=level, + description=f"Skill health: {health:.1f}%", + suggestions=suggestions, + ) + ) return health def calculate_statistics(self) -> dict[str, Any]: """Calculate skill statistics.""" stats = { - 'total_files': 0, - 'total_size_bytes': 0, - 'markdown_files': 0, - 'reference_files': 0, - 'total_characters': 0, - 'total_words': 0 + "total_files": 0, + "total_size_bytes": 0, + "markdown_files": 0, + "reference_files": 0, + "total_characters": 0, + "total_words": 0, } # Count files and sizes for md_file in self.skill_dir.rglob("*.md"): - stats['total_files'] += 1 - stats['markdown_files'] += 1 + stats["total_files"] += 1 + stats["markdown_files"] += 1 size = md_file.stat().st_size - stats['total_size_bytes'] += size + stats["total_size_bytes"] += size # Count words try: content = md_file.read_text(encoding="utf-8") - stats['total_characters'] += len(content) - stats['total_words'] += len(content.split()) + stats["total_characters"] += len(content) + stats["total_words"] += len(content.split()) except Exception: pass # Count references refs_dir = self.skill_dir / "references" if refs_dir.exists(): - stats['reference_files'] = len(list(refs_dir.glob("*.md"))) + stats["reference_files"] = len(list(refs_dir.glob("*.md"))) self.statistics = stats return stats def calculate_overall_score( - self, - completeness: float, - accuracy: float, - coverage: float, - health: float + self, completeness: float, accuracy: float, coverage: float, health: float ) -> QualityScore: """ Calculate overall quality score. @@ -368,15 +383,10 @@ class QualityAnalyzer: - Coverage: 25% - Health: 20% """ - total = ( - completeness * 0.30 + - accuracy * 0.25 + - coverage * 0.25 + - health * 0.20 - ) + total = completeness * 0.30 + accuracy * 0.25 + coverage * 0.25 + health * 0.20 # Determine grade - grade = 'F' + grade = "F" for g, threshold in self.GRADE_THRESHOLDS.items(): if total >= threshold: grade = g @@ -388,7 +398,7 @@ class QualityAnalyzer: accuracy=accuracy, coverage=coverage, health=health, - grade=grade + grade=grade, ) def generate_recommendations(self, score: QualityScore) -> list[str]: @@ -431,9 +441,7 @@ class QualityAnalyzer: health = self.analyze_health() # Calculate overall score - overall_score = self.calculate_overall_score( - completeness, accuracy, coverage, health - ) + overall_score = self.calculate_overall_score(completeness, accuracy, coverage, health) # Calculate statistics stats = self.calculate_statistics() @@ -447,7 +455,7 @@ class QualityAnalyzer: overall_score=overall_score, metrics=self.metrics, statistics=stats, - recommendations=recommendations + recommendations=recommendations, ) def format_report(self, report: QualityReport) -> str: @@ -484,7 +492,7 @@ class QualityAnalyzer: MetricLevel.INFO: "✅", MetricLevel.WARNING: "⚠️", MetricLevel.ERROR: "❌", - MetricLevel.CRITICAL: "🔴" + MetricLevel.CRITICAL: "🔴", }.get(metric.level, "ℹ️") lines.append(f" {icon} {metric.name}: {metric.value:.1f}%") @@ -553,4 +561,5 @@ def main(): if __name__ == "__main__": import sys + sys.exit(main()) diff --git a/src/skill_seekers/cli/rag_chunker.py b/src/skill_seekers/cli/rag_chunker.py index d2f3c50..23f8340 100644 --- a/src/skill_seekers/cli/rag_chunker.py +++ b/src/skill_seekers/cli/rag_chunker.py @@ -75,10 +75,7 @@ class RAGChunker: return len(text) // self.chars_per_token def chunk_document( - self, - text: str, - metadata: dict, - source_file: str | None = None + self, text: str, metadata: dict, source_file: str | None = None ) -> list[dict]: """ Chunk single document into RAG-ready chunks. @@ -125,11 +122,13 @@ class RAGChunker: if source_file: chunk_metadata["source_file"] = source_file - result.append({ - "chunk_id": f"{metadata.get('source', 'unknown')}_{i}", - "page_content": chunk_text.strip(), - "metadata": chunk_metadata - }) + result.append( + { + "chunk_id": f"{metadata.get('source', 'unknown')}_{i}", + "page_content": chunk_text.strip(), + "metadata": chunk_metadata, + } + ) logger.info( f"Created {len(result)} chunks from {source_file or 'document'} " @@ -153,14 +152,10 @@ class RAGChunker: # Chunk main SKILL.md skill_md = skill_dir / "SKILL.md" if skill_md.exists(): - with open(skill_md, encoding='utf-8') as f: + with open(skill_md, encoding="utf-8") as f: content = f.read() - metadata = { - "source": skill_dir.name, - "category": "overview", - "file_type": "skill_md" - } + metadata = {"source": skill_dir.name, "category": "overview", "file_type": "skill_md"} chunks = self.chunk_document(content, metadata, source_file="SKILL.md") all_chunks.extend(chunks) @@ -169,26 +164,21 @@ class RAGChunker: references_dir = skill_dir / "references" if references_dir.exists(): for ref_file in references_dir.glob("*.md"): - with open(ref_file, encoding='utf-8') as f: + with open(ref_file, encoding="utf-8") as f: content = f.read() metadata = { "source": skill_dir.name, "category": ref_file.stem, - "file_type": "reference" + "file_type": "reference", } chunks = self.chunk_document( - content, - metadata, - source_file=str(ref_file.relative_to(skill_dir)) + content, metadata, source_file=str(ref_file.relative_to(skill_dir)) ) all_chunks.extend(chunks) - logger.info( - f"Chunked skill directory {skill_dir.name}: " - f"{len(all_chunks)} total chunks" - ) + logger.info(f"Chunked skill directory {skill_dir.name}: {len(all_chunks)} total chunks") return all_chunks @@ -207,32 +197,25 @@ class RAGChunker: # Match code blocks (``` fenced blocks) # Use DOTALL flag to match across newlines - code_block_pattern = r'```[^\n]*\n.*?```' + code_block_pattern = r"```[^\n]*\n.*?```" def replacer(match): idx = len(code_blocks) - code_blocks.append({ - "index": idx, - "content": match.group(0), - "start": match.start(), - "end": match.end() - }) + code_blocks.append( + { + "index": idx, + "content": match.group(0), + "start": match.start(), + "end": match.end(), + } + ) return placeholder_pattern.format(idx=idx) - text_with_placeholders = re.sub( - code_block_pattern, - replacer, - text, - flags=re.DOTALL - ) + text_with_placeholders = re.sub(code_block_pattern, replacer, text, flags=re.DOTALL) return text_with_placeholders, code_blocks - def _reinsert_code_blocks( - self, - chunks: list[str], - code_blocks: list[dict] - ) -> list[str]: + def _reinsert_code_blocks(self, chunks: list[str], code_blocks: list[dict]) -> list[str]: """ Re-insert code blocks into chunks. @@ -249,7 +232,7 @@ class RAGChunker: for block in code_blocks: placeholder = f"<>" if placeholder in chunk: - chunk = chunk.replace(placeholder, block['content']) + chunk = chunk.replace(placeholder, block["content"]) result.append(chunk) return result @@ -268,15 +251,15 @@ class RAGChunker: # Paragraph boundaries (double newline) if self.preserve_paragraphs: - for match in re.finditer(r'\n\n+', text): + for match in re.finditer(r"\n\n+", text): boundaries.append(match.end()) # Section headers (# Header) - for match in re.finditer(r'\n#{1,6}\s+.+\n', text): + for match in re.finditer(r"\n#{1,6}\s+.+\n", text): boundaries.append(match.start()) # Single newlines (less preferred, but useful) - for match in re.finditer(r'\n', text): + for match in re.finditer(r"\n", text): boundaries.append(match.start()) # Add artificial boundaries for large documents @@ -352,7 +335,9 @@ class RAGChunker: # Add chunk if it meets minimum size requirement # (unless the entire text is smaller than target size) - if chunk_text.strip() and (len(text) <= target_size_chars or len(chunk_text) >= min_size_chars): + if chunk_text.strip() and ( + len(text) <= target_size_chars or len(chunk_text) >= min_size_chars + ): chunks.append(chunk_text) # Move to next chunk with overlap @@ -383,7 +368,7 @@ class RAGChunker: """ output_path.parent.mkdir(parents=True, exist_ok=True) - with open(output_path, 'w', encoding='utf-8') as f: + with open(output_path, "w", encoding="utf-8") as f: json.dump(chunks, f, indent=2, ensure_ascii=False) logger.info(f"Saved {len(chunks)} chunks to {output_path}") @@ -393,7 +378,9 @@ def main(): """CLI entry point for testing RAG chunker.""" import argparse - parser = argparse.ArgumentParser(description="RAG Chunker - Semantic chunking for RAG pipelines") + parser = argparse.ArgumentParser( + description="RAG Chunker - Semantic chunking for RAG pipelines" + ) parser.add_argument("skill_dir", type=Path, help="Path to skill directory") parser.add_argument("--output", "-o", type=Path, help="Output JSON file") parser.add_argument("--chunk-size", type=int, default=512, help="Target chunk size in tokens") diff --git a/src/skill_seekers/cli/storage/__init__.py b/src/skill_seekers/cli/storage/__init__.py index 3689310..123190d 100644 --- a/src/skill_seekers/cli/storage/__init__.py +++ b/src/skill_seekers/cli/storage/__init__.py @@ -59,27 +59,26 @@ def get_storage_adaptor(provider: str, **kwargs) -> BaseStorageAdaptor: account_name='myaccount') """ adaptors = { - 's3': S3StorageAdaptor, - 'gcs': GCSStorageAdaptor, - 'azure': AzureStorageAdaptor, + "s3": S3StorageAdaptor, + "gcs": GCSStorageAdaptor, + "azure": AzureStorageAdaptor, } provider_lower = provider.lower() if provider_lower not in adaptors: - supported = ', '.join(adaptors.keys()) + supported = ", ".join(adaptors.keys()) raise ValueError( - f"Unsupported storage provider: {provider}. " - f"Supported providers: {supported}" + f"Unsupported storage provider: {provider}. Supported providers: {supported}" ) return adaptors[provider_lower](**kwargs) __all__ = [ - 'BaseStorageAdaptor', - 'StorageObject', - 'S3StorageAdaptor', - 'GCSStorageAdaptor', - 'AzureStorageAdaptor', - 'get_storage_adaptor', + "BaseStorageAdaptor", + "StorageObject", + "S3StorageAdaptor", + "GCSStorageAdaptor", + "AzureStorageAdaptor", + "get_storage_adaptor", ] diff --git a/src/skill_seekers/cli/storage/azure_storage.py b/src/skill_seekers/cli/storage/azure_storage.py index 55687e1..1588cf9 100644 --- a/src/skill_seekers/cli/storage/azure_storage.py +++ b/src/skill_seekers/cli/storage/azure_storage.py @@ -9,6 +9,7 @@ from datetime import datetime, timedelta try: from azure.storage.blob import BlobServiceClient, BlobSasPermissions, generate_blob_sas from azure.core.exceptions import ResourceNotFoundError + AZURE_AVAILABLE = True except ImportError: AZURE_AVAILABLE = False @@ -65,38 +66,30 @@ class AzureStorageAdaptor(BaseStorageAdaptor): "Install with: pip install azure-storage-blob" ) - if 'container' not in kwargs: + if "container" not in kwargs: raise ValueError("container parameter is required for Azure storage") - self.container_name = kwargs['container'] + self.container_name = kwargs["container"] # Initialize BlobServiceClient - if 'connection_string' in kwargs: - connection_string = kwargs['connection_string'] + if "connection_string" in kwargs: + connection_string = kwargs["connection_string"] else: - connection_string = os.getenv('AZURE_STORAGE_CONNECTION_STRING') + connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING") if connection_string: - self.blob_service_client = BlobServiceClient.from_connection_string( - connection_string - ) + self.blob_service_client = BlobServiceClient.from_connection_string(connection_string) # Extract account name from connection string self.account_name = None self.account_key = None - for part in connection_string.split(';'): - if part.startswith('AccountName='): - self.account_name = part.split('=', 1)[1] - elif part.startswith('AccountKey='): - self.account_key = part.split('=', 1)[1] + for part in connection_string.split(";"): + if part.startswith("AccountName="): + self.account_name = part.split("=", 1)[1] + elif part.startswith("AccountKey="): + self.account_key = part.split("=", 1)[1] else: - account_name = kwargs.get( - 'account_name', - os.getenv('AZURE_STORAGE_ACCOUNT_NAME') - ) - account_key = kwargs.get( - 'account_key', - os.getenv('AZURE_STORAGE_ACCOUNT_KEY') - ) + account_name = kwargs.get("account_name", os.getenv("AZURE_STORAGE_ACCOUNT_NAME")) + account_key = kwargs.get("account_key", os.getenv("AZURE_STORAGE_ACCOUNT_KEY")) if not account_name or not account_key: raise ValueError( @@ -108,13 +101,10 @@ class AzureStorageAdaptor(BaseStorageAdaptor): self.account_key = account_key account_url = f"https://{account_name}.blob.core.windows.net" self.blob_service_client = BlobServiceClient( - account_url=account_url, - credential=account_key + account_url=account_url, credential=account_key ) - self.container_client = self.blob_service_client.get_container_client( - self.container_name - ) + self.container_client = self.blob_service_client.get_container_client(self.container_name) def upload_file( self, local_path: str, remote_path: str, metadata: dict[str, str] | None = None @@ -128,11 +118,7 @@ class AzureStorageAdaptor(BaseStorageAdaptor): blob_client = self.container_client.get_blob_client(remote_path) with open(local_file, "rb") as data: - blob_client.upload_blob( - data, - overwrite=True, - metadata=metadata - ) + blob_client.upload_blob(data, overwrite=True, metadata=metadata) return f"https://{self.account_name}.blob.core.windows.net/{self.container_name}/{remote_path}" except Exception as e: @@ -164,25 +150,26 @@ class AzureStorageAdaptor(BaseStorageAdaptor): except Exception as e: raise Exception(f"Azure deletion failed: {e}") from e - def list_files( - self, prefix: str = "", max_results: int = 1000 - ) -> list[StorageObject]: + def list_files(self, prefix: str = "", max_results: int = 1000) -> list[StorageObject]: """List files in Azure container.""" try: blobs = self.container_client.list_blobs( - name_starts_with=prefix, - results_per_page=max_results + name_starts_with=prefix, results_per_page=max_results ) files = [] for blob in blobs: - files.append(StorageObject( - key=blob.name, - size=blob.size, - last_modified=blob.last_modified.isoformat() if blob.last_modified else None, - etag=blob.etag, - metadata=blob.metadata - )) + files.append( + StorageObject( + key=blob.name, + size=blob.size, + last_modified=blob.last_modified.isoformat() + if blob.last_modified + else None, + etag=blob.etag, + metadata=blob.metadata, + ) + ) return files except Exception as e: @@ -205,9 +192,7 @@ class AzureStorageAdaptor(BaseStorageAdaptor): raise FileNotFoundError(f"Remote file not found: {remote_path}") if not self.account_name or not self.account_key: - raise ValueError( - "Account name and key are required for SAS URL generation" - ) + raise ValueError("Account name and key are required for SAS URL generation") sas_token = generate_blob_sas( account_name=self.account_name, @@ -215,7 +200,7 @@ class AzureStorageAdaptor(BaseStorageAdaptor): blob_name=remote_path, account_key=self.account_key, permission=BlobSasPermissions(read=True), - expiry=datetime.utcnow() + timedelta(seconds=expires_in) + expiry=datetime.utcnow() + timedelta(seconds=expires_in), ) return f"{blob_client.url}?{sas_token}" @@ -239,12 +224,13 @@ class AzureStorageAdaptor(BaseStorageAdaptor): # Wait for copy to complete properties = dest_blob.get_blob_properties() - while properties.copy.status == 'pending': + while properties.copy.status == "pending": import time + time.sleep(0.1) properties = dest_blob.get_blob_properties() - if properties.copy.status != 'success': + if properties.copy.status != "success": raise Exception(f"Copy failed with status: {properties.copy.status}") except FileNotFoundError: diff --git a/src/skill_seekers/cli/storage/base_storage.py b/src/skill_seekers/cli/storage/base_storage.py index dd64c8b..d2b16cf 100644 --- a/src/skill_seekers/cli/storage/base_storage.py +++ b/src/skill_seekers/cli/storage/base_storage.py @@ -95,9 +95,7 @@ class BaseStorageAdaptor(ABC): pass @abstractmethod - def list_files( - self, prefix: str = "", max_results: int = 1000 - ) -> list[StorageObject]: + def list_files(self, prefix: str = "", max_results: int = 1000) -> list[StorageObject]: """ List files in cloud storage. @@ -191,9 +189,7 @@ class BaseStorageAdaptor(ABC): return uploaded_files - def download_directory( - self, remote_prefix: str, local_dir: str - ) -> list[str]: + def download_directory(self, remote_prefix: str, local_dir: str) -> list[str]: """ Download directory from cloud storage. @@ -245,9 +241,7 @@ class BaseStorageAdaptor(ABC): raise FileNotFoundError(f"File not found: {remote_path}") return files[0].size - def copy_file( - self, source_path: str, dest_path: str - ) -> None: + def copy_file(self, source_path: str, dest_path: str) -> None: """ Copy file within cloud storage. diff --git a/src/skill_seekers/cli/storage/gcs_storage.py b/src/skill_seekers/cli/storage/gcs_storage.py index 5e091c5..1330186 100644 --- a/src/skill_seekers/cli/storage/gcs_storage.py +++ b/src/skill_seekers/cli/storage/gcs_storage.py @@ -9,6 +9,7 @@ from datetime import timedelta try: from google.cloud import storage from google.cloud.exceptions import NotFound + GCS_AVAILABLE = True except ImportError: GCS_AVAILABLE = False @@ -63,19 +64,19 @@ class GCSStorageAdaptor(BaseStorageAdaptor): "Install with: pip install google-cloud-storage" ) - if 'bucket' not in kwargs: + if "bucket" not in kwargs: raise ValueError("bucket parameter is required for GCS storage") - self.bucket_name = kwargs['bucket'] - self.project = kwargs.get('project', os.getenv('GOOGLE_CLOUD_PROJECT')) + self.bucket_name = kwargs["bucket"] + self.project = kwargs.get("project", os.getenv("GOOGLE_CLOUD_PROJECT")) # Initialize GCS client client_kwargs = {} if self.project: - client_kwargs['project'] = self.project + client_kwargs["project"] = self.project - if 'credentials_path' in kwargs: - os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = kwargs['credentials_path'] + if "credentials_path" in kwargs: + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = kwargs["credentials_path"] self.storage_client = storage.Client(**client_kwargs) self.bucket = self.storage_client.bucket(self.bucket_name) @@ -122,26 +123,24 @@ class GCSStorageAdaptor(BaseStorageAdaptor): except Exception as e: raise Exception(f"GCS deletion failed: {e}") from e - def list_files( - self, prefix: str = "", max_results: int = 1000 - ) -> list[StorageObject]: + def list_files(self, prefix: str = "", max_results: int = 1000) -> list[StorageObject]: """List files in GCS bucket.""" try: blobs = self.storage_client.list_blobs( - self.bucket_name, - prefix=prefix, - max_results=max_results + self.bucket_name, prefix=prefix, max_results=max_results ) files = [] for blob in blobs: - files.append(StorageObject( - key=blob.name, - size=blob.size, - last_modified=blob.updated.isoformat() if blob.updated else None, - etag=blob.etag, - metadata=blob.metadata - )) + files.append( + StorageObject( + key=blob.name, + size=blob.size, + last_modified=blob.updated.isoformat() if blob.updated else None, + etag=blob.etag, + metadata=blob.metadata, + ) + ) return files except Exception as e: @@ -164,9 +163,7 @@ class GCSStorageAdaptor(BaseStorageAdaptor): raise FileNotFoundError(f"Remote file not found: {remote_path}") url = blob.generate_signed_url( - version="v4", - expiration=timedelta(seconds=expires_in), - method="GET" + version="v4", expiration=timedelta(seconds=expires_in), method="GET" ) return url except FileNotFoundError: @@ -182,11 +179,7 @@ class GCSStorageAdaptor(BaseStorageAdaptor): if not source_blob.exists(): raise FileNotFoundError(f"Source file not found: {source_path}") - self.bucket.copy_blob( - source_blob, - self.bucket, - dest_path - ) + self.bucket.copy_blob(source_blob, self.bucket, dest_path) except FileNotFoundError: raise except Exception as e: diff --git a/src/skill_seekers/cli/storage/s3_storage.py b/src/skill_seekers/cli/storage/s3_storage.py index 57af41b..40f8d78 100644 --- a/src/skill_seekers/cli/storage/s3_storage.py +++ b/src/skill_seekers/cli/storage/s3_storage.py @@ -8,6 +8,7 @@ from pathlib import Path try: import boto3 from botocore.exceptions import ClientError + BOTO3_AVAILABLE = True except ImportError: BOTO3_AVAILABLE = False @@ -63,33 +64,30 @@ class S3StorageAdaptor(BaseStorageAdaptor): super().__init__(**kwargs) if not BOTO3_AVAILABLE: - raise ImportError( - "boto3 is required for S3 storage. " - "Install with: pip install boto3" - ) + raise ImportError("boto3 is required for S3 storage. Install with: pip install boto3") - if 'bucket' not in kwargs: + if "bucket" not in kwargs: raise ValueError("bucket parameter is required for S3 storage") - self.bucket = kwargs['bucket'] - self.region = kwargs.get('region', os.getenv('AWS_DEFAULT_REGION', 'us-east-1')) + self.bucket = kwargs["bucket"] + self.region = kwargs.get("region", os.getenv("AWS_DEFAULT_REGION", "us-east-1")) # Initialize S3 client client_kwargs = { - 'region_name': self.region, + "region_name": self.region, } - if 'endpoint_url' in kwargs: - client_kwargs['endpoint_url'] = kwargs['endpoint_url'] + if "endpoint_url" in kwargs: + client_kwargs["endpoint_url"] = kwargs["endpoint_url"] - if 'aws_access_key_id' in kwargs: - client_kwargs['aws_access_key_id'] = kwargs['aws_access_key_id'] + if "aws_access_key_id" in kwargs: + client_kwargs["aws_access_key_id"] = kwargs["aws_access_key_id"] - if 'aws_secret_access_key' in kwargs: - client_kwargs['aws_secret_access_key'] = kwargs['aws_secret_access_key'] + if "aws_secret_access_key" in kwargs: + client_kwargs["aws_secret_access_key"] = kwargs["aws_secret_access_key"] - self.s3_client = boto3.client('s3', **client_kwargs) - self.s3_resource = boto3.resource('s3', **client_kwargs) + self.s3_client = boto3.client("s3", **client_kwargs) + self.s3_resource = boto3.resource("s3", **client_kwargs) def upload_file( self, local_path: str, remote_path: str, metadata: dict[str, str] | None = None @@ -101,14 +99,14 @@ class S3StorageAdaptor(BaseStorageAdaptor): extra_args = {} if metadata: - extra_args['Metadata'] = metadata + extra_args["Metadata"] = metadata try: self.s3_client.upload_file( str(local_file), self.bucket, remote_path, - ExtraArgs=extra_args if extra_args else None + ExtraArgs=extra_args if extra_args else None, ) return f"s3://{self.bucket}/{remote_path}" except ClientError as e: @@ -120,50 +118,41 @@ class S3StorageAdaptor(BaseStorageAdaptor): local_file.parent.mkdir(parents=True, exist_ok=True) try: - self.s3_client.download_file( - self.bucket, - remote_path, - str(local_file) - ) + self.s3_client.download_file(self.bucket, remote_path, str(local_file)) except ClientError as e: - if e.response['Error']['Code'] == '404': + if e.response["Error"]["Code"] == "404": raise FileNotFoundError(f"Remote file not found: {remote_path}") from e raise Exception(f"S3 download failed: {e}") from e def delete_file(self, remote_path: str) -> None: """Delete file from S3.""" try: - self.s3_client.delete_object( - Bucket=self.bucket, - Key=remote_path - ) + self.s3_client.delete_object(Bucket=self.bucket, Key=remote_path) except ClientError as e: raise Exception(f"S3 deletion failed: {e}") from e - def list_files( - self, prefix: str = "", max_results: int = 1000 - ) -> list[StorageObject]: + def list_files(self, prefix: str = "", max_results: int = 1000) -> list[StorageObject]: """List files in S3 bucket.""" try: - paginator = self.s3_client.get_paginator('list_objects_v2') + paginator = self.s3_client.get_paginator("list_objects_v2") page_iterator = paginator.paginate( - Bucket=self.bucket, - Prefix=prefix, - PaginationConfig={'MaxItems': max_results} + Bucket=self.bucket, Prefix=prefix, PaginationConfig={"MaxItems": max_results} ) files = [] for page in page_iterator: - if 'Contents' not in page: + if "Contents" not in page: continue - for obj in page['Contents']: - files.append(StorageObject( - key=obj['Key'], - size=obj['Size'], - last_modified=obj['LastModified'].isoformat(), - etag=obj.get('ETag', '').strip('"') - )) + for obj in page["Contents"]: + files.append( + StorageObject( + key=obj["Key"], + size=obj["Size"], + last_modified=obj["LastModified"].isoformat(), + etag=obj.get("ETag", "").strip('"'), + ) + ) return files except ClientError as e: @@ -172,13 +161,10 @@ class S3StorageAdaptor(BaseStorageAdaptor): def file_exists(self, remote_path: str) -> bool: """Check if file exists in S3.""" try: - self.s3_client.head_object( - Bucket=self.bucket, - Key=remote_path - ) + self.s3_client.head_object(Bucket=self.bucket, Key=remote_path) return True except ClientError as e: - if e.response['Error']['Code'] == '404': + if e.response["Error"]["Code"] == "404": return False raise Exception(f"S3 head_object failed: {e}") from e @@ -186,12 +172,9 @@ class S3StorageAdaptor(BaseStorageAdaptor): """Generate presigned URL for S3 object.""" try: url = self.s3_client.generate_presigned_url( - 'get_object', - Params={ - 'Bucket': self.bucket, - 'Key': remote_path - }, - ExpiresIn=expires_in + "get_object", + Params={"Bucket": self.bucket, "Key": remote_path}, + ExpiresIn=expires_in, ) return url except ClientError as e: @@ -200,16 +183,9 @@ class S3StorageAdaptor(BaseStorageAdaptor): def copy_file(self, source_path: str, dest_path: str) -> None: """Copy file within S3 bucket (server-side copy).""" try: - copy_source = { - 'Bucket': self.bucket, - 'Key': source_path - } - self.s3_client.copy_object( - CopySource=copy_source, - Bucket=self.bucket, - Key=dest_path - ) + copy_source = {"Bucket": self.bucket, "Key": source_path} + self.s3_client.copy_object(CopySource=copy_source, Bucket=self.bucket, Key=dest_path) except ClientError as e: - if e.response['Error']['Code'] == '404': + if e.response["Error"]["Code"] == "404": raise FileNotFoundError(f"Source file not found: {source_path}") from e raise Exception(f"S3 copy failed: {e}") from e diff --git a/src/skill_seekers/cli/streaming_ingest.py b/src/skill_seekers/cli/streaming_ingest.py index dd732ad..152e4fc 100644 --- a/src/skill_seekers/cli/streaming_ingest.py +++ b/src/skill_seekers/cli/streaming_ingest.py @@ -17,6 +17,7 @@ import time @dataclass class ChunkMetadata: """Metadata for a document chunk.""" + chunk_id: str source: str category: str @@ -30,6 +31,7 @@ class ChunkMetadata: @dataclass class IngestionProgress: """Progress tracking for streaming ingestion.""" + total_documents: int processed_documents: int total_chunks: int @@ -81,7 +83,7 @@ class StreamingIngester: chunk_size: int = 4000, chunk_overlap: int = 200, batch_size: int = 100, - max_memory_mb: int = 500 + max_memory_mb: int = 500, ): """ Initialize streaming ingester. @@ -103,7 +105,7 @@ class StreamingIngester: content: str, metadata: dict, chunk_size: int | None = None, - chunk_overlap: int | None = None + chunk_overlap: int | None = None, ) -> Iterator[tuple[str, ChunkMetadata]]: """ Split document into overlapping chunks. @@ -130,7 +132,7 @@ class StreamingIngester: chunk_index=0, total_chunks=1, char_start=0, - char_end=len(content) + char_end=len(content), ) yield content, chunk_meta return @@ -162,7 +164,7 @@ class StreamingIngester: chunk_index=i, total_chunks=total_chunks, char_start=start, - char_end=end + char_end=end, ) yield chunk_text, chunk_meta @@ -170,17 +172,12 @@ class StreamingIngester: def _generate_chunk_id(self, content: str, metadata: dict, chunk_index: int) -> str: """Generate deterministic chunk ID.""" id_string = ( - f"{metadata.get('source', '')}-" - f"{metadata.get('file', '')}-" - f"{chunk_index}-" - f"{content[:50]}" + f"{metadata.get('source', '')}-{metadata.get('file', '')}-{chunk_index}-{content[:50]}" ) return hashlib.md5(id_string.encode()).hexdigest() def stream_skill_directory( - self, - skill_dir: Path, - callback: callable | None = None + self, skill_dir: Path, callback: callable | None = None ) -> Iterator[tuple[str, dict]]: """ Stream all documents from skill directory. @@ -218,7 +215,7 @@ class StreamingIngester: processed_chunks=0, failed_chunks=0, bytes_processed=0, - start_time=time.time() + start_time=time.time(), ) # Process each document @@ -235,11 +232,13 @@ class StreamingIngester: "category": category, "file": filename, "type": "documentation" if filename == "SKILL.md" else "reference", - "version": "1.0.0" + "version": "1.0.0", } # Chunk document and yield chunks - for chunk_count, (chunk_text, chunk_meta) in enumerate(self.chunk_document(content, metadata), start=1): + for chunk_count, (chunk_text, chunk_meta) in enumerate( + self.chunk_document(content, metadata), start=1 + ): self.progress.total_chunks += 1 # Convert chunk metadata to dict @@ -272,9 +271,7 @@ class StreamingIngester: continue def batch_iterator( - self, - chunks: Iterator[tuple[str, dict]], - batch_size: int | None = None + self, chunks: Iterator[tuple[str, dict]], batch_size: int | None = None ) -> Iterator[list[tuple[str, dict]]]: """ Group chunks into batches for efficient processing. @@ -321,7 +318,7 @@ class StreamingIngester: "failed_chunks": self.progress.failed_chunks, "bytes_processed": self.progress.bytes_processed, }, - "state": state + "state": state, } checkpoint_path.write_text(json.dumps(checkpoint_data, indent=2)) @@ -384,23 +381,25 @@ def main(): parser = argparse.ArgumentParser(description="Stream and chunk skill documents") parser.add_argument("input", help="Input file or directory path") parser.add_argument("--chunk-size", type=int, default=4000, help="Chunk size in characters") - parser.add_argument("--chunk-overlap", type=int, default=200, help="Chunk overlap in characters") + parser.add_argument( + "--chunk-overlap", type=int, default=200, help="Chunk overlap in characters" + ) parser.add_argument("--batch-size", type=int, default=100, help="Batch size for processing") parser.add_argument("--checkpoint", help="Checkpoint file path") args = parser.parse_args() # Initialize ingester ingester = StreamingIngester( - chunk_size=args.chunk_size, - chunk_overlap=args.chunk_overlap, - batch_size=args.batch_size + chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap, batch_size=args.batch_size ) # Progress callback def on_progress(progress: IngestionProgress): if progress.processed_chunks % 10 == 0: - print(f"Progress: {progress.progress_percent:.1f}% - " - f"{progress.processed_chunks}/{progress.total_chunks} chunks") + print( + f"Progress: {progress.progress_percent:.1f}% - " + f"{progress.processed_chunks}/{progress.total_chunks} chunks" + ) # Stream input input_path = Path(args.input) @@ -416,17 +415,23 @@ def main(): metadata = {"source": input_path.stem, "file": input_path.name} file_chunks = ingester.chunk_document(content, metadata) # Convert to generator format matching stream_skill_directory - chunks = ((text, { - "content": text, - "chunk_id": meta.chunk_id, - "source": meta.source, - "category": meta.category, - "file": meta.file, - "chunk_index": meta.chunk_index, - "total_chunks": meta.total_chunks, - "char_start": meta.char_start, - "char_end": meta.char_end, - }) for text, meta in file_chunks) + chunks = ( + ( + text, + { + "content": text, + "chunk_id": meta.chunk_id, + "source": meta.source, + "category": meta.category, + "file": meta.file, + "chunk_index": meta.chunk_index, + "total_chunks": meta.total_chunks, + "char_start": meta.char_start, + "char_end": meta.char_end, + }, + ) + for text, meta in file_chunks + ) # Process in batches all_chunks = [] @@ -437,8 +442,7 @@ def main(): # Save checkpoint if specified if args.checkpoint: ingester.save_checkpoint( - Path(args.checkpoint), - {"processed_batches": len(all_chunks) // args.batch_size} + Path(args.checkpoint), {"processed_batches": len(all_chunks) // args.batch_size} ) # Final progress @@ -449,4 +453,5 @@ def main(): if __name__ == "__main__": import sys + sys.exit(main()) diff --git a/src/skill_seekers/cli/sync_cli.py b/src/skill_seekers/cli/sync_cli.py index ff54d8c..616c307 100644 --- a/src/skill_seekers/cli/sync_cli.py +++ b/src/skill_seekers/cli/sync_cli.py @@ -22,9 +22,7 @@ def handle_signal(_signum, _frame): def start_command(args): """Start monitoring.""" monitor = SyncMonitor( - config_path=args.config, - check_interval=args.interval, - auto_update=args.auto_update + config_path=args.config, check_interval=args.interval, auto_update=args.auto_update ) # Register signal handlers @@ -42,6 +40,7 @@ def start_command(args): # Keep running while True: import time + time.sleep(1) except KeyboardInterrupt: @@ -53,7 +52,7 @@ def check_command(args): """Check for changes once.""" monitor = SyncMonitor( config_path=args.config, - check_interval=3600 # Not used for single check + check_interval=3600, # Not used for single check ) print(f"🔍 Checking {args.config} for changes...") @@ -82,7 +81,7 @@ def check_command(args): print(f" • {change.url}") if change.diff and args.diff: print(f" Diff preview (first 5 lines):") - for line in change.diff.split('\n')[:5]: + for line in change.diff.split("\n")[:5]: print(f" {line}") if report.deleted: @@ -95,10 +94,7 @@ def check_command(args): def stats_command(args): """Show monitoring statistics.""" - monitor = SyncMonitor( - config_path=args.config, - check_interval=3600 - ) + monitor = SyncMonitor(config_path=args.config, check_interval=3600) stats = monitor.stats() @@ -117,7 +113,7 @@ def reset_command(args): state_file = Path(f"{args.skill_name}_sync.json") if state_file.exists(): - if args.force or input(f"⚠️ Reset state for {args.skill_name}? [y/N]: ").lower() == 'y': + if args.force or input(f"⚠️ Reset state for {args.skill_name}? [y/N]: ").lower() == "y": state_file.unlink() print(f"✅ State reset for {args.skill_name}") else: @@ -129,7 +125,7 @@ def reset_command(args): def main(): """Main entry point.""" parser = argparse.ArgumentParser( - description='Monitor documentation for changes and update skills', + description="Monitor documentation for changes and update skills", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: @@ -153,52 +149,39 @@ Examples: # Reset state skill-seekers-sync reset --skill-name react - """ + """, ) - subparsers = parser.add_subparsers(dest='command', help='Command to execute') + subparsers = parser.add_subparsers(dest="command", help="Command to execute") # Start command - start_parser = subparsers.add_parser('start', help='Start continuous monitoring') - start_parser.add_argument('--config', required=True, help='Path to skill config file') + start_parser = subparsers.add_parser("start", help="Start continuous monitoring") + start_parser.add_argument("--config", required=True, help="Path to skill config file") start_parser.add_argument( - '--interval', '-i', + "--interval", + "-i", type=int, default=3600, - help='Check interval in seconds (default: 3600 = 1 hour)' + help="Check interval in seconds (default: 3600 = 1 hour)", ) start_parser.add_argument( - '--auto-update', - action='store_true', - help='Automatically rebuild skill on changes' + "--auto-update", action="store_true", help="Automatically rebuild skill on changes" ) # Check command - check_parser = subparsers.add_parser('check', help='Check for changes once') - check_parser.add_argument('--config', required=True, help='Path to skill config file') - check_parser.add_argument( - '--diff', '-d', - action='store_true', - help='Generate content diffs' - ) - check_parser.add_argument( - '--verbose', '-v', - action='store_true', - help='Show detailed output' - ) + check_parser = subparsers.add_parser("check", help="Check for changes once") + check_parser.add_argument("--config", required=True, help="Path to skill config file") + check_parser.add_argument("--diff", "-d", action="store_true", help="Generate content diffs") + check_parser.add_argument("--verbose", "-v", action="store_true", help="Show detailed output") # Stats command - stats_parser = subparsers.add_parser('stats', help='Show monitoring statistics') - stats_parser.add_argument('--config', required=True, help='Path to skill config file') + stats_parser = subparsers.add_parser("stats", help="Show monitoring statistics") + stats_parser.add_argument("--config", required=True, help="Path to skill config file") # Reset command - reset_parser = subparsers.add_parser('reset', help='Reset monitoring state') - reset_parser.add_argument('--skill-name', required=True, help='Skill name') - reset_parser.add_argument( - '--force', '-f', - action='store_true', - help='Skip confirmation' - ) + reset_parser = subparsers.add_parser("reset", help="Reset monitoring state") + reset_parser.add_argument("--skill-name", required=True, help="Skill name") + reset_parser.add_argument("--force", "-f", action="store_true", help="Skip confirmation") args = parser.parse_args() @@ -207,18 +190,18 @@ Examples: sys.exit(1) try: - if args.command == 'start': + if args.command == "start": start_command(args) - elif args.command == 'check': + elif args.command == "check": check_command(args) - elif args.command == 'stats': + elif args.command == "stats": stats_command(args) - elif args.command == 'reset': + elif args.command == "reset": reset_command(args) except Exception as e: print(f"\n❌ Error: {e}", file=sys.stderr) sys.exit(1) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/skill_seekers/cli/upload_skill.py b/src/skill_seekers/cli/upload_skill.py index bd245dd..5ade937 100755 --- a/src/skill_seekers/cli/upload_skill.py +++ b/src/skill_seekers/cli/upload_skill.py @@ -59,7 +59,7 @@ def upload_skill_api(package_path, target="claude", api_key=None, **kwargs): api_key = os.environ.get(adaptor.get_env_var_name(), "").strip() # API key validation only for platforms that require it - if target in ['claude', 'gemini', 'openai']: + if target in ["claude", "gemini", "openai"]: if not api_key: return False, f"{adaptor.get_env_var_name()} not set. Export your API key first." @@ -172,41 +172,39 @@ Examples: # ChromaDB upload options parser.add_argument( "--chroma-url", - help="ChromaDB URL (default: http://localhost:8000 for HTTP, or use --persist-directory for local)" + help="ChromaDB URL (default: http://localhost:8000 for HTTP, or use --persist-directory for local)", ) parser.add_argument( "--persist-directory", - help="Local directory for persistent ChromaDB storage (default: ./chroma_db)" + help="Local directory for persistent ChromaDB storage (default: ./chroma_db)", ) parser.add_argument( "--embedding-function", choices=["openai", "sentence-transformers", "none"], - help="Embedding function for ChromaDB/Weaviate (default: platform default)" + help="Embedding function for ChromaDB/Weaviate (default: platform default)", ) parser.add_argument( - "--openai-api-key", - help="OpenAI API key for embeddings (or set OPENAI_API_KEY env var)" + "--openai-api-key", help="OpenAI API key for embeddings (or set OPENAI_API_KEY env var)" ) # Weaviate upload options parser.add_argument( "--weaviate-url", default="http://localhost:8080", - help="Weaviate URL (default: http://localhost:8080)" + help="Weaviate URL (default: http://localhost:8080)", ) parser.add_argument( "--use-cloud", action="store_true", - help="Use Weaviate Cloud (requires --api-key and --cluster-url)" + help="Use Weaviate Cloud (requires --api-key and --cluster-url)", ) parser.add_argument( - "--cluster-url", - help="Weaviate Cloud cluster URL (e.g., https://xxx.weaviate.network)" + "--cluster-url", help="Weaviate Cloud cluster URL (e.g., https://xxx.weaviate.network)" ) args = parser.parse_args() @@ -214,28 +212,30 @@ Examples: # Build kwargs for vector DB upload upload_kwargs = {} - if args.target == 'chroma': + if args.target == "chroma": if args.chroma_url: - upload_kwargs['chroma_url'] = args.chroma_url + upload_kwargs["chroma_url"] = args.chroma_url if args.persist_directory: - upload_kwargs['persist_directory'] = args.persist_directory + upload_kwargs["persist_directory"] = args.persist_directory if args.embedding_function: - upload_kwargs['embedding_function'] = args.embedding_function + upload_kwargs["embedding_function"] = args.embedding_function if args.openai_api_key: - upload_kwargs['openai_api_key'] = args.openai_api_key + upload_kwargs["openai_api_key"] = args.openai_api_key - elif args.target == 'weaviate': - upload_kwargs['weaviate_url'] = args.weaviate_url - upload_kwargs['use_cloud'] = args.use_cloud + elif args.target == "weaviate": + upload_kwargs["weaviate_url"] = args.weaviate_url + upload_kwargs["use_cloud"] = args.use_cloud if args.cluster_url: - upload_kwargs['cluster_url'] = args.cluster_url + upload_kwargs["cluster_url"] = args.cluster_url if args.embedding_function: - upload_kwargs['embedding_function'] = args.embedding_function + upload_kwargs["embedding_function"] = args.embedding_function if args.openai_api_key: - upload_kwargs['openai_api_key'] = args.openai_api_key + upload_kwargs["openai_api_key"] = args.openai_api_key # Upload skill - success, message = upload_skill_api(args.package_file, args.target, args.api_key, **upload_kwargs) + success, message = upload_skill_api( + args.package_file, args.target, args.api_key, **upload_kwargs + ) if success: sys.exit(0) diff --git a/src/skill_seekers/embedding/__init__.py b/src/skill_seekers/embedding/__init__.py index a37b71e..d2cedaf 100644 --- a/src/skill_seekers/embedding/__init__.py +++ b/src/skill_seekers/embedding/__init__.py @@ -23,9 +23,9 @@ from .generator import EmbeddingGenerator from .cache import EmbeddingCache __all__ = [ - 'EmbeddingRequest', - 'EmbeddingResponse', - 'BatchEmbeddingRequest', - 'EmbeddingGenerator', - 'EmbeddingCache', + "EmbeddingRequest", + "EmbeddingResponse", + "BatchEmbeddingRequest", + "EmbeddingGenerator", + "EmbeddingCache", ] diff --git a/src/skill_seekers/embedding/cache.py b/src/skill_seekers/embedding/cache.py index fcb4bda..fcb31a0 100644 --- a/src/skill_seekers/embedding/cache.py +++ b/src/skill_seekers/embedding/cache.py @@ -74,12 +74,7 @@ class EmbeddingCache: self.conn.commit() - def set( - self, - hash_key: str, - embedding: list[float], - model: str - ) -> None: + def set(self, hash_key: str, embedding: list[float], model: str) -> None: """ Store embedding in cache. @@ -94,11 +89,14 @@ class EmbeddingCache: embedding_json = json.dumps(embedding) dimensions = len(embedding) - cursor.execute(""" + cursor.execute( + """ INSERT OR REPLACE INTO embeddings (hash, embedding, model, dimensions, created_at, accessed_at, access_count) VALUES (?, ?, ?, ?, ?, ?, 1) - """, (hash_key, embedding_json, model, dimensions, now, now)) + """, + (hash_key, embedding_json, model, dimensions, now, now), + ) self.conn.commit() @@ -115,11 +113,14 @@ class EmbeddingCache: cursor = self.conn.cursor() # Get embedding - cursor.execute(""" + cursor.execute( + """ SELECT embedding, created_at FROM embeddings WHERE hash = ? - """, (hash_key,)) + """, + (hash_key,), + ) row = cursor.fetchone() if not row: @@ -136,11 +137,14 @@ class EmbeddingCache: # Update access stats now = datetime.utcnow().isoformat() - cursor.execute(""" + cursor.execute( + """ UPDATE embeddings SET accessed_at = ?, access_count = access_count + 1 WHERE hash = ? - """, (now, hash_key)) + """, + (now, hash_key), + ) self.conn.commit() return json.loads(embedding_json) @@ -178,11 +182,14 @@ class EmbeddingCache: """ cursor = self.conn.cursor() - cursor.execute(""" + cursor.execute( + """ SELECT created_at FROM embeddings WHERE hash = ? - """, (hash_key,)) + """, + (hash_key,), + ) row = cursor.fetchone() if not row: @@ -206,10 +213,13 @@ class EmbeddingCache: """ cursor = self.conn.cursor() - cursor.execute(""" + cursor.execute( + """ DELETE FROM embeddings WHERE hash = ? - """, (hash_key,)) + """, + (hash_key,), + ) self.conn.commit() @@ -226,10 +236,13 @@ class EmbeddingCache: cursor = self.conn.cursor() if model: - cursor.execute(""" + cursor.execute( + """ DELETE FROM embeddings WHERE model = ? - """, (model,)) + """, + (model,), + ) else: cursor.execute("DELETE FROM embeddings") @@ -249,10 +262,13 @@ class EmbeddingCache: cutoff = (datetime.utcnow() - timedelta(days=self.ttl_days)).isoformat() - cursor.execute(""" + cursor.execute( + """ DELETE FROM embeddings WHERE created_at < ? - """, (cutoff,)) + """, + (cutoff,), + ) deleted = cursor.rowcount self.conn.commit() @@ -300,17 +316,19 @@ class EmbeddingCache: LIMIT 10 """) top_accessed = [ - {"hash": row[0], "model": row[1], "access_count": row[2]} - for row in cursor.fetchall() + {"hash": row[0], "model": row[1], "access_count": row[2]} for row in cursor.fetchall() ] # Expired entries cutoff = (datetime.utcnow() - timedelta(days=self.ttl_days)).isoformat() - cursor.execute(""" + cursor.execute( + """ SELECT COUNT(*) FROM embeddings WHERE created_at < ? - """, (cutoff,)) + """, + (cutoff,), + ) expired = cursor.fetchone()[0] return { @@ -318,7 +336,7 @@ class EmbeddingCache: "by_model": by_model, "top_accessed": top_accessed, "expired": expired, - "ttl_days": self.ttl_days + "ttl_days": self.ttl_days, } def close(self): diff --git a/src/skill_seekers/embedding/generator.py b/src/skill_seekers/embedding/generator.py index e7e3528..c65f965 100644 --- a/src/skill_seekers/embedding/generator.py +++ b/src/skill_seekers/embedding/generator.py @@ -9,6 +9,7 @@ import numpy as np # OpenAI support try: from openai import OpenAI + OPENAI_AVAILABLE = True except ImportError: OPENAI_AVAILABLE = False @@ -16,6 +17,7 @@ except ImportError: # Sentence transformers support try: from sentence_transformers import SentenceTransformer + SENTENCE_TRANSFORMERS_AVAILABLE = True except ImportError: SENTENCE_TRANSFORMERS_AVAILABLE = False @@ -23,6 +25,7 @@ except ImportError: # Voyage AI support (recommended by Anthropic for embeddings) try: import voyageai + VOYAGE_AVAILABLE = True except ImportError: VOYAGE_AVAILABLE = False @@ -129,7 +132,7 @@ class EmbeddingGenerator: self, api_key: str | None = None, voyage_api_key: str | None = None, - cache_dir: str | None = None + cache_dir: str | None = None, ): """ Initialize embedding generator. @@ -162,8 +165,7 @@ class EmbeddingGenerator: """Get information about a model.""" if model not in self.MODELS: raise ValueError( - f"Unknown model: {model}. " - f"Available models: {', '.join(self.MODELS.keys())}" + f"Unknown model: {model}. Available models: {', '.join(self.MODELS.keys())}" ) return self.MODELS[model] @@ -171,20 +173,19 @@ class EmbeddingGenerator: """List all available models.""" models = [] for name, info in self.MODELS.items(): - models.append({ - "name": name, - "provider": info["provider"], - "dimensions": info["dimensions"], - "max_tokens": info["max_tokens"], - "cost_per_million": info.get("cost_per_million", 0.0), - }) + models.append( + { + "name": name, + "provider": info["provider"], + "dimensions": info["dimensions"], + "max_tokens": info["max_tokens"], + "cost_per_million": info.get("cost_per_million", 0.0), + } + ) return models def generate( - self, - text: str, - model: str = "text-embedding-3-small", - normalize: bool = True + self, text: str, model: str = "text-embedding-3-small", normalize: bool = True ) -> list[float]: """ Generate embedding for a single text. @@ -218,7 +219,7 @@ class EmbeddingGenerator: texts: list[str], model: str = "text-embedding-3-small", normalize: bool = True, - batch_size: int = 32 + batch_size: int = 32, ) -> tuple[list[list[float]], int]: """ Generate embeddings for multiple texts. @@ -248,24 +249,18 @@ class EmbeddingGenerator: else: raise ValueError(f"Unsupported provider: {provider}") - def _generate_openai( - self, text: str, model: str, normalize: bool - ) -> list[float]: + def _generate_openai(self, text: str, model: str, normalize: bool) -> list[float]: """Generate embedding using OpenAI API.""" if not OPENAI_AVAILABLE: raise ImportError( - "OpenAI is required for OpenAI embeddings. " - "Install with: pip install openai" + "OpenAI is required for OpenAI embeddings. Install with: pip install openai" ) if not self.openai_client: raise ValueError("OpenAI API key not provided") try: - response = self.openai_client.embeddings.create( - input=text, - model=model - ) + response = self.openai_client.embeddings.create(input=text, model=model) embedding = response.data[0].embedding if normalize: @@ -281,8 +276,7 @@ class EmbeddingGenerator: """Generate embeddings using OpenAI API in batches.""" if not OPENAI_AVAILABLE: raise ImportError( - "OpenAI is required for OpenAI embeddings. " - "Install with: pip install openai" + "OpenAI is required for OpenAI embeddings. Install with: pip install openai" ) if not self.openai_client: @@ -292,13 +286,10 @@ class EmbeddingGenerator: # Process in batches for i in range(0, len(texts), batch_size): - batch = texts[i:i + batch_size] + batch = texts[i : i + batch_size] try: - response = self.openai_client.embeddings.create( - input=batch, - model=model - ) + response = self.openai_client.embeddings.create(input=batch, model=model) batch_embeddings = [item.embedding for item in response.data] @@ -313,24 +304,18 @@ class EmbeddingGenerator: dimensions = len(all_embeddings[0]) if all_embeddings else 0 return all_embeddings, dimensions - def _generate_voyage( - self, text: str, model: str, normalize: bool - ) -> list[float]: + def _generate_voyage(self, text: str, model: str, normalize: bool) -> list[float]: """Generate embedding using Voyage AI API.""" if not VOYAGE_AVAILABLE: raise ImportError( - "voyageai is required for Voyage AI embeddings. " - "Install with: pip install voyageai" + "voyageai is required for Voyage AI embeddings. Install with: pip install voyageai" ) if not self.voyage_client: raise ValueError("Voyage API key not provided") try: - result = self.voyage_client.embed( - texts=[text], - model=model - ) + result = self.voyage_client.embed(texts=[text], model=model) embedding = result.embeddings[0] if normalize: @@ -346,8 +331,7 @@ class EmbeddingGenerator: """Generate embeddings using Voyage AI API in batches.""" if not VOYAGE_AVAILABLE: raise ImportError( - "voyageai is required for Voyage AI embeddings. " - "Install with: pip install voyageai" + "voyageai is required for Voyage AI embeddings. Install with: pip install voyageai" ) if not self.voyage_client: @@ -357,13 +341,10 @@ class EmbeddingGenerator: # Process in batches (Voyage AI supports up to 128 texts per request) for i in range(0, len(texts), batch_size): - batch = texts[i:i + batch_size] + batch = texts[i : i + batch_size] try: - result = self.voyage_client.embed( - texts=batch, - model=model - ) + result = self.voyage_client.embed(texts=batch, model=model) batch_embeddings = result.embeddings @@ -378,9 +359,7 @@ class EmbeddingGenerator: dimensions = len(all_embeddings[0]) if all_embeddings else 0 return all_embeddings, dimensions - def _generate_sentence_transformer( - self, text: str, model: str, normalize: bool - ) -> list[float]: + def _generate_sentence_transformer(self, text: str, model: str, normalize: bool) -> list[float]: """Generate embedding using sentence-transformers.""" if not SENTENCE_TRANSFORMERS_AVAILABLE: raise ImportError( @@ -417,10 +396,7 @@ class EmbeddingGenerator: # Generate embeddings in batches embeddings = st_model.encode( - texts, - batch_size=batch_size, - normalize_embeddings=normalize, - show_progress_bar=False + texts, batch_size=batch_size, normalize_embeddings=normalize, show_progress_bar=False ) dimensions = len(embeddings[0]) if len(embeddings) > 0 else 0 diff --git a/src/skill_seekers/embedding/models.py b/src/skill_seekers/embedding/models.py index 0f91862..aa88fae 100644 --- a/src/skill_seekers/embedding/models.py +++ b/src/skill_seekers/embedding/models.py @@ -14,20 +14,14 @@ class EmbeddingRequest(BaseModel): "example": { "text": "This is a test document about Python programming.", "model": "text-embedding-3-small", - "normalize": True + "normalize": True, } } ) text: str = Field(..., description="Text to generate embedding for") - model: str = Field( - default="text-embedding-3-small", - description="Embedding model to use" - ) - normalize: bool = Field( - default=True, - description="Normalize embeddings to unit length" - ) + model: str = Field(default="text-embedding-3-small", description="Embedding model to use") + normalize: bool = Field(default=True, description="Normalize embeddings to unit length") class BatchEmbeddingRequest(BaseModel): @@ -39,27 +33,20 @@ class BatchEmbeddingRequest(BaseModel): "texts": [ "First document about Python", "Second document about JavaScript", - "Third document about Rust" + "Third document about Rust", ], "model": "text-embedding-3-small", "normalize": True, - "batch_size": 32 + "batch_size": 32, } } ) texts: list[str] = Field(..., description="List of texts to embed") - model: str = Field( - default="text-embedding-3-small", - description="Embedding model to use" - ) - normalize: bool = Field( - default=True, - description="Normalize embeddings to unit length" - ) + model: str = Field(default="text-embedding-3-small", description="Embedding model to use") + normalize: bool = Field(default=True, description="Normalize embeddings to unit length") batch_size: int | None = Field( - default=32, - description="Batch size for processing (default: 32)" + default=32, description="Batch size for processing (default: 32)" ) @@ -69,10 +56,7 @@ class EmbeddingResponse(BaseModel): embedding: list[float] = Field(..., description="Generated embedding vector") model: str = Field(..., description="Model used for generation") dimensions: int = Field(..., description="Embedding dimensions") - cached: bool = Field( - default=False, - description="Whether embedding was retrieved from cache" - ) + cached: bool = Field(default=False, description="Whether embedding was retrieved from cache") class BatchEmbeddingResponse(BaseModel): @@ -82,10 +66,7 @@ class BatchEmbeddingResponse(BaseModel): model: str = Field(..., description="Model used for generation") dimensions: int = Field(..., description="Embedding dimensions") count: int = Field(..., description="Number of embeddings generated") - cached_count: int = Field( - default=0, - description="Number of embeddings retrieved from cache" - ) + cached_count: int = Field(default=0, description="Number of embeddings retrieved from cache") class SkillEmbeddingRequest(BaseModel): @@ -97,24 +78,15 @@ class SkillEmbeddingRequest(BaseModel): "skill_path": "/path/to/skill/react", "model": "text-embedding-3-small", "chunk_size": 512, - "overlap": 50 + "overlap": 50, } } ) skill_path: str = Field(..., description="Path to skill directory") - model: str = Field( - default="text-embedding-3-small", - description="Embedding model to use" - ) - chunk_size: int = Field( - default=512, - description="Chunk size for splitting documents (tokens)" - ) - overlap: int = Field( - default=50, - description="Overlap between chunks (tokens)" - ) + model: str = Field(default="text-embedding-3-small", description="Embedding model to use") + chunk_size: int = Field(default=512, description="Chunk size for splitting documents (tokens)") + overlap: int = Field(default=50, description="Overlap between chunks (tokens)") class SkillEmbeddingResponse(BaseModel): @@ -124,10 +96,7 @@ class SkillEmbeddingResponse(BaseModel): total_chunks: int = Field(..., description="Total number of chunks embedded") model: str = Field(..., description="Model used for generation") dimensions: int = Field(..., description="Embedding dimensions") - metadata: dict[str, Any] = Field( - default_factory=dict, - description="Skill metadata" - ) + metadata: dict[str, Any] = Field(default_factory=dict, description="Skill metadata") class HealthResponse(BaseModel): @@ -144,12 +113,13 @@ class ModelInfo(BaseModel): """Information about an embedding model.""" name: str = Field(..., description="Model name") - provider: str = Field(..., description="Model provider (openai, anthropic, sentence-transformers)") + provider: str = Field( + ..., description="Model provider (openai, anthropic, sentence-transformers)" + ) dimensions: int = Field(..., description="Embedding dimensions") max_tokens: int = Field(..., description="Maximum input tokens") cost_per_million: float | None = Field( - None, - description="Cost per million tokens (if applicable)" + None, description="Cost per million tokens (if applicable)" ) diff --git a/src/skill_seekers/embedding/server.py b/src/skill_seekers/embedding/server.py index f01ad16..cb7fb6c 100644 --- a/src/skill_seekers/embedding/server.py +++ b/src/skill_seekers/embedding/server.py @@ -25,6 +25,7 @@ try: from fastapi import FastAPI, HTTPException, Query from fastapi.middleware.cors import CORSMiddleware import uvicorn + FASTAPI_AVAILABLE = True except ImportError: FASTAPI_AVAILABLE = False @@ -51,7 +52,7 @@ if FASTAPI_AVAILABLE: description="Generate embeddings for text and skill content", version="1.0.0", docs_url="/docs", - redoc_url="/redoc" + redoc_url="/redoc", ) # Add CORS middleware @@ -64,13 +65,14 @@ if FASTAPI_AVAILABLE: ) # Initialize generator and cache - cache_dir = os.getenv("EMBEDDING_CACHE_DIR", os.path.expanduser("~/.cache/skill-seekers/embeddings")) + cache_dir = os.getenv( + "EMBEDDING_CACHE_DIR", os.path.expanduser("~/.cache/skill-seekers/embeddings") + ) cache_db = os.path.join(cache_dir, "embeddings.db") cache_enabled = os.getenv("EMBEDDING_CACHE_ENABLED", "true").lower() == "true" generator = EmbeddingGenerator( - api_key=os.getenv("OPENAI_API_KEY"), - voyage_api_key=os.getenv("VOYAGE_API_KEY") + api_key=os.getenv("OPENAI_API_KEY"), voyage_api_key=os.getenv("VOYAGE_API_KEY") ) cache = EmbeddingCache(cache_db) if cache_enabled else None @@ -81,7 +83,7 @@ if FASTAPI_AVAILABLE: "service": "Skill Seekers Embedding API", "version": "1.0.0", "docs": "/docs", - "health": "/health" + "health": "/health", } @app.get("/health", response_model=HealthResponse) @@ -95,7 +97,7 @@ if FASTAPI_AVAILABLE: version="1.0.0", models=models, cache_enabled=cache_enabled, - cache_size=cache_size + cache_size=cache_size, ) @app.get("/models", response_model=ModelsResponse) @@ -109,15 +111,12 @@ if FASTAPI_AVAILABLE: provider=m["provider"], dimensions=m["dimensions"], max_tokens=m["max_tokens"], - cost_per_million=m.get("cost_per_million") + cost_per_million=m.get("cost_per_million"), ) for m in models_list ] - return ModelsResponse( - models=model_infos, - count=len(model_infos) - ) + return ModelsResponse(models=model_infos, count=len(model_infos)) @app.post("/embed", response_model=EmbeddingResponse) async def embed_text(request: EmbeddingRequest): @@ -144,9 +143,7 @@ if FASTAPI_AVAILABLE: else: # Generate embedding embedding = generator.generate( - request.text, - model=request.model, - normalize=request.normalize + request.text, model=request.model, normalize=request.normalize ) # Store in cache @@ -154,10 +151,7 @@ if FASTAPI_AVAILABLE: cache.set(hash_key, embedding, request.model) return EmbeddingResponse( - embedding=embedding, - model=request.model, - dimensions=len(embedding), - cached=cached + embedding=embedding, model=request.model, dimensions=len(embedding), cached=cached ) except Exception as e: @@ -202,11 +196,13 @@ if FASTAPI_AVAILABLE: texts_to_generate, model=request.model, normalize=request.normalize, - batch_size=request.batch_size + batch_size=request.batch_size, ) # Fill in placeholders and cache - for idx, text, embedding in zip(text_indices, texts_to_generate, generated_embeddings, strict=False): + for idx, text, embedding in zip( + text_indices, texts_to_generate, generated_embeddings, strict=False + ): embeddings[idx] = embedding if cache: @@ -220,7 +216,7 @@ if FASTAPI_AVAILABLE: model=request.model, dimensions=dimensions, count=len(embeddings), - cached_count=cached_count + cached_count=cached_count, ) except Exception as e: @@ -244,12 +240,16 @@ if FASTAPI_AVAILABLE: skill_path = Path(request.skill_path) if not skill_path.exists(): - raise HTTPException(status_code=404, detail=f"Skill path not found: {request.skill_path}") + raise HTTPException( + status_code=404, detail=f"Skill path not found: {request.skill_path}" + ) # Read SKILL.md skill_md = skill_path / "SKILL.md" if not skill_md.exists(): - raise HTTPException(status_code=404, detail=f"SKILL.md not found in {request.skill_path}") + raise HTTPException( + status_code=404, detail=f"SKILL.md not found in {request.skill_path}" + ) skill_content = skill_md.read_text() @@ -262,10 +262,7 @@ if FASTAPI_AVAILABLE: # Generate embeddings for chunks embeddings, dimensions = generator.generate_batch( - chunks, - model=request.model, - normalize=True, - batch_size=32 + chunks, model=request.model, normalize=True, batch_size=32 ) # TODO: Store embeddings in vector database @@ -279,8 +276,8 @@ if FASTAPI_AVAILABLE: metadata={ "skill_path": str(skill_path), "chunks": len(chunks), - "content_length": len(skill_content) - } + "content_length": len(skill_content), + }, ) except HTTPException: @@ -298,7 +295,7 @@ if FASTAPI_AVAILABLE: @app.post("/cache/clear", response_model=dict) async def clear_cache( - model: str | None = Query(None, description="Model to clear (all if not specified)") + model: str | None = Query(None, description="Model to clear (all if not specified)"), ): """Clear cache entries.""" if not cache: @@ -306,11 +303,7 @@ if FASTAPI_AVAILABLE: deleted = cache.clear(model=model) - return { - "status": "ok", - "deleted": deleted, - "model": model or "all" - } + return {"status": "ok", "deleted": deleted, "model": model or "all"} @app.post("/cache/clear-expired", response_model=dict) async def clear_expired(): @@ -320,10 +313,7 @@ if FASTAPI_AVAILABLE: deleted = cache.clear_expired() - return { - "status": "ok", - "deleted": deleted - } + return {"status": "ok", "deleted": deleted} else: print("Error: FastAPI not available. Install with: pip install fastapi uvicorn") @@ -348,12 +338,7 @@ def main(): if cache_enabled: print(f"💾 Cache database: {cache_db}") - uvicorn.run( - "skill_seekers.embedding.server:app", - host=host, - port=port, - reload=reload - ) + uvicorn.run("skill_seekers.embedding.server:app", host=host, port=port, reload=reload) if __name__ == "__main__": diff --git a/src/skill_seekers/mcp/tools/config_tools.py b/src/skill_seekers/mcp/tools/config_tools.py index 7495275..67d363d 100644 --- a/src/skill_seekers/mcp/tools/config_tools.py +++ b/src/skill_seekers/mcp/tools/config_tools.py @@ -69,15 +69,17 @@ async def generate_config(args: dict) -> list[TextContent]: config = { "name": name, "description": description, - "sources": [{ - "type": "documentation", - "base_url": url, - "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"}, - "url_patterns": {"include": [], "exclude": []}, - "categories": {}, - "rate_limit": rate_limit, - "max_pages": max_pages, - }], + "sources": [ + { + "type": "documentation", + "base_url": url, + "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"}, + "url_patterns": {"include": [], "exclude": []}, + "categories": {}, + "rate_limit": rate_limit, + "max_pages": max_pages, + } + ], } # Save to configs directory diff --git a/src/skill_seekers/sync/__init__.py b/src/skill_seekers/sync/__init__.py index f237ba6..9edc69b 100644 --- a/src/skill_seekers/sync/__init__.py +++ b/src/skill_seekers/sync/__init__.py @@ -32,9 +32,9 @@ from .detector import ChangeDetector from .models import SyncConfig, ChangeReport, PageChange __all__ = [ - 'SyncMonitor', - 'ChangeDetector', - 'SyncConfig', - 'ChangeReport', - 'PageChange', + "SyncMonitor", + "ChangeDetector", + "SyncConfig", + "ChangeReport", + "PageChange", ] diff --git a/src/skill_seekers/sync/detector.py b/src/skill_seekers/sync/detector.py index 381850c..4eb91d2 100644 --- a/src/skill_seekers/sync/detector.py +++ b/src/skill_seekers/sync/detector.py @@ -55,7 +55,7 @@ class ChangeDetector: Returns: Hexadecimal hash string """ - return hashlib.sha256(content.encode('utf-8')).hexdigest() + return hashlib.sha256(content.encode("utf-8")).hexdigest() def fetch_page(self, url: str) -> tuple[str, dict[str, str]]: """ @@ -72,17 +72,15 @@ class ChangeDetector: requests.RequestException: If fetch fails """ response = requests.get( - url, - timeout=self.timeout, - headers={'User-Agent': 'SkillSeekers-Sync/1.0'} + url, timeout=self.timeout, headers={"User-Agent": "SkillSeekers-Sync/1.0"} ) response.raise_for_status() metadata = { - 'last-modified': response.headers.get('Last-Modified'), - 'etag': response.headers.get('ETag'), - 'content-type': response.headers.get('Content-Type'), - 'content-length': response.headers.get('Content-Length'), + "last-modified": response.headers.get("Last-Modified"), + "etag": response.headers.get("ETag"), + "content-type": response.headers.get("Content-Type"), + "content-length": response.headers.get("Content-Length"), } return response.text, metadata @@ -92,7 +90,7 @@ class ChangeDetector: url: str, old_hash: str | None = None, generate_diff: bool = False, - old_content: str | None = None + old_content: str | None = None, ) -> PageChange: """ Check if page has changed. @@ -132,7 +130,7 @@ class ChangeDetector: old_hash=old_hash, new_hash=new_hash, diff=diff, - detected_at=datetime.utcnow() + detected_at=datetime.utcnow(), ) except requests.RequestException: @@ -142,14 +140,11 @@ class ChangeDetector: change_type=ChangeType.DELETED, old_hash=old_hash, new_hash=None, - detected_at=datetime.utcnow() + detected_at=datetime.utcnow(), ) def check_pages( - self, - urls: list[str], - previous_hashes: dict[str, str], - generate_diffs: bool = False + self, urls: list[str], previous_hashes: dict[str, str], generate_diffs: bool = False ) -> ChangeReport: """ Check multiple pages for changes. @@ -185,13 +180,15 @@ class ChangeDetector: # Check for deleted pages (in previous state but not in current) for url, old_hash in previous_hashes.items(): if url not in checked_urls: - deleted.append(PageChange( - url=url, - change_type=ChangeType.DELETED, - old_hash=old_hash, - new_hash=None, - detected_at=datetime.utcnow() - )) + deleted.append( + PageChange( + url=url, + change_type=ChangeType.DELETED, + old_hash=old_hash, + new_hash=None, + detected_at=datetime.utcnow(), + ) + ) return ChangeReport( skill_name="unknown", # To be set by caller @@ -200,7 +197,7 @@ class ChangeDetector: modified=modified, deleted=deleted, unchanged=unchanged_count, - checked_at=datetime.utcnow() + checked_at=datetime.utcnow(), ) def generate_diff(self, old_content: str, new_content: str) -> str: @@ -217,15 +214,9 @@ class ChangeDetector: old_lines = old_content.splitlines(keepends=True) new_lines = new_content.splitlines(keepends=True) - diff = difflib.unified_diff( - old_lines, - new_lines, - fromfile='old', - tofile='new', - lineterm='' - ) + diff = difflib.unified_diff(old_lines, new_lines, fromfile="old", tofile="new", lineterm="") - return ''.join(diff) + return "".join(diff) def generate_summary_diff(self, old_content: str, new_content: str) -> str: """ @@ -244,16 +235,15 @@ class ChangeDetector: diff = difflib.unified_diff(old_lines, new_lines) diff_lines = list(diff) - added = sum(1 for line in diff_lines if line.startswith('+') and not line.startswith('+++')) - removed = sum(1 for line in diff_lines if line.startswith('-') and not line.startswith('---')) + added = sum(1 for line in diff_lines if line.startswith("+") and not line.startswith("+++")) + removed = sum( + 1 for line in diff_lines if line.startswith("-") and not line.startswith("---") + ) return f"+{added} -{removed} lines" def check_header_changes( - self, - url: str, - old_modified: str | None = None, - old_etag: str | None = None + self, url: str, old_modified: str | None = None, old_etag: str | None = None ) -> bool: """ Quick check using HTTP headers (no content download). @@ -269,14 +259,12 @@ class ChangeDetector: try: # Use HEAD request for efficiency response = requests.head( - url, - timeout=self.timeout, - headers={'User-Agent': 'SkillSeekers-Sync/1.0'} + url, timeout=self.timeout, headers={"User-Agent": "SkillSeekers-Sync/1.0"} ) response.raise_for_status() - new_modified = response.headers.get('Last-Modified') - new_etag = response.headers.get('ETag') + new_modified = response.headers.get("Last-Modified") + new_etag = response.headers.get("ETag") # Check if headers indicate change if old_modified and new_modified and old_modified != new_modified: @@ -289,9 +277,7 @@ class ChangeDetector: return True def batch_check_headers( - self, - urls: list[str], - previous_metadata: dict[str, dict[str, str]] + self, urls: list[str], previous_metadata: dict[str, dict[str, str]] ) -> list[str]: """ Batch check URLs using headers only. @@ -307,8 +293,8 @@ class ChangeDetector: for url in urls: old_meta = previous_metadata.get(url, {}) - old_modified = old_meta.get('last-modified') - old_etag = old_meta.get('etag') + old_modified = old_meta.get("last-modified") + old_etag = old_meta.get("etag") if self.check_header_changes(url, old_modified, old_etag): changed_urls.append(url) diff --git a/src/skill_seekers/sync/models.py b/src/skill_seekers/sync/models.py index bacf6b1..e67943c 100644 --- a/src/skill_seekers/sync/models.py +++ b/src/skill_seekers/sync/models.py @@ -10,6 +10,7 @@ from pydantic import BaseModel, Field class ChangeType(str, Enum): """Type of change detected.""" + ADDED = "added" MODIFIED = "modified" DELETED = "deleted" @@ -25,8 +26,7 @@ class PageChange(BaseModel): new_hash: str | None = Field(None, description="New content hash") diff: str | None = Field(None, description="Content diff (if available)") detected_at: datetime = Field( - default_factory=datetime.utcnow, - description="When change was detected" + default_factory=datetime.utcnow, description="When change was detected" ) class Config: @@ -37,7 +37,7 @@ class PageChange(BaseModel): "old_hash": "abc123", "new_hash": "def456", "diff": "@@ -10,3 +10,4 @@\n+New content here", - "detected_at": "2024-01-15T10:30:00Z" + "detected_at": "2024-01-15T10:30:00Z", } } @@ -52,8 +52,7 @@ class ChangeReport(BaseModel): deleted: list[PageChange] = Field(default_factory=list, description="Deleted pages") unchanged: int = Field(0, description="Number of unchanged pages") checked_at: datetime = Field( - default_factory=datetime.utcnow, - description="When check was performed" + default_factory=datetime.utcnow, description="When check was performed" ) @property @@ -72,34 +71,19 @@ class SyncConfig(BaseModel): skill_config: str = Field(..., description="Path to skill config file") check_interval: int = Field( - default=3600, - description="Check interval in seconds (default: 1 hour)" + default=3600, description="Check interval in seconds (default: 1 hour)" ) enabled: bool = Field(default=True, description="Whether sync is enabled") - auto_update: bool = Field( - default=False, - description="Automatically rebuild skill on changes" - ) - notify_on_change: bool = Field( - default=True, - description="Send notifications on changes" - ) + auto_update: bool = Field(default=False, description="Automatically rebuild skill on changes") + notify_on_change: bool = Field(default=True, description="Send notifications on changes") notification_channels: list[str] = Field( - default_factory=list, - description="Notification channels (email, slack, webhook)" - ) - webhook_url: str | None = Field( - None, - description="Webhook URL for change notifications" + default_factory=list, description="Notification channels (email, slack, webhook)" ) + webhook_url: str | None = Field(None, description="Webhook URL for change notifications") email_recipients: list[str] = Field( - default_factory=list, - description="Email recipients for notifications" - ) - slack_webhook: str | None = Field( - None, - description="Slack webhook URL" + default_factory=list, description="Email recipients for notifications" ) + slack_webhook: str | None = Field(None, description="Slack webhook URL") class Config: json_schema_extra = { @@ -111,7 +95,7 @@ class SyncConfig(BaseModel): "notify_on_change": True, "notification_channels": ["slack", "webhook"], "webhook_url": "https://example.com/webhook", - "slack_webhook": "https://hooks.slack.com/services/..." + "slack_webhook": "https://hooks.slack.com/services/...", } } @@ -125,8 +109,7 @@ class SyncState(BaseModel): total_checks: int = Field(default=0, description="Total checks performed") total_changes: int = Field(default=0, description="Total changes detected") page_hashes: dict[str, str] = Field( - default_factory=dict, - description="URL -> content hash mapping" + default_factory=dict, description="URL -> content hash mapping" ) status: str = Field(default="idle", description="Current status") error: str | None = Field(None, description="Last error message") @@ -137,15 +120,9 @@ class WebhookPayload(BaseModel): event: str = Field(..., description="Event type (change_detected, sync_complete)") skill_name: str = Field(..., description="Skill name") - timestamp: datetime = Field( - default_factory=datetime.utcnow, - description="Event timestamp" - ) + timestamp: datetime = Field(default_factory=datetime.utcnow, description="Event timestamp") changes: ChangeReport | None = Field(None, description="Change report") - metadata: dict[str, Any] = Field( - default_factory=dict, - description="Additional metadata" - ) + metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata") class Config: json_schema_extra = { @@ -157,8 +134,8 @@ class WebhookPayload(BaseModel): "total_pages": 150, "added": [], "modified": [{"url": "https://react.dev/learn"}], - "deleted": [] + "deleted": [], }, - "metadata": {"source": "periodic_check"} + "metadata": {"source": "periodic_check"}, } } diff --git a/src/skill_seekers/sync/monitor.py b/src/skill_seekers/sync/monitor.py index 26aea11..f2c3f15 100644 --- a/src/skill_seekers/sync/monitor.py +++ b/src/skill_seekers/sync/monitor.py @@ -51,7 +51,7 @@ class SyncMonitor: check_interval: int = 3600, auto_update: bool = False, state_file: str | None = None, - on_change: Callable[[ChangeReport], None] | None = None + on_change: Callable[[ChangeReport], None] | None = None, ): """ Initialize sync monitor. @@ -72,7 +72,7 @@ class SyncMonitor: with open(self.config_path) as f: self.skill_config = json.load(f) - self.skill_name = self.skill_config.get('name', 'unknown') + self.skill_name = self.skill_config.get("name", "unknown") # State file if state_file: @@ -97,10 +97,10 @@ class SyncMonitor: with open(self.state_file) as f: data = json.load(f) # Convert datetime strings back - if data.get('last_check'): - data['last_check'] = datetime.fromisoformat(data['last_check']) - if data.get('last_change'): - data['last_change'] = datetime.fromisoformat(data['last_change']) + if data.get("last_check"): + data["last_check"] = datetime.fromisoformat(data["last_check"]) + if data.get("last_change"): + data["last_change"] = datetime.fromisoformat(data["last_change"]) return SyncState(**data) else: return SyncState(skill_name=self.skill_name) @@ -109,12 +109,12 @@ class SyncMonitor: """Save current state to file.""" # Convert datetime to ISO format data = self.state.dict() - if data.get('last_check'): - data['last_check'] = data['last_check'].isoformat() - if data.get('last_change'): - data['last_change'] = data['last_change'].isoformat() + if data.get("last_check"): + data["last_check"] = data["last_check"].isoformat() + if data.get("last_change"): + data["last_change"] = data["last_change"].isoformat() - with open(self.state_file, 'w') as f: + with open(self.state_file, "w") as f: json.dump(data, f, indent=2) def check_now(self, generate_diffs: bool = False) -> ChangeReport: @@ -132,7 +132,7 @@ class SyncMonitor: try: # Get URLs to check from config - base_url = self.skill_config.get('base_url') + base_url = self.skill_config.get("base_url") # TODO: In real implementation, get actual URLs from scraper # For now, simulate with base URL only @@ -140,9 +140,7 @@ class SyncMonitor: # Check for changes report = self.detector.check_pages( - urls=urls, - previous_hashes=self.state.page_hashes, - generate_diffs=generate_diffs + urls=urls, previous_hashes=self.state.page_hashes, generate_diffs=generate_diffs ) report.skill_name = self.skill_name @@ -192,7 +190,7 @@ class SyncMonitor: event="change_detected", skill_name=self.skill_name, changes=report, - metadata={"auto_update": self.auto_update} + metadata={"auto_update": self.auto_update}, ) self.notifier.send(payload) @@ -214,9 +212,7 @@ class SyncMonitor: self._running = True # Schedule checks - schedule.every(self.check_interval).seconds.do( - lambda: self.check_now() - ) + schedule.every(self.check_interval).seconds.do(lambda: self.check_now()) # Run in thread def run_schedule(): diff --git a/src/skill_seekers/sync/notifier.py b/src/skill_seekers/sync/notifier.py index c581a8c..7157267 100644 --- a/src/skill_seekers/sync/notifier.py +++ b/src/skill_seekers/sync/notifier.py @@ -34,7 +34,7 @@ class Notifier: webhook_url: str | None = None, slack_webhook: str | None = None, email_recipients: list[str] | None = None, - console: bool = True + console: bool = True, ): """ Initialize notifier. @@ -45,8 +45,8 @@ class Notifier: email_recipients: List of email recipients console: Whether to print to console """ - self.webhook_url = webhook_url or os.getenv('SYNC_WEBHOOK_URL') - self.slack_webhook = slack_webhook or os.getenv('SLACK_WEBHOOK_URL') + self.webhook_url = webhook_url or os.getenv("SYNC_WEBHOOK_URL") + self.slack_webhook = slack_webhook or os.getenv("SLACK_WEBHOOK_URL") self.email_recipients = email_recipients or [] self.console = console @@ -92,8 +92,8 @@ class Notifier: response = requests.post( self.webhook_url, json=payload.dict(), - headers={'Content-Type': 'application/json'}, - timeout=10 + headers={"Content-Type": "application/json"}, + timeout=10, ) response.raise_for_status() print(f"✅ Webhook notification sent to {self.webhook_url}") @@ -124,14 +124,10 @@ class Notifier: slack_payload = { "text": text, "username": "Skill Seekers Sync", - "icon_emoji": ":books:" + "icon_emoji": ":books:", } - response = requests.post( - self.slack_webhook, - json=slack_payload, - timeout=10 - ) + response = requests.post(self.slack_webhook, json=slack_payload, timeout=10) response.raise_for_status() print("✅ Slack notification sent") except Exception as e: diff --git a/tests/test_adaptor_benchmarks.py b/tests/test_adaptor_benchmarks.py index bd3f362..fe07fee 100644 --- a/tests/test_adaptor_benchmarks.py +++ b/tests/test_adaptor_benchmarks.py @@ -85,9 +85,17 @@ class TestAdaptorBenchmarks(unittest.TestCase): # Platforms to benchmark platforms = [ - "claude", "gemini", "openai", "markdown", # IDE integrations - "langchain", "llama-index", "haystack", # RAG frameworks - "weaviate", "chroma", "faiss", "qdrant" # Vector DBs + "claude", + "gemini", + "openai", + "markdown", # IDE integrations + "langchain", + "llama-index", + "haystack", # RAG frameworks + "weaviate", + "chroma", + "faiss", + "qdrant", # Vector DBs ] results = {} @@ -115,20 +123,19 @@ class TestAdaptorBenchmarks(unittest.TestCase): min_time = min(times) max_time = max(times) - results[platform] = { - "avg": avg_time, - "min": min_time, - "max": max_time - } + results[platform] = {"avg": avg_time, "min": min_time, "max": max_time} - print(f"{platform:15} - Avg: {avg_time*1000:6.2f}ms | " - f"Min: {min_time*1000:6.2f}ms | Max: {max_time*1000:6.2f}ms") + print( + f"{platform:15} - Avg: {avg_time * 1000:6.2f}ms | " + f"Min: {min_time * 1000:6.2f}ms | Max: {max_time * 1000:6.2f}ms" + ) # Performance assertions (should complete in reasonable time) for platform, metrics in results.items(): self.assertLess( - metrics["avg"], 0.5, # Should average < 500ms - f"{platform} format_skill_md too slow: {metrics['avg']*1000:.2f}ms" + metrics["avg"], + 0.5, # Should average < 500ms + f"{platform} format_skill_md too slow: {metrics['avg'] * 1000:.2f}ms", ) def test_benchmark_package_operations(self): @@ -158,12 +165,9 @@ class TestAdaptorBenchmarks(unittest.TestCase): # Get file size file_size_kb = package_path.stat().st_size / 1024 - results[platform] = { - "time": elapsed, - "size_kb": file_size_kb - } + results[platform] = {"time": elapsed, "size_kb": file_size_kb} - print(f"{platform:15} - Time: {elapsed*1000:7.2f}ms | Size: {file_size_kb:7.1f} KB") + print(f"{platform:15} - Time: {elapsed * 1000:7.2f}ms | Size: {file_size_kb:7.1f} KB") # Validate output self.assertTrue(package_path.exists()) @@ -171,12 +175,14 @@ class TestAdaptorBenchmarks(unittest.TestCase): # Performance assertions for platform, metrics in results.items(): self.assertLess( - metrics["time"], 1.0, # Should complete < 1 second - f"{platform} packaging too slow: {metrics['time']*1000:.2f}ms" + metrics["time"], + 1.0, # Should complete < 1 second + f"{platform} packaging too slow: {metrics['time'] * 1000:.2f}ms", ) self.assertLess( - metrics["size_kb"], 1000, # Should be < 1MB for 10 refs - f"{platform} package too large: {metrics['size_kb']:.1f}KB" + metrics["size_kb"], + 1000, # Should be < 1MB for 10 refs + f"{platform} package too large: {metrics['size_kb']:.1f}KB", ) def test_benchmark_scaling_with_reference_count(self): @@ -210,14 +216,18 @@ class TestAdaptorBenchmarks(unittest.TestCase): json.loads(formatted) size_kb = len(formatted) / 1024 - results.append({ - "count": ref_count, - "time": elapsed, - "time_per_ref": time_per_ref, - "size_kb": size_kb - }) + results.append( + { + "count": ref_count, + "time": elapsed, + "time_per_ref": time_per_ref, + "size_kb": size_kb, + } + ) - print(f"{ref_count:4} | {elapsed*1000:10.2f} | {time_per_ref*1000:10.3f} | {size_kb:10.1f}") + print( + f"{ref_count:4} | {elapsed * 1000:10.2f} | {time_per_ref * 1000:10.3f} | {size_kb:10.1f}" + ) # Analyze scaling behavior # Time per ref should not increase significantly (linear scaling) @@ -230,10 +240,7 @@ class TestAdaptorBenchmarks(unittest.TestCase): print(f"(Time per ref at 50 refs / Time per ref at 1 ref)") # Assert linear or sub-linear scaling (not exponential) - self.assertLess( - scaling_factor, 3.0, - f"Non-linear scaling detected: {scaling_factor:.2f}x" - ) + self.assertLess(scaling_factor, 3.0, f"Non-linear scaling detected: {scaling_factor:.2f}x") def test_benchmark_json_vs_zip_size_comparison(self): """Compare output sizes: JSON vs ZIP/tar.gz""" @@ -263,16 +270,15 @@ class TestAdaptorBenchmarks(unittest.TestCase): size_kb = package_path.stat().st_size / 1024 - results[platform] = { - "format": format_name, - "size_kb": size_kb - } + results[platform] = {"format": format_name, "size_kb": size_kb} print(f"{platform:15} | {format_name:8} | {size_kb:10.1f}") # Analyze results json_sizes = [v["size_kb"] for k, v in results.items() if v["format"] == "JSON"] - compressed_sizes = [v["size_kb"] for k, v in results.items() if v["format"] in ["ZIP", "tar.gz"]] + compressed_sizes = [ + v["size_kb"] for k, v in results.items() if v["format"] in ["ZIP", "tar.gz"] + ] if json_sizes and compressed_sizes: avg_json = sum(json_sizes) / len(json_sizes) @@ -280,7 +286,7 @@ class TestAdaptorBenchmarks(unittest.TestCase): print(f"\nAverage JSON size: {avg_json:.1f} KB") print(f"Average compressed size: {avg_compressed:.1f} KB") - print(f"Compression ratio: {avg_json/avg_compressed:.2f}x") + print(f"Compression ratio: {avg_json / avg_compressed:.2f}x") def test_benchmark_metadata_overhead(self): """Measure metadata processing overhead""" @@ -299,7 +305,7 @@ class TestAdaptorBenchmarks(unittest.TestCase): description="A comprehensive test skill for benchmarking purposes", version="2.5.0", author="Benchmark Suite", - tags=["test", "benchmark", "performance", "validation", "quality"] + tags=["test", "benchmark", "performance", "validation", "quality"], ) adaptor = get_adaptor("langchain") @@ -326,15 +332,12 @@ class TestAdaptorBenchmarks(unittest.TestCase): overhead = avg_rich - avg_minimal overhead_pct = (overhead / avg_minimal) * 100 - print(f"\nMinimal metadata: {avg_minimal*1000:.2f}ms") - print(f"Rich metadata: {avg_rich*1000:.2f}ms") - print(f"Overhead: {overhead*1000:.2f}ms ({overhead_pct:.1f}%)") + print(f"\nMinimal metadata: {avg_minimal * 1000:.2f}ms") + print(f"Rich metadata: {avg_rich * 1000:.2f}ms") + print(f"Overhead: {overhead * 1000:.2f}ms ({overhead_pct:.1f}%)") # Overhead should be negligible (< 10%) - self.assertLess( - overhead_pct, 10.0, - f"Metadata overhead too high: {overhead_pct:.1f}%" - ) + self.assertLess(overhead_pct, 10.0, f"Metadata overhead too high: {overhead_pct:.1f}%") def test_benchmark_empty_vs_full_skill(self): """Compare performance: empty skill vs full skill""" @@ -360,9 +363,9 @@ class TestAdaptorBenchmarks(unittest.TestCase): adaptor.format_skill_md(full_dir, metadata) full_time = time.perf_counter() - start - print(f"\nEmpty skill: {empty_time*1000:.2f}ms") - print(f"Full skill (50 refs): {full_time*1000:.2f}ms") - print(f"Ratio: {full_time/empty_time:.1f}x") + print(f"\nEmpty skill: {empty_time * 1000:.2f}ms") + print(f"Full skill (50 refs): {full_time * 1000:.2f}ms") + print(f"Ratio: {full_time / empty_time:.1f}x") # Empty should be very fast self.assertLess(empty_time, 0.01, "Empty skill processing too slow") diff --git a/tests/test_adaptors/test_adaptors_e2e.py b/tests/test_adaptors/test_adaptors_e2e.py index 619a604..62f441d 100644 --- a/tests/test_adaptors/test_adaptors_e2e.py +++ b/tests/test_adaptors/test_adaptors_e2e.py @@ -662,8 +662,13 @@ export default { def test_e2e_all_rag_adaptors_from_same_skill(self): """Test all 7 RAG adaptors can package the same skill""" rag_platforms = [ - "langchain", "llama-index", "haystack", - "weaviate", "chroma", "faiss", "qdrant" + "langchain", + "llama-index", + "haystack", + "weaviate", + "chroma", + "faiss", + "qdrant", ] packages = {} @@ -674,15 +679,11 @@ export default { package_path = adaptor.package(self.skill_dir, self.output_dir) # Verify package was created - self.assertTrue( - package_path.exists(), - f"Package not created for {platform}" - ) + self.assertTrue(package_path.exists(), f"Package not created for {platform}") # Verify it's a JSON file self.assertTrue( - str(package_path).endswith(".json"), - f"{platform} should produce JSON file" + str(package_path).endswith(".json"), f"{platform} should produce JSON file" ) # Store for later verification @@ -696,10 +697,7 @@ export default { with open(path) as f: data = json.load(f) # Should be valid JSON (dict or list) - self.assertIsInstance( - data, (dict, list), - f"{platform} should produce valid JSON" - ) + self.assertIsInstance(data, (dict, list), f"{platform} should produce valid JSON") def test_e2e_rag_adaptors_preserve_metadata(self): """Test that metadata is preserved across RAG adaptors""" @@ -708,7 +706,7 @@ export default { description="Vue.js framework skill", version="2.0.0", author="Test Author", - tags=["vue", "javascript", "frontend"] + tags=["vue", "javascript", "frontend"], ) # Test subset of platforms (representative sample) @@ -758,33 +756,30 @@ export default { # Define expected structure for each platform validations = { "langchain": lambda d: ( - isinstance(d, list) and - all("page_content" in item and "metadata" in item for item in d) + isinstance(d, list) + and all("page_content" in item and "metadata" in item for item in d) ), "llama-index": lambda d: ( - isinstance(d, list) and - all("text" in item and "metadata" in item for item in d) + isinstance(d, list) and all("text" in item and "metadata" in item for item in d) ), "haystack": lambda d: ( - isinstance(d, list) and - all("content" in item and "meta" in item for item in d) + isinstance(d, list) and all("content" in item and "meta" in item for item in d) ), "weaviate": lambda d: ( - isinstance(d, dict) and - "schema" in d and "objects" in d and "class_name" in d + isinstance(d, dict) and "schema" in d and "objects" in d and "class_name" in d ), "chroma": lambda d: ( - isinstance(d, dict) and - "documents" in d and "metadatas" in d and "ids" in d and - "collection_name" in d + isinstance(d, dict) + and "documents" in d + and "metadatas" in d + and "ids" in d + and "collection_name" in d ), "faiss": lambda d: ( - isinstance(d, dict) and - "documents" in d and "metadatas" in d and "ids" in d + isinstance(d, dict) and "documents" in d and "metadatas" in d and "ids" in d ), "qdrant": lambda d: ( - isinstance(d, dict) and - "collection_name" in d and "points" in d and "config" in d + isinstance(d, dict) and "collection_name" in d and "points" in d and "config" in d ), } @@ -795,8 +790,7 @@ export default { # Validate structure self.assertTrue( - validate_func(data), - f"{platform} validation failed: incorrect JSON structure" + validate_func(data), f"{platform} validation failed: incorrect JSON structure" ) def test_e2e_rag_empty_skill_handling(self): @@ -838,9 +832,7 @@ export default { if platform == "langchain": categories = {item["metadata"]["category"] for item in data} elif platform == "weaviate": - categories = { - obj["properties"]["category"] for obj in data["objects"] - } + categories = {obj["properties"]["category"] for obj in data["objects"]} elif platform == "chroma": categories = {meta["category"] for meta in data["metadatas"]} @@ -854,8 +846,7 @@ export default { # Check that at least one reference category exists ref_categories = categories - {"overview"} self.assertGreater( - len(ref_categories), 0, - f"{platform}: Should have at least one reference category" + len(ref_categories), 0, f"{platform}: Should have at least one reference category" ) def test_e2e_rag_integration_workflow_chromadb(self): @@ -878,17 +869,10 @@ export default { # Create collection and add documents collection = client.create_collection(data["collection_name"]) - collection.add( - documents=data["documents"], - metadatas=data["metadatas"], - ids=data["ids"] - ) + collection.add(documents=data["documents"], metadatas=data["metadatas"], ids=data["ids"]) # Query - results = collection.query( - query_texts=["reactivity"], - n_results=2 - ) + results = collection.query(query_texts=["reactivity"], n_results=2) # Verify results self.assertGreater(len(results["documents"][0]), 0, "Should return results") diff --git a/tests/test_adaptors/test_chroma_adaptor.py b/tests/test_adaptors/test_chroma_adaptor.py index e36e61e..8d07a78 100644 --- a/tests/test_adaptors/test_chroma_adaptor.py +++ b/tests/test_adaptors/test_chroma_adaptor.py @@ -28,9 +28,7 @@ class TestChromaAdaptor: # Create SKILL.md skill_md = skill_dir / "SKILL.md" - skill_md.write_text( - "# Test Skill\n\nThis is a test skill for Chroma format." - ) + skill_md.write_text("# Test Skill\n\nThis is a test skill for Chroma format.") # Create references directory with files refs_dir = skill_dir / "references" @@ -40,9 +38,7 @@ class TestChromaAdaptor: # Format as Chroma collection adaptor = get_adaptor("chroma") - metadata = SkillMetadata( - name="test_skill", description="Test skill", version="1.0.0" - ) + metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0") collection_json = adaptor.format_skill_md(skill_dir, metadata) @@ -124,7 +120,10 @@ class TestChromaAdaptor: # Upload may fail if chromadb not installed (expected) assert "message" in result # Either chromadb not installed or connection error - assert ("chromadb not installed" in result["message"] or "Failed to connect" in result["message"]) + assert ( + "chromadb not installed" in result["message"] + or "Failed to connect" in result["message"] + ) def test_validate_api_key_returns_false(self): """Test that API key validation returns False (no API needed).""" @@ -157,9 +156,7 @@ class TestChromaAdaptor: skill_dir.mkdir() adaptor = get_adaptor("chroma") - metadata = SkillMetadata( - name="empty_skill", description="Empty", version="1.0.0" - ) + metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0") collection_json = adaptor.format_skill_md(skill_dir, metadata) collection = json.loads(collection_json) @@ -179,9 +176,7 @@ class TestChromaAdaptor: (refs_dir / "test.md").write_text("# Test\n\nTest content.") adaptor = get_adaptor("chroma") - metadata = SkillMetadata( - name="refs_only", description="Refs only", version="1.0.0" - ) + metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0") collection_json = adaptor.format_skill_md(skill_dir, metadata) collection = json.loads(collection_json) diff --git a/tests/test_adaptors/test_faiss_adaptor.py b/tests/test_adaptors/test_faiss_adaptor.py index d0993a3..e70b8de 100644 --- a/tests/test_adaptors/test_faiss_adaptor.py +++ b/tests/test_adaptors/test_faiss_adaptor.py @@ -28,9 +28,7 @@ class TestFAISSAdaptor: # Create SKILL.md skill_md = skill_dir / "SKILL.md" - skill_md.write_text( - "# Test Skill\n\nThis is a test skill for FAISS format." - ) + skill_md.write_text("# Test Skill\n\nThis is a test skill for FAISS format.") # Create references directory with files refs_dir = skill_dir / "references" @@ -40,9 +38,7 @@ class TestFAISSAdaptor: # Format as FAISS index data adaptor = get_adaptor("faiss") - metadata = SkillMetadata( - name="test_skill", description="Test skill", version="1.0.0" - ) + metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0") index_json = adaptor.format_skill_md(skill_dir, metadata) @@ -158,9 +154,7 @@ class TestFAISSAdaptor: skill_dir.mkdir() adaptor = get_adaptor("faiss") - metadata = SkillMetadata( - name="empty_skill", description="Empty", version="1.0.0" - ) + metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0") index_json = adaptor.format_skill_md(skill_dir, metadata) index_data = json.loads(index_json) @@ -180,9 +174,7 @@ class TestFAISSAdaptor: (refs_dir / "test.md").write_text("# Test\n\nTest content.") adaptor = get_adaptor("faiss") - metadata = SkillMetadata( - name="refs_only", description="Refs only", version="1.0.0" - ) + metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0") index_json = adaptor.format_skill_md(skill_dir, metadata) index_data = json.loads(index_json) diff --git a/tests/test_adaptors/test_haystack_adaptor.py b/tests/test_adaptors/test_haystack_adaptor.py index e0278d6..6bcbd87 100644 --- a/tests/test_adaptors/test_haystack_adaptor.py +++ b/tests/test_adaptors/test_haystack_adaptor.py @@ -28,9 +28,7 @@ class TestHaystackAdaptor: # Create SKILL.md skill_md = skill_dir / "SKILL.md" - skill_md.write_text( - "# Test Skill\n\nThis is a test skill for Haystack format." - ) + skill_md.write_text("# Test Skill\n\nThis is a test skill for Haystack format.") # Create references directory with files refs_dir = skill_dir / "references" @@ -40,9 +38,7 @@ class TestHaystackAdaptor: # Format as Haystack Documents adaptor = get_adaptor("haystack") - metadata = SkillMetadata( - name="test_skill", description="Test skill", version="1.0.0" - ) + metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0") documents_json = adaptor.format_skill_md(skill_dir, metadata) @@ -112,7 +108,7 @@ class TestHaystackAdaptor: """Test upload returns instructions (no actual upload).""" # Create test package package_path = tmp_path / "test-haystack.json" - package_path.write_text('[]') + package_path.write_text("[]") adaptor = get_adaptor("haystack") result = adaptor.upload(package_path, "fake-key") @@ -154,9 +150,7 @@ class TestHaystackAdaptor: skill_dir.mkdir() adaptor = get_adaptor("haystack") - metadata = SkillMetadata( - name="empty_skill", description="Empty", version="1.0.0" - ) + metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0") documents_json = adaptor.format_skill_md(skill_dir, metadata) documents = json.loads(documents_json) @@ -174,9 +168,7 @@ class TestHaystackAdaptor: (refs_dir / "test.md").write_text("# Test\n\nTest content.") adaptor = get_adaptor("haystack") - metadata = SkillMetadata( - name="refs_only", description="Refs only", version="1.0.0" - ) + metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0") documents_json = adaptor.format_skill_md(skill_dir, metadata) documents = json.loads(documents_json) diff --git a/tests/test_adaptors/test_langchain_adaptor.py b/tests/test_adaptors/test_langchain_adaptor.py index 9919bc3..76dd5b3 100644 --- a/tests/test_adaptors/test_langchain_adaptor.py +++ b/tests/test_adaptors/test_langchain_adaptor.py @@ -28,9 +28,7 @@ class TestLangChainAdaptor: # Create SKILL.md skill_md = skill_dir / "SKILL.md" - skill_md.write_text( - "# Test Skill\n\nThis is a test skill for LangChain format." - ) + skill_md.write_text("# Test Skill\n\nThis is a test skill for LangChain format.") # Create references directory with files refs_dir = skill_dir / "references" @@ -40,9 +38,7 @@ class TestLangChainAdaptor: # Format as LangChain Documents adaptor = get_adaptor("langchain") - metadata = SkillMetadata( - name="test_skill", description="Test skill", version="1.0.0" - ) + metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0") documents_json = adaptor.format_skill_md(skill_dir, metadata) @@ -112,7 +108,7 @@ class TestLangChainAdaptor: """Test upload returns instructions (no actual upload).""" # Create test package package_path = tmp_path / "test-langchain.json" - package_path.write_text('[]') + package_path.write_text("[]") adaptor = get_adaptor("langchain") result = adaptor.upload(package_path, "fake-key") @@ -153,9 +149,7 @@ class TestLangChainAdaptor: skill_dir.mkdir() adaptor = get_adaptor("langchain") - metadata = SkillMetadata( - name="empty_skill", description="Empty", version="1.0.0" - ) + metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0") documents_json = adaptor.format_skill_md(skill_dir, metadata) documents = json.loads(documents_json) @@ -173,9 +167,7 @@ class TestLangChainAdaptor: (refs_dir / "test.md").write_text("# Test\n\nTest content.") adaptor = get_adaptor("langchain") - metadata = SkillMetadata( - name="refs_only", description="Refs only", version="1.0.0" - ) + metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0") documents_json = adaptor.format_skill_md(skill_dir, metadata) documents = json.loads(documents_json) diff --git a/tests/test_adaptors/test_llama_index_adaptor.py b/tests/test_adaptors/test_llama_index_adaptor.py index ab01f21..80357a8 100644 --- a/tests/test_adaptors/test_llama_index_adaptor.py +++ b/tests/test_adaptors/test_llama_index_adaptor.py @@ -28,9 +28,7 @@ class TestLlamaIndexAdaptor: # Create SKILL.md skill_md = skill_dir / "SKILL.md" - skill_md.write_text( - "# Test Skill\n\nThis is a test skill for LlamaIndex format." - ) + skill_md.write_text("# Test Skill\n\nThis is a test skill for LlamaIndex format.") # Create references directory with files refs_dir = skill_dir / "references" @@ -40,9 +38,7 @@ class TestLlamaIndexAdaptor: # Format as LlamaIndex Documents adaptor = get_adaptor("llama-index") - metadata = SkillMetadata( - name="test_skill", description="Test skill", version="1.0.0" - ) + metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0") documents_json = adaptor.format_skill_md(skill_dir, metadata) @@ -112,7 +108,7 @@ class TestLlamaIndexAdaptor: """Test upload returns instructions (no actual upload).""" # Create test package package_path = tmp_path / "test-llama-index.json" - package_path.write_text('[]') + package_path.write_text("[]") adaptor = get_adaptor("llama-index") result = adaptor.upload(package_path, "fake-key") @@ -153,9 +149,7 @@ class TestLlamaIndexAdaptor: skill_dir.mkdir() adaptor = get_adaptor("llama-index") - metadata = SkillMetadata( - name="empty_skill", description="Empty", version="1.0.0" - ) + metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0") documents_json = adaptor.format_skill_md(skill_dir, metadata) documents = json.loads(documents_json) @@ -173,9 +167,7 @@ class TestLlamaIndexAdaptor: (refs_dir / "test.md").write_text("# Test\n\nTest content.") adaptor = get_adaptor("llama-index") - metadata = SkillMetadata( - name="refs_only", description="Refs only", version="1.0.0" - ) + metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0") documents_json = adaptor.format_skill_md(skill_dir, metadata) documents = json.loads(documents_json) diff --git a/tests/test_adaptors/test_qdrant_adaptor.py b/tests/test_adaptors/test_qdrant_adaptor.py index 019f926..672bdf3 100644 --- a/tests/test_adaptors/test_qdrant_adaptor.py +++ b/tests/test_adaptors/test_qdrant_adaptor.py @@ -28,9 +28,7 @@ class TestQdrantAdaptor: # Create SKILL.md skill_md = skill_dir / "SKILL.md" - skill_md.write_text( - "# Test Skill\n\nThis is a test skill for Qdrant format." - ) + skill_md.write_text("# Test Skill\n\nThis is a test skill for Qdrant format.") # Create references directory with files refs_dir = skill_dir / "references" @@ -40,9 +38,7 @@ class TestQdrantAdaptor: # Format as Qdrant points adaptor = get_adaptor("qdrant") - metadata = SkillMetadata( - name="test_skill", description="Test skill", version="1.0.0" - ) + metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0") points_json = adaptor.format_skill_md(skill_dir, metadata) @@ -119,7 +115,7 @@ class TestQdrantAdaptor: """Test upload returns instructions (no actual upload).""" # Create test package package_path = tmp_path / "test-qdrant.json" - package_path.write_text('[]') + package_path.write_text("[]") adaptor = get_adaptor("qdrant") result = adaptor.upload(package_path, "fake-key") @@ -160,9 +156,7 @@ class TestQdrantAdaptor: skill_dir.mkdir() adaptor = get_adaptor("qdrant") - metadata = SkillMetadata( - name="empty_skill", description="Empty", version="1.0.0" - ) + metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0") points_json = adaptor.format_skill_md(skill_dir, metadata) result = json.loads(points_json) @@ -181,9 +175,7 @@ class TestQdrantAdaptor: (refs_dir / "test.md").write_text("# Test\n\nTest content.") adaptor = get_adaptor("qdrant") - metadata = SkillMetadata( - name="refs_only", description="Refs only", version="1.0.0" - ) + metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0") points_json = adaptor.format_skill_md(skill_dir, metadata) result = json.loads(points_json) diff --git a/tests/test_adaptors/test_weaviate_adaptor.py b/tests/test_adaptors/test_weaviate_adaptor.py index 6a50e11..66d485d 100644 --- a/tests/test_adaptors/test_weaviate_adaptor.py +++ b/tests/test_adaptors/test_weaviate_adaptor.py @@ -28,9 +28,7 @@ class TestWeaviateAdaptor: # Create SKILL.md skill_md = skill_dir / "SKILL.md" - skill_md.write_text( - "# Test Skill\n\nThis is a test skill for Weaviate format." - ) + skill_md.write_text("# Test Skill\n\nThis is a test skill for Weaviate format.") # Create references directory with files refs_dir = skill_dir / "references" @@ -40,9 +38,7 @@ class TestWeaviateAdaptor: # Format as Weaviate objects adaptor = get_adaptor("weaviate") - metadata = SkillMetadata( - name="test_skill", description="Test skill", version="1.0.0" - ) + metadata = SkillMetadata(name="test_skill", description="Test skill", version="1.0.0") objects_json = adaptor.format_skill_md(skill_dir, metadata) @@ -119,7 +115,7 @@ class TestWeaviateAdaptor: """Test upload returns instructions (no actual upload).""" # Create test package package_path = tmp_path / "test-weaviate.json" - package_path.write_text('[]') + package_path.write_text("[]") adaptor = get_adaptor("weaviate") result = adaptor.upload(package_path, "fake-key") @@ -127,7 +123,11 @@ class TestWeaviateAdaptor: # Upload may fail if weaviate not installed (expected) assert "message" in result # Either weaviate not installed, invalid JSON, or connection error - assert ("import weaviate" in result["message"] or "Failed to connect" in result["message"] or result["success"] is False) + assert ( + "import weaviate" in result["message"] + or "Failed to connect" in result["message"] + or result["success"] is False + ) def test_validate_api_key_returns_false(self): """Test that API key validation returns False (no API needed).""" @@ -160,9 +160,7 @@ class TestWeaviateAdaptor: skill_dir.mkdir() adaptor = get_adaptor("weaviate") - metadata = SkillMetadata( - name="empty_skill", description="Empty", version="1.0.0" - ) + metadata = SkillMetadata(name="empty_skill", description="Empty", version="1.0.0") objects_json = adaptor.format_skill_md(skill_dir, metadata) result = json.loads(objects_json) @@ -181,9 +179,7 @@ class TestWeaviateAdaptor: (refs_dir / "test.md").write_text("# Test\n\nTest content.") adaptor = get_adaptor("weaviate") - metadata = SkillMetadata( - name="refs_only", description="Refs only", version="1.0.0" - ) + metadata = SkillMetadata(name="refs_only", description="Refs only", version="1.0.0") objects_json = adaptor.format_skill_md(skill_dir, metadata) result = json.loads(objects_json) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index b0f5a8c..9163584 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -12,7 +12,7 @@ from skill_seekers.benchmark import ( BenchmarkResult, BenchmarkRunner, BenchmarkReport, - Metric + Metric, ) from skill_seekers.benchmark.models import TimingResult, MemoryUsage @@ -37,12 +37,7 @@ class TestBenchmarkResult: """Test adding timing result.""" result = BenchmarkResult("test") - timing = TimingResult( - operation="test_op", - duration=1.5, - iterations=1, - avg_duration=1.5 - ) + timing = TimingResult(operation="test_op", duration=1.5, iterations=1, avg_duration=1.5) result.add_timing(timing) @@ -55,11 +50,7 @@ class TestBenchmarkResult: result = BenchmarkResult("test") usage = MemoryUsage( - operation="test_op", - before_mb=100.0, - after_mb=150.0, - peak_mb=160.0, - allocated_mb=50.0 + operation="test_op", before_mb=100.0, after_mb=150.0, peak_mb=160.0, allocated_mb=50.0 ) result.add_memory(usage) @@ -72,11 +63,7 @@ class TestBenchmarkResult: """Test adding custom metric.""" result = BenchmarkResult("test") - metric = Metric( - name="pages_per_sec", - value=12.5, - unit="pages/sec" - ) + metric = Metric(name="pages_per_sec", value=12.5, unit="pages/sec") result.add_metric(metric) @@ -107,12 +94,7 @@ class TestBenchmarkResult: """Test report generation.""" result = BenchmarkResult("test") - timing = TimingResult( - operation="test_op", - duration=1.0, - iterations=1, - avg_duration=1.0 - ) + timing = TimingResult(operation="test_op", duration=1.0, iterations=1, avg_duration=1.0) result.add_timing(timing) report = result.to_report() @@ -303,7 +285,7 @@ class TestBenchmark: before_mb=100.0, after_mb=1200.0, peak_mb=1500.0, - allocated_mb=1100.0 + allocated_mb=1100.0, ) benchmark.result.add_memory(usage) @@ -370,10 +352,7 @@ class TestBenchmarkRunner: with bench.timer("op2"): time.sleep(0.03) - reports = runner.run_suite({ - "test1": bench1, - "test2": bench2 - }) + reports = runner.run_suite({"test1": bench1, "test2": bench2}) assert len(reports) == 2 assert "test1" in reports @@ -405,6 +384,7 @@ class TestBenchmarkRunner: # Compare from skill_seekers.benchmark.models import ComparisonReport + comparison = runner.compare(baseline_path, improved_path) assert isinstance(comparison, ComparisonReport) @@ -458,6 +438,7 @@ class TestBenchmarkRunner: def test_cleanup_old(self, tmp_path): """Test cleaning up old benchmarks.""" import os + runner = BenchmarkRunner(output_dir=tmp_path) # Create 10 benchmark files with different timestamps @@ -476,10 +457,10 @@ class TestBenchmarkRunner: "memory": [], "metrics": [], "system_info": {}, - "recommendations": [] + "recommendations": [], } - with open(file_path, 'w') as f: + with open(file_path, "w") as f: json.dump(report_data, f) # Set different modification times @@ -505,12 +486,7 @@ class TestBenchmarkModels: def test_timing_result_model(self): """Test TimingResult model.""" - timing = TimingResult( - operation="test", - duration=1.5, - iterations=10, - avg_duration=0.15 - ) + timing = TimingResult(operation="test", duration=1.5, iterations=10, avg_duration=0.15) assert timing.operation == "test" assert timing.duration == 1.5 @@ -520,11 +496,7 @@ class TestBenchmarkModels: def test_memory_usage_model(self): """Test MemoryUsage model.""" usage = MemoryUsage( - operation="allocate", - before_mb=100.0, - after_mb=200.0, - peak_mb=250.0, - allocated_mb=100.0 + operation="allocate", before_mb=100.0, after_mb=200.0, peak_mb=250.0, allocated_mb=100.0 ) assert usage.operation == "allocate" @@ -533,11 +505,7 @@ class TestBenchmarkModels: def test_metric_model(self): """Test Metric model.""" - metric = Metric( - name="throughput", - value=125.5, - unit="ops/sec" - ) + metric = Metric(name="throughput", value=125.5, unit="ops/sec") assert metric.name == "throughput" assert metric.value == 125.5 @@ -551,26 +519,19 @@ class TestBenchmarkModels: started_at=datetime.utcnow(), finished_at=datetime.utcnow(), total_duration=5.0, - timings=[ - TimingResult( - operation="op1", - duration=2.0, - iterations=1, - avg_duration=2.0 - ) - ], + timings=[TimingResult(operation="op1", duration=2.0, iterations=1, avg_duration=2.0)], memory=[ MemoryUsage( operation="op1", before_mb=100.0, after_mb=200.0, peak_mb=250.0, - allocated_mb=100.0 + allocated_mb=100.0, ) ], metrics=[], system_info={}, - recommendations=[] + recommendations=[], ) summary = report.summary @@ -592,7 +553,7 @@ class TestBenchmarkModels: memory=[], metrics=[], system_info={}, - recommendations=[] + recommendations=[], ) current = BenchmarkReport( @@ -604,7 +565,7 @@ class TestBenchmarkModels: memory=[], metrics=[], system_info={}, - recommendations=[] + recommendations=[], ) comparison = ComparisonReport( @@ -614,7 +575,7 @@ class TestBenchmarkModels: improvements=[], regressions=["Slower performance"], speedup_factor=0.5, - memory_change_mb=0.0 + memory_change_mb=0.0, ) assert comparison.has_regressions is True @@ -632,7 +593,7 @@ class TestBenchmarkModels: memory=[], metrics=[], system_info={}, - recommendations=[] + recommendations=[], ) current = BenchmarkReport( @@ -644,7 +605,7 @@ class TestBenchmarkModels: memory=[], metrics=[], system_info={}, - recommendations=[] + recommendations=[], ) comparison = ComparisonReport( @@ -654,7 +615,7 @@ class TestBenchmarkModels: improvements=[], regressions=[], speedup_factor=2.0, - memory_change_mb=0.0 + memory_change_mb=0.0, ) improvement = comparison.overall_improvement diff --git a/tests/test_chunking_integration.py b/tests/test_chunking_integration.py index 7bdd029..9a19128 100644 --- a/tests/test_chunking_integration.py +++ b/tests/test_chunking_integration.py @@ -60,7 +60,7 @@ class TestChunkingDisabledByDefault: """Test that LangChain doesn't chunk by default.""" skill_dir = create_test_skill(tmp_path, large_doc=True) - adaptor = get_adaptor('langchain') + adaptor = get_adaptor("langchain") package_path = adaptor.package(skill_dir, tmp_path) with open(package_path) as f: @@ -71,8 +71,8 @@ class TestChunkingDisabledByDefault: # No chunking metadata for doc in data: - assert 'is_chunked' not in doc['metadata'] - assert 'chunk_index' not in doc['metadata'] + assert "is_chunked" not in doc["metadata"] + assert "chunk_index" not in doc["metadata"] class TestChunkingEnabled: @@ -82,12 +82,9 @@ class TestChunkingEnabled: """Test that LangChain chunks large documents when enabled.""" skill_dir = create_test_skill(tmp_path, large_doc=True) - adaptor = get_adaptor('langchain') + adaptor = get_adaptor("langchain") package_path = adaptor.package( - skill_dir, - tmp_path, - enable_chunking=True, - chunk_max_tokens=512 + skill_dir, tmp_path, enable_chunking=True, chunk_max_tokens=512 ) with open(package_path) as f: @@ -97,25 +94,22 @@ class TestChunkingEnabled: assert len(data) > 2, f"Large doc should be chunked, got {len(data)} docs" # Check for chunking metadata - chunked_docs = [doc for doc in data if doc['metadata'].get('is_chunked')] + chunked_docs = [doc for doc in data if doc["metadata"].get("is_chunked")] assert len(chunked_docs) > 0, "Should have chunked documents" # Verify chunk metadata structure for doc in chunked_docs: - assert 'chunk_index' in doc['metadata'] - assert 'total_chunks' in doc['metadata'] - assert 'chunk_id' in doc['metadata'] + assert "chunk_index" in doc["metadata"] + assert "total_chunks" in doc["metadata"] + assert "chunk_id" in doc["metadata"] def test_chunking_preserves_small_docs(self, tmp_path): """Test that small documents are not chunked.""" skill_dir = create_test_skill(tmp_path, large_doc=False) - adaptor = get_adaptor('langchain') + adaptor = get_adaptor("langchain") package_path = adaptor.package( - skill_dir, - tmp_path, - enable_chunking=True, - chunk_max_tokens=512 + skill_dir, tmp_path, enable_chunking=True, chunk_max_tokens=512 ) with open(package_path) as f: @@ -125,7 +119,7 @@ class TestChunkingEnabled: assert len(data) == 2, "Small docs should not be chunked" for doc in data: - assert 'is_chunked' not in doc['metadata'] + assert "is_chunked" not in doc["metadata"] class TestCodeBlockPreservation: @@ -158,43 +152,43 @@ More content after code block. # Create references dir (required) (skill_dir / "references").mkdir() - adaptor = get_adaptor('langchain') + adaptor = get_adaptor("langchain") package_path = adaptor.package( skill_dir, tmp_path, enable_chunking=True, chunk_max_tokens=200, # Small chunks to force splitting - preserve_code_blocks=True + preserve_code_blocks=True, ) with open(package_path) as f: data = json.load(f) # Find chunks with code block - code_chunks = [ - doc for doc in data - if '```python' in doc['page_content'] - ] + code_chunks = [doc for doc in data if "```python" in doc["page_content"]] # Code block should be in at least one chunk assert len(code_chunks) >= 1, "Code block should be preserved" # Code block should be complete (opening and closing backticks) for chunk in code_chunks: - content = chunk['page_content'] - if '```python' in content: + content = chunk["page_content"] + if "```python" in content: # Should also have closing backticks - assert content.count('```') >= 2, "Code block should be complete" + assert content.count("```") >= 2, "Code block should be complete" class TestAutoChunkingForRAGPlatforms: """Test that chunking is auto-enabled for RAG platforms.""" - @pytest.mark.parametrize("platform", [ - 'langchain', - # Add others after they're updated: - # 'llama-index', 'haystack', 'weaviate', 'chroma', 'faiss', 'qdrant' - ]) + @pytest.mark.parametrize( + "platform", + [ + "langchain", + # Add others after they're updated: + # 'llama-index', 'haystack', 'weaviate', 'chroma', 'faiss', 'qdrant' + ], + ) def test_rag_platforms_auto_chunk(self, platform, tmp_path): """Test that RAG platforms auto-enable chunking.""" skill_dir = create_test_skill(tmp_path, large_doc=True) @@ -208,7 +202,7 @@ class TestAutoChunkingForRAGPlatforms: open_folder_after=False, skip_quality_check=True, target=platform, - enable_chunking=False # Explicitly disabled, but should be auto-enabled + enable_chunking=False, # Explicitly disabled, but should be auto-enabled ) assert success, f"Packaging failed for {platform}" @@ -221,8 +215,8 @@ class TestAutoChunkingForRAGPlatforms: # Should have multiple documents/chunks if isinstance(data, list): assert len(data) > 2, f"{platform}: Should auto-chunk large docs" - elif isinstance(data, dict) and 'documents' in data: - assert len(data['documents']) > 2, f"{platform}: Should auto-chunk large docs" + elif isinstance(data, dict) and "documents" in data: + assert len(data["documents"]) > 2, f"{platform}: Should auto-chunk large docs" class TestBaseAdaptorChunkingHelper: @@ -237,11 +231,7 @@ class TestBaseAdaptorChunkingHelper: content = "Test content " * 1000 # Large content metadata = {"source": "test"} - chunks = adaptor._maybe_chunk_content( - content, - metadata, - enable_chunking=False - ) + chunks = adaptor._maybe_chunk_content(content, metadata, enable_chunking=False) # Should return single chunk assert len(chunks) == 1 @@ -258,10 +248,7 @@ class TestBaseAdaptorChunkingHelper: metadata = {"source": "test"} chunks = adaptor._maybe_chunk_content( - content, - metadata, - enable_chunking=True, - chunk_max_tokens=512 + content, metadata, enable_chunking=True, chunk_max_tokens=512 ) # Should return single chunk @@ -282,7 +269,7 @@ class TestBaseAdaptorChunkingHelper: enable_chunking=True, chunk_max_tokens=512, preserve_code_blocks=True, - source_file="test.md" + source_file="test.md", ) # Should return multiple chunks @@ -292,12 +279,12 @@ class TestBaseAdaptorChunkingHelper: for chunk_text, chunk_meta in chunks: assert isinstance(chunk_text, str) assert isinstance(chunk_meta, dict) - assert chunk_meta['is_chunked'] - assert 'chunk_index' in chunk_meta - assert 'chunk_id' in chunk_meta + assert chunk_meta["is_chunked"] + assert "chunk_index" in chunk_meta + assert "chunk_id" in chunk_meta # Original metadata preserved - assert chunk_meta['source'] == 'test' - assert chunk_meta['file'] == 'test.md' + assert chunk_meta["source"] == "test" + assert chunk_meta["file"] == "test.md" class TestChunkingCLIIntegration: @@ -313,10 +300,10 @@ class TestChunkingCLIIntegration: skill_dir=skill_dir, open_folder_after=False, skip_quality_check=True, - target='langchain', + target="langchain", enable_chunking=True, # --chunk flag chunk_max_tokens=512, - preserve_code_blocks=True + preserve_code_blocks=True, ) assert success @@ -339,10 +326,10 @@ class TestChunkingCLIIntegration: skill_dir=skill_dir, open_folder_after=False, skip_quality_check=True, - target='langchain', + target="langchain", enable_chunking=True, chunk_max_tokens=256, # Small chunks - preserve_code_blocks=True + preserve_code_blocks=True, ) assert success @@ -355,10 +342,10 @@ class TestChunkingCLIIntegration: skill_dir=skill_dir, open_folder_after=False, skip_quality_check=True, - target='langchain', + target="langchain", enable_chunking=True, chunk_max_tokens=1024, # Large chunks - preserve_code_blocks=True + preserve_code_blocks=True, ) assert success @@ -367,9 +354,10 @@ class TestChunkingCLIIntegration: data_large = json.load(f) # Small chunk size should produce more chunks - assert len(data_small) > len(data_large), \ + assert len(data_small) > len(data_large), ( f"Small chunks ({len(data_small)}) should be more than large chunks ({len(data_large)})" + ) -if __name__ == '__main__': - pytest.main([__file__, '-v']) +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_cli_parsers.py b/tests/test_cli_parsers.py index 46c1827..d379e21 100644 --- a/tests/test_cli_parsers.py +++ b/tests/test_cli_parsers.py @@ -30,12 +30,12 @@ class TestParserRegistry: """Test getting list of parser names.""" names = get_parser_names() assert len(names) == 19 - assert 'scrape' in names - assert 'github' in names - assert 'package' in names - assert 'upload' in names - assert 'analyze' in names - assert 'config' in names + assert "scrape" in names + assert "github" in names + assert "package" in names + assert "upload" in names + assert "analyze" in names + assert "config" in names def test_all_parsers_are_subcommand_parsers(self): """Test that all parsers inherit from SubcommandParser.""" @@ -45,9 +45,9 @@ class TestParserRegistry: def test_all_parsers_have_required_properties(self): """Test that all parsers have name, help, description.""" for parser in PARSERS: - assert hasattr(parser, 'name') - assert hasattr(parser, 'help') - assert hasattr(parser, 'description') + assert hasattr(parser, "name") + assert hasattr(parser, "help") + assert hasattr(parser, "description") assert isinstance(parser.name, str) assert isinstance(parser.help, str) assert isinstance(parser.description, str) @@ -57,7 +57,7 @@ class TestParserRegistry: def test_all_parsers_have_add_arguments_method(self): """Test that all parsers implement add_arguments.""" for parser in PARSERS: - assert hasattr(parser, 'add_arguments') + assert hasattr(parser, "add_arguments") assert callable(parser.add_arguments) def test_no_duplicate_parser_names(self): @@ -106,21 +106,21 @@ class TestParserCreation: def test_register_parsers_creates_all_subcommands(self): """Test that register_parsers creates all 19 subcommands.""" main_parser = argparse.ArgumentParser() - subparsers = main_parser.add_subparsers(dest='command') + subparsers = main_parser.add_subparsers(dest="command") # Register all parsers register_parsers(subparsers) # Test that all commands can be parsed test_commands = [ - 'config --show', - 'scrape --config test.json', - 'github --repo owner/repo', - 'package output/test/', - 'upload test.zip', - 'analyze --directory .', - 'enhance output/test/', - 'estimate test.json', + "config --show", + "scrape --config test.json", + "github --repo owner/repo", + "package output/test/", + "upload test.zip", + "analyze --directory .", + "enhance output/test/", + "estimate test.json", ] for cmd in test_commands: @@ -134,75 +134,76 @@ class TestSpecificParsers: def test_scrape_parser_arguments(self): """Test ScrapeParser has correct arguments.""" main_parser = argparse.ArgumentParser() - subparsers = main_parser.add_subparsers(dest='command') + subparsers = main_parser.add_subparsers(dest="command") scrape_parser = ScrapeParser() scrape_parser.create_parser(subparsers) # Test various argument combinations - args = main_parser.parse_args(['scrape', '--config', 'test.json']) - assert args.command == 'scrape' - assert args.config == 'test.json' + args = main_parser.parse_args(["scrape", "--config", "test.json"]) + assert args.command == "scrape" + assert args.config == "test.json" - args = main_parser.parse_args(['scrape', '--config', 'test.json', '--max-pages', '100']) + args = main_parser.parse_args(["scrape", "--config", "test.json", "--max-pages", "100"]) assert args.max_pages == 100 - args = main_parser.parse_args(['scrape', '--enhance']) + args = main_parser.parse_args(["scrape", "--enhance"]) assert args.enhance is True def test_github_parser_arguments(self): """Test GitHubParser has correct arguments.""" main_parser = argparse.ArgumentParser() - subparsers = main_parser.add_subparsers(dest='command') + subparsers = main_parser.add_subparsers(dest="command") github_parser = GitHubParser() github_parser.create_parser(subparsers) - args = main_parser.parse_args(['github', '--repo', 'owner/repo']) - assert args.command == 'github' - assert args.repo == 'owner/repo' + args = main_parser.parse_args(["github", "--repo", "owner/repo"]) + assert args.command == "github" + assert args.repo == "owner/repo" - args = main_parser.parse_args(['github', '--repo', 'owner/repo', '--non-interactive']) + args = main_parser.parse_args(["github", "--repo", "owner/repo", "--non-interactive"]) assert args.non_interactive is True def test_package_parser_arguments(self): """Test PackageParser has correct arguments.""" main_parser = argparse.ArgumentParser() - subparsers = main_parser.add_subparsers(dest='command') + subparsers = main_parser.add_subparsers(dest="command") package_parser = PackageParser() package_parser.create_parser(subparsers) - args = main_parser.parse_args(['package', 'output/test/']) - assert args.command == 'package' - assert args.skill_directory == 'output/test/' + args = main_parser.parse_args(["package", "output/test/"]) + assert args.command == "package" + assert args.skill_directory == "output/test/" - args = main_parser.parse_args(['package', 'output/test/', '--target', 'gemini']) - assert args.target == 'gemini' + args = main_parser.parse_args(["package", "output/test/", "--target", "gemini"]) + assert args.target == "gemini" - args = main_parser.parse_args(['package', 'output/test/', '--no-open']) + args = main_parser.parse_args(["package", "output/test/", "--no-open"]) assert args.no_open is True def test_analyze_parser_arguments(self): """Test AnalyzeParser has correct arguments.""" main_parser = argparse.ArgumentParser() - subparsers = main_parser.add_subparsers(dest='command') + subparsers = main_parser.add_subparsers(dest="command") from skill_seekers.cli.parsers.analyze_parser import AnalyzeParser + analyze_parser = AnalyzeParser() analyze_parser.create_parser(subparsers) - args = main_parser.parse_args(['analyze', '--directory', '.']) - assert args.command == 'analyze' - assert args.directory == '.' + args = main_parser.parse_args(["analyze", "--directory", "."]) + assert args.command == "analyze" + assert args.directory == "." - args = main_parser.parse_args(['analyze', '--directory', '.', '--quick']) + args = main_parser.parse_args(["analyze", "--directory", ".", "--quick"]) assert args.quick is True - args = main_parser.parse_args(['analyze', '--directory', '.', '--comprehensive']) + args = main_parser.parse_args(["analyze", "--directory", ".", "--comprehensive"]) assert args.comprehensive is True - args = main_parser.parse_args(['analyze', '--directory', '.', '--skip-patterns']) + args = main_parser.parse_args(["analyze", "--directory", ".", "--skip-patterns"]) assert args.skip_patterns is True @@ -215,11 +216,25 @@ class TestBackwardCompatibility: # Original commands from old main.py original_commands = [ - 'config', 'scrape', 'github', 'pdf', 'unified', - 'enhance', 'enhance-status', 'package', 'upload', - 'estimate', 'extract-test-examples', 'install-agent', - 'analyze', 'install', 'resume', 'stream', - 'update', 'multilang', 'quality' + "config", + "scrape", + "github", + "pdf", + "unified", + "enhance", + "enhance-status", + "package", + "upload", + "estimate", + "extract-test-examples", + "install-agent", + "analyze", + "install", + "resume", + "stream", + "update", + "multilang", + "quality", ] for cmd in original_commands: diff --git a/tests/test_cloud_storage.py b/tests/test_cloud_storage.py index 91ff4a0..ed26c40 100644 --- a/tests/test_cloud_storage.py +++ b/tests/test_cloud_storage.py @@ -20,18 +20,21 @@ from skill_seekers.cli.storage import ( # Check if cloud storage dependencies are available try: import boto3 # noqa: F401 + BOTO3_AVAILABLE = True except ImportError: BOTO3_AVAILABLE = False try: from google.cloud import storage # noqa: F401 + GCS_AVAILABLE = True except ImportError: GCS_AVAILABLE = False try: from azure.storage.blob import BlobServiceClient # noqa: F401 + AZURE_AVAILABLE = True except ImportError: AZURE_AVAILABLE = False @@ -41,12 +44,13 @@ except ImportError: # Factory Tests # ======================================== + def test_get_storage_adaptor_s3(): """Test S3 adaptor factory.""" if not BOTO3_AVAILABLE: pytest.skip("boto3 not installed") - with patch('skill_seekers.cli.storage.s3_storage.boto3'): - adaptor = get_storage_adaptor('s3', bucket='test-bucket') + with patch("skill_seekers.cli.storage.s3_storage.boto3"): + adaptor = get_storage_adaptor("s3", bucket="test-bucket") assert isinstance(adaptor, S3StorageAdaptor) @@ -54,8 +58,8 @@ def test_get_storage_adaptor_gcs(): """Test GCS adaptor factory.""" if not GCS_AVAILABLE: pytest.skip("google-cloud-storage not installed") - with patch('skill_seekers.cli.storage.gcs_storage.storage'): - adaptor = get_storage_adaptor('gcs', bucket='test-bucket') + with patch("skill_seekers.cli.storage.gcs_storage.storage"): + adaptor = get_storage_adaptor("gcs", bucket="test-bucket") assert isinstance(adaptor, GCSStorageAdaptor) @@ -63,11 +67,11 @@ def test_get_storage_adaptor_azure(): """Test Azure adaptor factory.""" if not AZURE_AVAILABLE: pytest.skip("azure-storage-blob not installed") - with patch('skill_seekers.cli.storage.azure_storage.BlobServiceClient'): + with patch("skill_seekers.cli.storage.azure_storage.BlobServiceClient"): adaptor = get_storage_adaptor( - 'azure', - container='test-container', - connection_string='DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key' + "azure", + container="test-container", + connection_string="DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key", ) assert isinstance(adaptor, AzureStorageAdaptor) @@ -75,36 +79,37 @@ def test_get_storage_adaptor_azure(): def test_get_storage_adaptor_invalid_provider(): """Test invalid provider raises error.""" with pytest.raises(ValueError, match="Unsupported storage provider"): - get_storage_adaptor('invalid', bucket='test') + get_storage_adaptor("invalid", bucket="test") # ======================================== # S3 Storage Tests # ======================================== + def test_s3_upload_file(): """Test S3 file upload.""" if not BOTO3_AVAILABLE: pytest.skip("boto3 not installed") - with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3: + with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3: # Setup mocks mock_client = Mock() mock_boto3.client.return_value = mock_client mock_boto3.resource.return_value = Mock() - adaptor = S3StorageAdaptor(bucket='test-bucket') + adaptor = S3StorageAdaptor(bucket="test-bucket") # Create temporary file with tempfile.NamedTemporaryFile(delete=False) as tmp_file: - tmp_file.write(b'test content') + tmp_file.write(b"test content") tmp_path = tmp_file.name try: # Test upload - result = adaptor.upload_file(tmp_path, 'test.txt') + result = adaptor.upload_file(tmp_path, "test.txt") - assert result == 's3://test-bucket/test.txt' + assert result == "s3://test-bucket/test.txt" mock_client.upload_file.assert_called_once() finally: Path(tmp_path).unlink() @@ -115,23 +120,21 @@ def test_s3_download_file(): if not BOTO3_AVAILABLE: pytest.skip("boto3 not installed") - with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3: + with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3: # Setup mocks mock_client = Mock() mock_boto3.client.return_value = mock_client mock_boto3.resource.return_value = Mock() - adaptor = S3StorageAdaptor(bucket='test-bucket') + adaptor = S3StorageAdaptor(bucket="test-bucket") with tempfile.TemporaryDirectory() as tmp_dir: - local_path = os.path.join(tmp_dir, 'downloaded.txt') + local_path = os.path.join(tmp_dir, "downloaded.txt") # Test download - adaptor.download_file('test.txt', local_path) + adaptor.download_file("test.txt", local_path) - mock_client.download_file.assert_called_once_with( - 'test-bucket', 'test.txt', local_path - ) + mock_client.download_file.assert_called_once_with("test-bucket", "test.txt", local_path) def test_s3_list_files(): @@ -139,18 +142,18 @@ def test_s3_list_files(): if not BOTO3_AVAILABLE: pytest.skip("boto3 not installed") - with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3: + with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3: # Setup mocks mock_client = Mock() mock_paginator = Mock() mock_page_iterator = [ { - 'Contents': [ + "Contents": [ { - 'Key': 'file1.txt', - 'Size': 100, - 'LastModified': Mock(isoformat=lambda: '2024-01-01T00:00:00'), - 'ETag': '"abc123"' + "Key": "file1.txt", + "Size": 100, + "LastModified": Mock(isoformat=lambda: "2024-01-01T00:00:00"), + "ETag": '"abc123"', } ] } @@ -161,15 +164,15 @@ def test_s3_list_files(): mock_boto3.client.return_value = mock_client mock_boto3.resource.return_value = Mock() - adaptor = S3StorageAdaptor(bucket='test-bucket') + adaptor = S3StorageAdaptor(bucket="test-bucket") # Test list - files = adaptor.list_files('prefix/') + files = adaptor.list_files("prefix/") assert len(files) == 1 - assert files[0].key == 'file1.txt' + assert files[0].key == "file1.txt" assert files[0].size == 100 - assert files[0].etag == 'abc123' + assert files[0].etag == "abc123" def test_s3_file_exists(): @@ -177,17 +180,17 @@ def test_s3_file_exists(): if not BOTO3_AVAILABLE: pytest.skip("boto3 not installed") - with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3: + with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3: # Setup mocks mock_client = Mock() mock_client.head_object.return_value = {} mock_boto3.client.return_value = mock_client mock_boto3.resource.return_value = Mock() - adaptor = S3StorageAdaptor(bucket='test-bucket') + adaptor = S3StorageAdaptor(bucket="test-bucket") # Test exists - assert adaptor.file_exists('test.txt') is True + assert adaptor.file_exists("test.txt") is True def test_s3_get_file_url(): @@ -195,19 +198,19 @@ def test_s3_get_file_url(): if not BOTO3_AVAILABLE: pytest.skip("boto3 not installed") - with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3: + with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3: # Setup mocks mock_client = Mock() - mock_client.generate_presigned_url.return_value = 'https://s3.amazonaws.com/signed-url' + mock_client.generate_presigned_url.return_value = "https://s3.amazonaws.com/signed-url" mock_boto3.client.return_value = mock_client mock_boto3.resource.return_value = Mock() - adaptor = S3StorageAdaptor(bucket='test-bucket') + adaptor = S3StorageAdaptor(bucket="test-bucket") # Test URL generation - url = adaptor.get_file_url('test.txt', expires_in=7200) + url = adaptor.get_file_url("test.txt", expires_in=7200) - assert url == 'https://s3.amazonaws.com/signed-url' + assert url == "https://s3.amazonaws.com/signed-url" mock_client.generate_presigned_url.assert_called_once() @@ -215,12 +218,13 @@ def test_s3_get_file_url(): # GCS Storage Tests # ======================================== + def test_gcs_upload_file(): """Test GCS file upload.""" if not GCS_AVAILABLE: pytest.skip("google-cloud-storage not installed") - with patch('skill_seekers.cli.storage.gcs_storage.storage') as mock_storage: + with patch("skill_seekers.cli.storage.gcs_storage.storage") as mock_storage: # Setup mocks mock_client = Mock() mock_bucket = Mock() @@ -230,18 +234,18 @@ def test_gcs_upload_file(): mock_bucket.blob.return_value = mock_blob mock_storage.Client.return_value = mock_client - adaptor = GCSStorageAdaptor(bucket='test-bucket') + adaptor = GCSStorageAdaptor(bucket="test-bucket") # Create temporary file with tempfile.NamedTemporaryFile(delete=False) as tmp_file: - tmp_file.write(b'test content') + tmp_file.write(b"test content") tmp_path = tmp_file.name try: # Test upload - result = adaptor.upload_file(tmp_path, 'test.txt') + result = adaptor.upload_file(tmp_path, "test.txt") - assert result == 'gs://test-bucket/test.txt' + assert result == "gs://test-bucket/test.txt" mock_blob.upload_from_filename.assert_called_once() finally: Path(tmp_path).unlink() @@ -252,7 +256,7 @@ def test_gcs_download_file(): if not GCS_AVAILABLE: pytest.skip("google-cloud-storage not installed") - with patch('skill_seekers.cli.storage.gcs_storage.storage') as mock_storage: + with patch("skill_seekers.cli.storage.gcs_storage.storage") as mock_storage: # Setup mocks mock_client = Mock() mock_bucket = Mock() @@ -262,13 +266,13 @@ def test_gcs_download_file(): mock_bucket.blob.return_value = mock_blob mock_storage.Client.return_value = mock_client - adaptor = GCSStorageAdaptor(bucket='test-bucket') + adaptor = GCSStorageAdaptor(bucket="test-bucket") with tempfile.TemporaryDirectory() as tmp_dir: - local_path = os.path.join(tmp_dir, 'downloaded.txt') + local_path = os.path.join(tmp_dir, "downloaded.txt") # Test download - adaptor.download_file('test.txt', local_path) + adaptor.download_file("test.txt", local_path) mock_blob.download_to_filename.assert_called_once() @@ -278,27 +282,27 @@ def test_gcs_list_files(): if not GCS_AVAILABLE: pytest.skip("google-cloud-storage not installed") - with patch('skill_seekers.cli.storage.gcs_storage.storage') as mock_storage: + with patch("skill_seekers.cli.storage.gcs_storage.storage") as mock_storage: # Setup mocks mock_client = Mock() mock_blob = Mock() - mock_blob.name = 'file1.txt' + mock_blob.name = "file1.txt" mock_blob.size = 100 - mock_blob.updated = Mock(isoformat=lambda: '2024-01-01T00:00:00') - mock_blob.etag = 'abc123' + mock_blob.updated = Mock(isoformat=lambda: "2024-01-01T00:00:00") + mock_blob.etag = "abc123" mock_blob.metadata = {} mock_client.list_blobs.return_value = [mock_blob] mock_storage.Client.return_value = mock_client mock_client.bucket.return_value = Mock() - adaptor = GCSStorageAdaptor(bucket='test-bucket') + adaptor = GCSStorageAdaptor(bucket="test-bucket") # Test list - files = adaptor.list_files('prefix/') + files = adaptor.list_files("prefix/") assert len(files) == 1 - assert files[0].key == 'file1.txt' + assert files[0].key == "file1.txt" assert files[0].size == 100 @@ -306,12 +310,13 @@ def test_gcs_list_files(): # Azure Storage Tests # ======================================== + def test_azure_upload_file(): """Test Azure file upload.""" if not AZURE_AVAILABLE: pytest.skip("azure-storage-blob not installed") - with patch('skill_seekers.cli.storage.azure_storage.BlobServiceClient') as mock_blob_service: + with patch("skill_seekers.cli.storage.azure_storage.BlobServiceClient") as mock_blob_service: # Setup mocks mock_service_client = Mock() mock_container_client = Mock() @@ -321,19 +326,21 @@ def test_azure_upload_file(): mock_container_client.get_blob_client.return_value = mock_blob_client mock_blob_service.from_connection_string.return_value = mock_service_client - connection_string = 'DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key' - adaptor = AzureStorageAdaptor(container='test-container', connection_string=connection_string) + connection_string = "DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key" + adaptor = AzureStorageAdaptor( + container="test-container", connection_string=connection_string + ) # Create temporary file with tempfile.NamedTemporaryFile(delete=False) as tmp_file: - tmp_file.write(b'test content') + tmp_file.write(b"test content") tmp_path = tmp_file.name try: # Test upload - result = adaptor.upload_file(tmp_path, 'test.txt') + result = adaptor.upload_file(tmp_path, "test.txt") - assert 'test.blob.core.windows.net' in result + assert "test.blob.core.windows.net" in result mock_blob_client.upload_blob.assert_called_once() finally: Path(tmp_path).unlink() @@ -344,30 +351,32 @@ def test_azure_download_file(): if not AZURE_AVAILABLE: pytest.skip("azure-storage-blob not installed") - with patch('skill_seekers.cli.storage.azure_storage.BlobServiceClient') as mock_blob_service: + with patch("skill_seekers.cli.storage.azure_storage.BlobServiceClient") as mock_blob_service: # Setup mocks mock_service_client = Mock() mock_container_client = Mock() mock_blob_client = Mock() mock_download_stream = Mock() - mock_download_stream.readall.return_value = b'test content' + mock_download_stream.readall.return_value = b"test content" mock_service_client.get_container_client.return_value = mock_container_client mock_container_client.get_blob_client.return_value = mock_blob_client mock_blob_client.download_blob.return_value = mock_download_stream mock_blob_service.from_connection_string.return_value = mock_service_client - connection_string = 'DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key' - adaptor = AzureStorageAdaptor(container='test-container', connection_string=connection_string) + connection_string = "DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key" + adaptor = AzureStorageAdaptor( + container="test-container", connection_string=connection_string + ) with tempfile.TemporaryDirectory() as tmp_dir: - local_path = os.path.join(tmp_dir, 'downloaded.txt') + local_path = os.path.join(tmp_dir, "downloaded.txt") # Test download - adaptor.download_file('test.txt', local_path) + adaptor.download_file("test.txt", local_path) assert Path(local_path).exists() - assert Path(local_path).read_bytes() == b'test content' + assert Path(local_path).read_bytes() == b"test content" def test_azure_list_files(): @@ -375,29 +384,31 @@ def test_azure_list_files(): if not AZURE_AVAILABLE: pytest.skip("azure-storage-blob not installed") - with patch('skill_seekers.cli.storage.azure_storage.BlobServiceClient') as mock_blob_service: + with patch("skill_seekers.cli.storage.azure_storage.BlobServiceClient") as mock_blob_service: # Setup mocks mock_service_client = Mock() mock_container_client = Mock() mock_blob = Mock() - mock_blob.name = 'file1.txt' + mock_blob.name = "file1.txt" mock_blob.size = 100 - mock_blob.last_modified = Mock(isoformat=lambda: '2024-01-01T00:00:00') - mock_blob.etag = 'abc123' + mock_blob.last_modified = Mock(isoformat=lambda: "2024-01-01T00:00:00") + mock_blob.etag = "abc123" mock_blob.metadata = {} mock_container_client.list_blobs.return_value = [mock_blob] mock_service_client.get_container_client.return_value = mock_container_client mock_blob_service.from_connection_string.return_value = mock_service_client - connection_string = 'DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key' - adaptor = AzureStorageAdaptor(container='test-container', connection_string=connection_string) + connection_string = "DefaultEndpointsProtocol=https;AccountName=test;AccountKey=key" + adaptor = AzureStorageAdaptor( + container="test-container", connection_string=connection_string + ) # Test list - files = adaptor.list_files('prefix/') + files = adaptor.list_files("prefix/") assert len(files) == 1 - assert files[0].key == 'file1.txt' + assert files[0].key == "file1.txt" assert files[0].size == 100 @@ -405,53 +416,55 @@ def test_azure_list_files(): # Base Adaptor Tests # ======================================== + def test_storage_object(): """Test StorageObject dataclass.""" obj = StorageObject( - key='test.txt', + key="test.txt", size=100, - last_modified='2024-01-01T00:00:00', - etag='abc123', - metadata={'key': 'value'} + last_modified="2024-01-01T00:00:00", + etag="abc123", + metadata={"key": "value"}, ) - assert obj.key == 'test.txt' + assert obj.key == "test.txt" assert obj.size == 100 - assert obj.metadata == {'key': 'value'} + assert obj.metadata == {"key": "value"} def test_base_adaptor_abstract(): """Test that BaseStorageAdaptor cannot be instantiated.""" with pytest.raises(TypeError): - BaseStorageAdaptor(bucket='test') + BaseStorageAdaptor(bucket="test") # ======================================== # Integration-style Tests # ======================================== + def test_upload_directory(): """Test directory upload.""" if not BOTO3_AVAILABLE: pytest.skip("boto3 not installed") - with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3: + with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3: # Setup mocks mock_client = Mock() mock_boto3.client.return_value = mock_client mock_boto3.resource.return_value = Mock() - adaptor = S3StorageAdaptor(bucket='test-bucket') + adaptor = S3StorageAdaptor(bucket="test-bucket") # Create temporary directory with files with tempfile.TemporaryDirectory() as tmp_dir: - (Path(tmp_dir) / 'file1.txt').write_text('content1') - (Path(tmp_dir) / 'file2.txt').write_text('content2') - (Path(tmp_dir) / 'subdir').mkdir() - (Path(tmp_dir) / 'subdir' / 'file3.txt').write_text('content3') + (Path(tmp_dir) / "file1.txt").write_text("content1") + (Path(tmp_dir) / "file2.txt").write_text("content2") + (Path(tmp_dir) / "subdir").mkdir() + (Path(tmp_dir) / "subdir" / "file3.txt").write_text("content3") # Test upload directory - uploaded_files = adaptor.upload_directory(tmp_dir, 'skills/') + uploaded_files = adaptor.upload_directory(tmp_dir, "skills/") assert len(uploaded_files) == 3 assert mock_client.upload_file.call_count == 3 @@ -462,25 +475,25 @@ def test_download_directory(): if not BOTO3_AVAILABLE: pytest.skip("boto3 not installed") - with patch('skill_seekers.cli.storage.s3_storage.boto3') as mock_boto3: + with patch("skill_seekers.cli.storage.s3_storage.boto3") as mock_boto3: # Setup mocks mock_client = Mock() mock_paginator = Mock() mock_page_iterator = [ { - 'Contents': [ + "Contents": [ { - 'Key': 'skills/file1.txt', - 'Size': 100, - 'LastModified': Mock(isoformat=lambda: '2024-01-01T00:00:00'), - 'ETag': '"abc"' + "Key": "skills/file1.txt", + "Size": 100, + "LastModified": Mock(isoformat=lambda: "2024-01-01T00:00:00"), + "ETag": '"abc"', }, { - 'Key': 'skills/file2.txt', - 'Size': 200, - 'LastModified': Mock(isoformat=lambda: '2024-01-01T00:00:00'), - 'ETag': '"def"' - } + "Key": "skills/file2.txt", + "Size": 200, + "LastModified": Mock(isoformat=lambda: "2024-01-01T00:00:00"), + "ETag": '"def"', + }, ] } ] @@ -490,11 +503,11 @@ def test_download_directory(): mock_boto3.client.return_value = mock_client mock_boto3.resource.return_value = Mock() - adaptor = S3StorageAdaptor(bucket='test-bucket') + adaptor = S3StorageAdaptor(bucket="test-bucket") with tempfile.TemporaryDirectory() as tmp_dir: # Test download directory - downloaded_files = adaptor.download_directory('skills/', tmp_dir) + downloaded_files = adaptor.download_directory("skills/", tmp_dir) assert len(downloaded_files) == 2 assert mock_client.download_file.call_count == 2 diff --git a/tests/test_embedding.py b/tests/test_embedding.py index b54e664..703734a 100644 --- a/tests/test_embedding.py +++ b/tests/test_embedding.py @@ -23,6 +23,7 @@ from skill_seekers.embedding.cache import EmbeddingCache # Cache Tests # ======================================== + def test_cache_init(): """Test cache initialization.""" cache = EmbeddingCache(":memory:") @@ -121,6 +122,7 @@ def test_cache_context_manager(): # Generator Tests # ======================================== + def test_generator_init(): """Test generator initialization.""" generator = EmbeddingGenerator() @@ -174,7 +176,7 @@ def test_generator_compute_hash(): assert hash1 != hash4 -@patch('skill_seekers.embedding.generator.SENTENCE_TRANSFORMERS_AVAILABLE', False) +@patch("skill_seekers.embedding.generator.SENTENCE_TRANSFORMERS_AVAILABLE", False) def test_generator_sentence_transformers_not_available(): """Test sentence-transformers not available.""" generator = EmbeddingGenerator() @@ -183,7 +185,7 @@ def test_generator_sentence_transformers_not_available(): generator.generate("test", model="all-MiniLM-L6-v2") -@patch('skill_seekers.embedding.generator.OPENAI_AVAILABLE', False) +@patch("skill_seekers.embedding.generator.OPENAI_AVAILABLE", False) def test_generator_openai_not_available(): """Test OpenAI not available.""" generator = EmbeddingGenerator() @@ -192,7 +194,7 @@ def test_generator_openai_not_available(): generator.generate("test", model="text-embedding-3-small") -@patch('skill_seekers.embedding.generator.VOYAGE_AVAILABLE', False) +@patch("skill_seekers.embedding.generator.VOYAGE_AVAILABLE", False) def test_generator_voyage_not_available(): """Test Voyage AI not available.""" generator = EmbeddingGenerator() @@ -227,13 +229,10 @@ def test_generator_voyage_large_2_model_info(): # Model Tests # ======================================== + def test_embedding_request(): """Test EmbeddingRequest model.""" - request = EmbeddingRequest( - text="Hello world", - model="text-embedding-3-small", - normalize=True - ) + request = EmbeddingRequest(text="Hello world", model="text-embedding-3-small", normalize=True) assert request.text == "Hello world" assert request.model == "text-embedding-3-small" @@ -243,9 +242,7 @@ def test_embedding_request(): def test_batch_embedding_request(): """Test BatchEmbeddingRequest model.""" request = BatchEmbeddingRequest( - texts=["text1", "text2", "text3"], - model="text-embedding-3-small", - batch_size=32 + texts=["text1", "text2", "text3"], model="text-embedding-3-small", batch_size=32 ) assert len(request.texts) == 3 @@ -255,10 +252,7 @@ def test_batch_embedding_request(): def test_embedding_response(): """Test EmbeddingResponse model.""" response = EmbeddingResponse( - embedding=[0.1, 0.2, 0.3], - model="test-model", - dimensions=3, - cached=False + embedding=[0.1, 0.2, 0.3], model="test-model", dimensions=3, cached=False ) assert len(response.embedding) == 3 @@ -273,7 +267,7 @@ def test_batch_embedding_response(): model="test-model", dimensions=2, count=2, - cached_count=1 + cached_count=1, ) assert len(response.embeddings) == 2 @@ -288,7 +282,7 @@ def test_health_response(): version="1.0.0", models=["model1", "model2"], cache_enabled=True, - cache_size=100 + cache_size=100, ) assert response.status == "ok" @@ -303,7 +297,7 @@ def test_model_info(): provider="openai", dimensions=1536, max_tokens=8191, - cost_per_million=0.02 + cost_per_million=0.02, ) assert info.name == "test-model" @@ -315,6 +309,7 @@ def test_model_info(): # Integration Tests # ======================================== + def test_cache_batch_operations(): """Test cache batch operations.""" cache = EmbeddingCache(":memory:") diff --git a/tests/test_embedding_pipeline.py b/tests/test_embedding_pipeline.py index f7d316f..48a34fb 100644 --- a/tests/test_embedding_pipeline.py +++ b/tests/test_embedding_pipeline.py @@ -23,7 +23,7 @@ from skill_seekers.cli.embedding_pipeline import ( EmbeddingPipeline, LocalEmbeddingProvider, EmbeddingCache, - CostTracker + CostTracker, ) @@ -112,21 +112,16 @@ def test_cost_tracker(): stats = tracker.get_stats() - assert stats['total_requests'] == 2 - assert stats['total_tokens'] == 1500 - assert stats['cache_hits'] == 1 - assert stats['cache_misses'] == 1 - assert '50.0%' in stats['cache_rate'] + assert stats["total_requests"] == 2 + assert stats["total_tokens"] == 1500 + assert stats["cache_hits"] == 1 + assert stats["cache_misses"] == 1 + assert "50.0%" in stats["cache_rate"] def test_pipeline_initialization(): """Test pipeline initialization.""" - config = EmbeddingConfig( - provider='local', - model='test-model', - dimension=128, - batch_size=10 - ) + config = EmbeddingConfig(provider="local", model="test-model", dimension=128, batch_size=10) pipeline = EmbeddingPipeline(config) @@ -137,12 +132,7 @@ def test_pipeline_initialization(): def test_pipeline_generate_batch(): """Test batch embedding generation.""" - config = EmbeddingConfig( - provider='local', - model='test-model', - dimension=64, - batch_size=2 - ) + config = EmbeddingConfig(provider="local", model="test-model", dimension=64, batch_size=2) pipeline = EmbeddingPipeline(config) @@ -159,11 +149,11 @@ def test_pipeline_caching(): """Test pipeline uses caching.""" with tempfile.TemporaryDirectory() as tmpdir: config = EmbeddingConfig( - provider='local', - model='test-model', + provider="local", + model="test-model", dimension=32, batch_size=10, - cache_dir=Path(tmpdir) + cache_dir=Path(tmpdir), ) pipeline = EmbeddingPipeline(config) @@ -184,10 +174,10 @@ def test_pipeline_caching(): def test_pipeline_batch_processing(): """Test large batch is processed in chunks.""" config = EmbeddingConfig( - provider='local', - model='test-model', + provider="local", + model="test-model", dimension=16, - batch_size=3 # Small batch size + batch_size=3, # Small batch size ) pipeline = EmbeddingPipeline(config) @@ -201,11 +191,7 @@ def test_pipeline_batch_processing(): def test_validate_dimensions_valid(): """Test dimension validation with valid embeddings.""" - config = EmbeddingConfig( - provider='local', - model='test-model', - dimension=128 - ) + config = EmbeddingConfig(provider="local", model="test-model", dimension=128) pipeline = EmbeddingPipeline(config) @@ -217,11 +203,7 @@ def test_validate_dimensions_valid(): def test_validate_dimensions_invalid(): """Test dimension validation with invalid embeddings.""" - config = EmbeddingConfig( - provider='local', - model='test-model', - dimension=128 - ) + config = EmbeddingConfig(provider="local", model="test-model", dimension=128) pipeline = EmbeddingPipeline(config) @@ -234,30 +216,22 @@ def test_validate_dimensions_invalid(): def test_embedding_result_metadata(): """Test embedding result includes metadata.""" - config = EmbeddingConfig( - provider='local', - model='test-model', - dimension=256 - ) + config = EmbeddingConfig(provider="local", model="test-model", dimension=256) pipeline = EmbeddingPipeline(config) texts = ["test"] result = pipeline.generate_batch(texts, show_progress=False) - assert 'provider' in result.metadata - assert 'model' in result.metadata - assert 'dimension' in result.metadata - assert result.metadata['dimension'] == 256 + assert "provider" in result.metadata + assert "model" in result.metadata + assert "dimension" in result.metadata + assert result.metadata["dimension"] == 256 def test_cost_stats(): """Test cost statistics tracking.""" - config = EmbeddingConfig( - provider='local', - model='test-model', - dimension=64 - ) + config = EmbeddingConfig(provider="local", model="test-model", dimension=64) pipeline = EmbeddingPipeline(config) @@ -266,18 +240,14 @@ def test_cost_stats(): stats = pipeline.get_cost_stats() - assert 'total_requests' in stats - assert 'cache_hits' in stats - assert 'estimated_cost' in stats + assert "total_requests" in stats + assert "cache_hits" in stats + assert "estimated_cost" in stats def test_empty_batch(): """Test handling empty batch.""" - config = EmbeddingConfig( - provider='local', - model='test-model', - dimension=32 - ) + config = EmbeddingConfig(provider="local", model="test-model", dimension=32) pipeline = EmbeddingPipeline(config) @@ -289,11 +259,7 @@ def test_empty_batch(): def test_single_document(): """Test single document generation.""" - config = EmbeddingConfig( - provider='local', - model='test-model', - dimension=128 - ) + config = EmbeddingConfig(provider="local", model="test-model", dimension=128) pipeline = EmbeddingPipeline(config) @@ -306,11 +272,7 @@ def test_single_document(): def test_different_dimensions(): """Test different embedding dimensions.""" for dim in [64, 128, 256, 512]: - config = EmbeddingConfig( - provider='local', - model='test-model', - dimension=dim - ) + config = EmbeddingConfig(provider="local", model="test-model", dimension=dim) pipeline = EmbeddingPipeline(config) result = pipeline.generate_batch(["test"], show_progress=False) diff --git a/tests/test_enhance_skill_local.py b/tests/test_enhance_skill_local.py index 1d6c712..522c2d8 100644 --- a/tests/test_enhance_skill_local.py +++ b/tests/test_enhance_skill_local.py @@ -152,9 +152,7 @@ class TestMultiAgentSupport: def test_rejects_missing_executable(self, tmp_path, monkeypatch): """Test rejection when executable is not found on PATH.""" - monkeypatch.setattr( - "skill_seekers.cli.enhance_skill_local.shutil.which", lambda _exe: None - ) + monkeypatch.setattr("skill_seekers.cli.enhance_skill_local.shutil.which", lambda _exe: None) skill_dir = _make_skill_dir(tmp_path) with pytest.raises(ValueError, match="not found in PATH"): diff --git a/tests/test_framework_detection.py b/tests/test_framework_detection.py index b08fbfc..49c41e3 100644 --- a/tests/test_framework_detection.py +++ b/tests/test_framework_detection.py @@ -80,8 +80,9 @@ class TestFrameworkDetection(unittest.TestCase): arch_data = json.load(f) self.assertIn("frameworks_detected", arch_data) - self.assertIn("Flask", arch_data["frameworks_detected"], - "Flask should be detected from imports") + self.assertIn( + "Flask", arch_data["frameworks_detected"], "Flask should be detected from imports" + ) def test_files_with_imports_are_included(self): """Test that files with only imports are included in analysis (Issue #239).""" @@ -119,24 +120,19 @@ class TestFrameworkDetection(unittest.TestCase): analysis_data = json.load(f) # File should be included - self.assertGreater(len(analysis_data["files"]), 0, - "Files with imports should be included") + self.assertGreater(len(analysis_data["files"]), 0, "Files with imports should be included") # Find our import-only file import_file = next( - (f for f in analysis_data["files"] if "imports_only.py" in f["file"]), - None + (f for f in analysis_data["files"] if "imports_only.py" in f["file"]), None ) self.assertIsNotNone(import_file, "Import-only file should be in analysis") # Verify imports were extracted self.assertIn("imports", import_file, "Imports should be extracted") - self.assertGreater(len(import_file["imports"]), 0, - "Should have captured imports") - self.assertIn("django", import_file["imports"], - "Django import should be captured") - self.assertIn("flask", import_file["imports"], - "Flask import should be captured") + self.assertGreater(len(import_file["imports"]), 0, "Should have captured imports") + self.assertIn("django", import_file["imports"], "Django import should be captured") + self.assertIn("flask", import_file["imports"], "Flask import should be captured") def test_no_false_positive_frameworks(self): """Test that framework detection doesn't produce false positives (Issue #239).""" @@ -145,10 +141,7 @@ class TestFrameworkDetection(unittest.TestCase): app_dir.mkdir() # File with no framework imports - (app_dir / "utils.py").write_text( - "def my_function():\n" - " return 'hello'\n" - ) + (app_dir / "utils.py").write_text("def my_function():\n return 'hello'\n") # Run codebase analyzer from skill_seekers.cli.codebase_scraper import main as scraper_main @@ -180,12 +173,10 @@ class TestFrameworkDetection(unittest.TestCase): frameworks = arch_data.get("frameworks_detected", []) # Should not detect Flask just from "app" directory name - self.assertNotIn("Flask", frameworks, - "Should not detect Flask without imports") + self.assertNotIn("Flask", frameworks, "Should not detect Flask without imports") # Should not detect other frameworks with "app" in markers for fw in ["ASP.NET", "Rails", "Laravel"]: - self.assertNotIn(fw, frameworks, - f"Should not detect {fw} without real evidence") + self.assertNotIn(fw, frameworks, f"Should not detect {fw} without real evidence") if __name__ == "__main__": diff --git a/tests/test_incremental_updates.py b/tests/test_incremental_updates.py index dc945e8..818c4a8 100644 --- a/tests/test_incremental_updates.py +++ b/tests/test_incremental_updates.py @@ -20,9 +20,7 @@ import time # Add src to path sys.path.insert(0, str(Path(__file__).parent.parent / "src")) -from skill_seekers.cli.incremental_updater import ( - IncrementalUpdater -) +from skill_seekers.cli.incremental_updater import IncrementalUpdater @pytest.fixture @@ -281,15 +279,15 @@ def test_apply_update_package(temp_skill_dir): "timestamp": "2026-02-05T12:00:00", "skill_name": "test_skill", "change_summary": {"modified": 1}, - "total_changes": 1 + "total_changes": 1, }, "changes": { "SKILL.md": { "action": "modify", "version": 2, - "content": "# Updated Content\n\nApplied from package" + "content": "# Updated Content\n\nApplied from package", } - } + }, } package_path.write_text(json.dumps(update_data)) @@ -298,7 +296,9 @@ def test_apply_update_package(temp_skill_dir): success = updater.apply_update_package(package_path) assert success - assert (temp_skill_dir / "SKILL.md").read_text() == "# Updated Content\n\nApplied from package" + assert ( + temp_skill_dir / "SKILL.md" + ).read_text() == "# Updated Content\n\nApplied from package" def test_content_hash_consistency(temp_skill_dir): diff --git a/tests/test_integration.py b/tests/test_integration.py index 906b421..cca16bb 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -92,7 +92,11 @@ class TestConfigLoading(unittest.TestCase): { "type": "documentation", "base_url": "https://example.com/", - "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"}, + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre code", + }, "rate_limit": 0.5, "max_pages": 100, } diff --git a/tests/test_integration_adaptors.py b/tests/test_integration_adaptors.py index 59b73a1..fb335c7 100644 --- a/tests/test_integration_adaptors.py +++ b/tests/test_integration_adaptors.py @@ -113,6 +113,7 @@ def check_service_available(url: str, timeout: int = 5) -> bool: """Check if a service is available.""" try: import requests + response = requests.get(url, timeout=timeout) return response.status_code == 200 except Exception: @@ -133,7 +134,9 @@ class TestWeaviateIntegration: # Check if Weaviate is running if not check_service_available("http://localhost:8080/v1/.well-known/ready"): - pytest.skip("Weaviate not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)") + pytest.skip( + "Weaviate not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)" + ) # Connect to Weaviate try: @@ -144,10 +147,7 @@ class TestWeaviateIntegration: # Package skill adaptor = get_adaptor("weaviate") - SkillMetadata( - name="integration_test", - description="Integration test skill for Weaviate" - ) + SkillMetadata(name="integration_test", description="Integration test skill for Weaviate") package_path = adaptor.package(sample_skill_dir, tmp_path) assert package_path.exists(), "Package not created" @@ -173,19 +173,16 @@ class TestWeaviateIntegration: with client.batch as batch: for obj in data["objects"]: batch.add_data_object( - data_object=obj["properties"], - class_name=class_name, - uuid=obj["id"] + data_object=obj["properties"], class_name=class_name, uuid=obj["id"] ) # Wait for indexing time.sleep(1) # Query - Get all objects - result = client.query.get( - class_name, - ["content", "source", "category"] - ).with_limit(10).do() + result = ( + client.query.get(class_name, ["content", "source", "category"]).with_limit(10).do() + ) # Verify results assert "data" in result, "Query returned no data" @@ -203,8 +200,9 @@ class TestWeaviateIntegration: # Verify content contents = [obj["content"] for obj in objects] - assert any("vector" in content.lower() for content in contents), \ + assert any("vector" in content.lower() for content in contents), ( "Expected content not found" + ) finally: # Cleanup - Delete collection @@ -234,7 +232,7 @@ class TestWeaviateIntegration: description="Test metadata preservation", version="2.0.0", author="Integration Test Suite", - tags=["test", "integration", "weaviate"] + tags=["test", "integration", "weaviate"], ) package_path = adaptor.package(sample_skill_dir, tmp_path) @@ -249,18 +247,17 @@ class TestWeaviateIntegration: with client.batch as batch: for obj in data["objects"]: batch.add_data_object( - data_object=obj["properties"], - class_name=class_name, - uuid=obj["id"] + data_object=obj["properties"], class_name=class_name, uuid=obj["id"] ) time.sleep(1) # Query and verify metadata - result = client.query.get( - class_name, - ["source", "version", "author", "tags"] - ).with_limit(1).do() + result = ( + client.query.get(class_name, ["source", "version", "author", "tags"]) + .with_limit(1) + .do() + ) obj = result["data"]["Get"][class_name][0] assert obj["source"] == "metadata_test", "Source not preserved" @@ -287,7 +284,9 @@ class TestChromaIntegration: # Check if Chroma is running if not check_service_available("http://localhost:8000/api/v1/heartbeat"): - pytest.skip("ChromaDB not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)") + pytest.skip( + "ChromaDB not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)" + ) # Connect to ChromaDB try: @@ -299,8 +298,7 @@ class TestChromaIntegration: # Package skill adaptor = get_adaptor("chroma") SkillMetadata( - name="chroma_integration_test", - description="Integration test skill for ChromaDB" + name="chroma_integration_test", description="Integration test skill for ChromaDB" ) package_path = adaptor.package(sample_skill_dir, tmp_path) @@ -326,9 +324,7 @@ class TestChromaIntegration: # Add documents collection.add( - documents=data["documents"], - metadatas=data["metadatas"], - ids=data["ids"] + documents=data["documents"], metadatas=data["metadatas"], ids=data["ids"] ) # Wait for indexing @@ -340,8 +336,7 @@ class TestChromaIntegration: # Verify results assert "documents" in results, "Query returned no documents" assert len(results["documents"]) > 0, "No documents returned" - assert len(results["documents"]) == len(data["documents"]), \ - "Document count mismatch" + assert len(results["documents"]) == len(data["documents"]), "Document count mismatch" # Verify metadata assert "metadatas" in results, "Query returned no metadatas" @@ -350,8 +345,9 @@ class TestChromaIntegration: assert "category" in first_metadata, "Missing category in metadata" # Verify content - assert any("vector" in doc.lower() for doc in results["documents"]), \ + assert any("vector" in doc.lower() for doc in results["documents"]), ( "Expected content not found" + ) finally: # Cleanup - Delete collection @@ -377,8 +373,7 @@ class TestChromaIntegration: # Package and upload adaptor = get_adaptor("chroma") metadata = SkillMetadata( - name="chroma_filter_test", - description="Test filtering capabilities" + name="chroma_filter_test", description="Test filtering capabilities" ) package_path = adaptor.package(sample_skill_dir, tmp_path) @@ -390,23 +385,18 @@ class TestChromaIntegration: try: collection = client.get_or_create_collection(name=collection_name) collection.add( - documents=data["documents"], - metadatas=data["metadatas"], - ids=data["ids"] + documents=data["documents"], metadatas=data["metadatas"], ids=data["ids"] ) time.sleep(1) # Query with category filter - results = collection.get( - where={"category": "getting started"} - ) + results = collection.get(where={"category": "getting started"}) # Verify filtering worked assert len(results["documents"]) > 0, "No documents matched filter" for metadata in results["metadatas"]: - assert metadata["category"] == "getting started", \ - "Filter returned wrong category" + assert metadata["category"] == "getting started", "Filter returned wrong category" finally: with contextlib.suppress(Exception): @@ -428,7 +418,9 @@ class TestQdrantIntegration: # Check if Qdrant is running if not check_service_available("http://localhost:6333/"): - pytest.skip("Qdrant not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)") + pytest.skip( + "Qdrant not running (start with: docker-compose -f tests/docker-compose.test.yml up -d)" + ) # Connect to Qdrant try: @@ -440,8 +432,7 @@ class TestQdrantIntegration: # Package skill adaptor = get_adaptor("qdrant") SkillMetadata( - name="qdrant_integration_test", - description="Integration test skill for Qdrant" + name="qdrant_integration_test", description="Integration test skill for Qdrant" ) package_path = adaptor.package(sample_skill_dir, tmp_path) @@ -465,25 +456,21 @@ class TestQdrantIntegration: # Create collection client.create_collection( collection_name=collection_name, - vectors_config=VectorParams( - size=vector_size, - distance=Distance.COSINE - ) + vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE), ) # Upload points (with placeholder vectors for testing) points = [] for point in data["points"]: - points.append(PointStruct( - id=point["id"], - vector=[0.0] * vector_size, # Placeholder vectors - payload=point["payload"] - )) + points.append( + PointStruct( + id=point["id"], + vector=[0.0] * vector_size, # Placeholder vectors + payload=point["payload"], + ) + ) - client.upsert( - collection_name=collection_name, - points=points - ) + client.upsert(collection_name=collection_name, points=points) # Wait for indexing time.sleep(1) @@ -493,14 +480,10 @@ class TestQdrantIntegration: # Verify collection assert collection_info.points_count > 0, "No points in collection" - assert collection_info.points_count == len(data["points"]), \ - "Point count mismatch" + assert collection_info.points_count == len(data["points"]), "Point count mismatch" # Query - Scroll through points - scroll_result = client.scroll( - collection_name=collection_name, - limit=10 - ) + scroll_result = client.scroll(collection_name=collection_name, limit=10) points_list = scroll_result[0] assert len(points_list) > 0, "No points returned" @@ -514,8 +497,9 @@ class TestQdrantIntegration: # Verify content contents = [p.payload["content"] for p in points_list] - assert any("vector" in content.lower() for content in contents), \ + assert any("vector" in content.lower() for content in contents), ( "Expected content not found" + ) finally: # Cleanup - Delete collection @@ -527,8 +511,12 @@ class TestQdrantIntegration: try: from qdrant_client import QdrantClient from qdrant_client.models import ( - Distance, VectorParams, PointStruct, - Filter, FieldCondition, MatchValue + Distance, + VectorParams, + PointStruct, + Filter, + FieldCondition, + MatchValue, ) except ImportError: pytest.skip("qdrant-client not installed") @@ -544,10 +532,7 @@ class TestQdrantIntegration: # Package and upload adaptor = get_adaptor("qdrant") - SkillMetadata( - name="qdrant_filter_test", - description="Test filtering capabilities" - ) + SkillMetadata(name="qdrant_filter_test", description="Test filtering capabilities") package_path = adaptor.package(sample_skill_dir, tmp_path) with open(package_path) as f: @@ -560,19 +545,16 @@ class TestQdrantIntegration: # Create and upload client.create_collection( collection_name=collection_name, - vectors_config=VectorParams( - size=vector_size, - distance=Distance.COSINE - ) + vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE), ) points = [] for point in data["points"]: - points.append(PointStruct( - id=point["id"], - vector=[0.0] * vector_size, - payload=point["payload"] - )) + points.append( + PointStruct( + id=point["id"], vector=[0.0] * vector_size, payload=point["payload"] + ) + ) client.upsert(collection_name=collection_name, points=points) time.sleep(1) @@ -581,14 +563,9 @@ class TestQdrantIntegration: scroll_result = client.scroll( collection_name=collection_name, scroll_filter=Filter( - must=[ - FieldCondition( - key="type", - match=MatchValue(value="reference") - ) - ] + must=[FieldCondition(key="type", match=MatchValue(value="reference"))] ), - limit=10 + limit=10, ) points_list = scroll_result[0] @@ -596,8 +573,7 @@ class TestQdrantIntegration: # Verify filtering worked assert len(points_list) > 0, "No points matched filter" for point in points_list: - assert point.payload["type"] == "reference", \ - "Filter returned wrong type" + assert point.payload["type"] == "reference", "Filter returned wrong type" finally: with contextlib.suppress(Exception): @@ -607,4 +583,5 @@ class TestQdrantIntegration: if __name__ == "__main__": # Run integration tests import sys + sys.exit(pytest.main([__file__, "-v", "-m", "integration"])) diff --git a/tests/test_issue_277_real_world.py b/tests/test_issue_277_real_world.py index 263764a..1c15f30 100644 --- a/tests/test_issue_277_real_world.py +++ b/tests/test_issue_277_real_world.py @@ -192,9 +192,7 @@ https://mikro-orm.io/docs/defining-entities#formulas # Verify converted URLs are valid # In real scenario, these would be added to pending_urls and scraped - self.assertTrue( - len(converted_urls) > 0, "Should generate at least one URL to scrape" - ) + self.assertTrue(len(converted_urls) > 0, "Should generate at least one URL to scrape") # Verify no URLs would cause 404 (no anchors in middle of path) for url in converted_urls: diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py index f81fc21..b58df83 100644 --- a/tests/test_mcp_server.py +++ b/tests/test_mcp_server.py @@ -464,13 +464,15 @@ class TestValidateConfigTool(unittest.IsolatedAsyncioTestCase): valid_config = { "name": "valid-test", "description": "Test configuration", - "sources": [{ - "type": "documentation", - "base_url": "https://example.com/", - "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"}, - "rate_limit": 0.5, - "max_pages": 100, - }], + "sources": [ + { + "type": "documentation", + "base_url": "https://example.com/", + "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre"}, + "rate_limit": 0.5, + "max_pages": 100, + } + ], } with open(config_path, "w") as f: json.dump(valid_config, f) diff --git a/tests/test_multilang_support.py b/tests/test_multilang_support.py index 7c22271..6127692 100644 --- a/tests/test_multilang_support.py +++ b/tests/test_multilang_support.py @@ -19,10 +19,7 @@ import json # Add src to path sys.path.insert(0, str(Path(__file__).parent.parent / "src")) -from skill_seekers.cli.multilang_support import ( - LanguageDetector, - MultiLanguageManager -) +from skill_seekers.cli.multilang_support import LanguageDetector, MultiLanguageManager def test_detect_english(): @@ -32,8 +29,8 @@ def test_detect_english(): text = "This is an English document. It contains common English words." lang_info = detector.detect(text) - assert lang_info.code == 'en' - assert lang_info.name == 'English' + assert lang_info.code == "en" + assert lang_info.name == "English" assert lang_info.confidence > 0.0 @@ -44,8 +41,8 @@ def test_detect_spanish(): text = "Este es un documento en español. Contiene palabras comunes en español." lang_info = detector.detect(text) - assert lang_info.code == 'es' - assert lang_info.name == 'Spanish' + assert lang_info.code == "es" + assert lang_info.name == "Spanish" def test_detect_french(): @@ -55,8 +52,8 @@ def test_detect_french(): text = "Ceci est un document en français. Il contient des mots français communs." lang_info = detector.detect(text) - assert lang_info.code == 'fr' - assert lang_info.name == 'French' + assert lang_info.code == "fr" + assert lang_info.name == "French" def test_detect_german(): @@ -66,8 +63,8 @@ def test_detect_german(): text = "Dies ist ein deutsches Dokument. Es enthält übliche deutsche Wörter." lang_info = detector.detect(text) - assert lang_info.code == 'de' - assert lang_info.name == 'German' + assert lang_info.code == "de" + assert lang_info.name == "German" def test_detect_chinese(): @@ -77,33 +74,33 @@ def test_detect_chinese(): text = "这是一个中文文档。它包含常见的中文字符。" lang_info = detector.detect(text) - assert lang_info.code == 'zh' - assert lang_info.name == 'Chinese' + assert lang_info.code == "zh" + assert lang_info.name == "Chinese" def test_detect_from_filename_dot_pattern(): """Test language detection from filename (file.en.md pattern).""" detector = LanguageDetector() - assert detector.detect_from_filename("README.en.md") == 'en' - assert detector.detect_from_filename("guide.es.md") == 'es' - assert detector.detect_from_filename("doc.fr.md") == 'fr' + assert detector.detect_from_filename("README.en.md") == "en" + assert detector.detect_from_filename("guide.es.md") == "es" + assert detector.detect_from_filename("doc.fr.md") == "fr" def test_detect_from_filename_underscore_pattern(): """Test language detection from filename (file_en.md pattern).""" detector = LanguageDetector() - assert detector.detect_from_filename("README_en.md") == 'en' - assert detector.detect_from_filename("guide_es.md") == 'es' + assert detector.detect_from_filename("README_en.md") == "en" + assert detector.detect_from_filename("guide_es.md") == "es" def test_detect_from_filename_dash_pattern(): """Test language detection from filename (file-en.md pattern).""" detector = LanguageDetector() - assert detector.detect_from_filename("README-en.md") == 'en' - assert detector.detect_from_filename("guide-es.md") == 'es' + assert detector.detect_from_filename("README-en.md") == "en" + assert detector.detect_from_filename("guide-es.md") == "es" def test_detect_from_filename_no_match(): @@ -118,15 +115,11 @@ def test_add_document_single_language(): """Test adding documents in single language.""" manager = MultiLanguageManager() - manager.add_document( - "README.md", - "This is an English document.", - {"category": "overview"} - ) + manager.add_document("README.md", "This is an English document.", {"category": "overview"}) assert len(manager.get_languages()) == 1 - assert 'en' in manager.get_languages() - assert manager.get_document_count('en') == 1 + assert "en" in manager.get_languages() + assert manager.get_document_count("en") == 1 def test_add_document_multiple_languages(): @@ -138,9 +131,9 @@ def test_add_document_multiple_languages(): manager.add_document("README.fr.md", "Ceci est français.", {}) assert len(manager.get_languages()) == 3 - assert 'en' in manager.get_languages() - assert 'es' in manager.get_languages() - assert 'fr' in manager.get_languages() + assert "en" in manager.get_languages() + assert "es" in manager.get_languages() + assert "fr" in manager.get_languages() def test_force_language(): @@ -148,15 +141,10 @@ def test_force_language(): manager = MultiLanguageManager() # Force Spanish despite English content - manager.add_document( - "file.md", - "This is actually English content.", - {}, - force_language='es' - ) + manager.add_document("file.md", "This is actually English content.", {}, force_language="es") - assert 'es' in manager.get_languages() - assert manager.get_document_count('es') == 1 + assert "es" in manager.get_languages() + assert manager.get_document_count("es") == 1 def test_filename_language_priority(): @@ -164,14 +152,10 @@ def test_filename_language_priority(): manager = MultiLanguageManager() # Filename says Spanish, but content is English - manager.add_document( - "guide.es.md", - "This is English content.", - {} - ) + manager.add_document("guide.es.md", "This is English content.", {}) # Should use filename language - assert 'es' in manager.get_languages() + assert "es" in manager.get_languages() def test_document_count_all(): @@ -183,8 +167,8 @@ def test_document_count_all(): manager.add_document("file3.es.md", "Spanish doc", {}) assert manager.get_document_count() == 3 - assert manager.get_document_count('en') == 2 - assert manager.get_document_count('es') == 1 + assert manager.get_document_count("en") == 2 + assert manager.get_document_count("es") == 1 def test_primary_language(): @@ -195,7 +179,7 @@ def test_primary_language(): manager.add_document("file2.es.md", "Spanish doc", {}) # Primary should be first added - assert manager.primary_language == 'en' + assert manager.primary_language == "en" def test_translation_status(): @@ -208,9 +192,9 @@ def test_translation_status(): status = manager.get_translation_status() - assert status.source_language == 'en' - assert 'es' in status.translated_languages - assert 'fr' in status.translated_languages + assert status.source_language == "en" + assert "es" in status.translated_languages + assert "fr" in status.translated_languages assert len(status.translated_languages) == 2 @@ -225,17 +209,17 @@ def test_export_by_language(): exports = manager.export_by_language(Path(tmpdir)) assert len(exports) == 2 - assert 'en' in exports - assert 'es' in exports + assert "en" in exports + assert "es" in exports # Check files exist - assert exports['en'].exists() - assert exports['es'].exists() + assert exports["en"].exists() + assert exports["es"].exists() # Check content - en_data = json.loads(exports['en'].read_text()) - assert en_data['language'] == 'en' - assert en_data['document_count'] == 1 + en_data = json.loads(exports["en"].read_text()) + assert en_data["language"] == "en" + assert en_data["document_count"] == 1 def test_translation_report_generation(): @@ -268,11 +252,11 @@ def test_script_detection(): # English uses Latin script en_info = detector.detect("This is English") - assert en_info.script == 'Latin' + assert en_info.script == "Latin" # Chinese uses Han script zh_info = detector.detect("这是中文") - assert zh_info.script == 'Han' + assert zh_info.script == "Han" def test_confidence_scoring(): @@ -283,7 +267,7 @@ def test_confidence_scoring(): strong_en = "The quick brown fox jumps over the lazy dog. This is clearly English." lang_info = detector.detect(strong_en) - assert lang_info.code == 'en' + assert lang_info.code == "en" assert lang_info.confidence > 0.3 # Should have decent confidence @@ -294,9 +278,9 @@ def test_metadata_preservation(): metadata = {"category": "guide", "version": "1.0"} manager.add_document("file.md", "English content", metadata) - docs = manager.documents['en'] + docs = manager.documents["en"] assert len(docs) == 1 - assert docs[0]['metadata'] == metadata + assert docs[0]["metadata"] == metadata if __name__ == "__main__": diff --git a/tests/test_preset_system.py b/tests/test_preset_system.py index 6626ed3..3f28529 100644 --- a/tests/test_preset_system.py +++ b/tests/test_preset_system.py @@ -14,9 +14,9 @@ class TestPresetDefinitions: def test_all_presets_defined(self): """Test that all expected presets are defined.""" - assert 'quick' in PRESETS - assert 'standard' in PRESETS - assert 'comprehensive' in PRESETS + assert "quick" in PRESETS + assert "standard" in PRESETS + assert "comprehensive" in PRESETS assert len(PRESETS) == 3 def test_preset_structure(self): @@ -25,7 +25,7 @@ class TestPresetDefinitions: assert isinstance(preset, AnalysisPreset) assert preset.name assert preset.description - assert preset.depth in ['surface', 'deep', 'full'] + assert preset.depth in ["surface", "deep", "full"] assert isinstance(preset.features, dict) assert 0 <= preset.enhance_level <= 3 assert preset.estimated_time @@ -33,45 +33,45 @@ class TestPresetDefinitions: def test_quick_preset(self): """Test quick preset configuration.""" - quick = PRESETS['quick'] - assert quick.name == 'Quick' - assert quick.depth == 'surface' + quick = PRESETS["quick"] + assert quick.name == "Quick" + assert quick.depth == "surface" assert quick.enhance_level == 0 - assert quick.estimated_time == '1-2 minutes' - assert quick.icon == '⚡' + assert quick.estimated_time == "1-2 minutes" + assert quick.icon == "⚡" # Quick should disable slow features - assert quick.features['api_reference'] # Essential - assert not quick.features['dependency_graph'] # Slow - assert not quick.features['patterns'] # Slow - assert not quick.features['test_examples'] # Slow - assert not quick.features['how_to_guides'] # Requires AI - assert quick.features['docs'] # Essential + assert quick.features["api_reference"] # Essential + assert not quick.features["dependency_graph"] # Slow + assert not quick.features["patterns"] # Slow + assert not quick.features["test_examples"] # Slow + assert not quick.features["how_to_guides"] # Requires AI + assert quick.features["docs"] # Essential def test_standard_preset(self): """Test standard preset configuration.""" - standard = PRESETS['standard'] - assert standard.name == 'Standard' - assert standard.depth == 'deep' + standard = PRESETS["standard"] + assert standard.name == "Standard" + assert standard.depth == "deep" assert standard.enhance_level == 1 - assert standard.estimated_time == '5-10 minutes' - assert standard.icon == '🎯' + assert standard.estimated_time == "5-10 minutes" + assert standard.icon == "🎯" # Standard should enable core features - assert standard.features['api_reference'] - assert standard.features['dependency_graph'] - assert standard.features['patterns'] - assert standard.features['test_examples'] - assert not standard.features['how_to_guides'] # Slow - assert standard.features['config_patterns'] - assert standard.features['docs'] + assert standard.features["api_reference"] + assert standard.features["dependency_graph"] + assert standard.features["patterns"] + assert standard.features["test_examples"] + assert not standard.features["how_to_guides"] # Slow + assert standard.features["config_patterns"] + assert standard.features["docs"] def test_comprehensive_preset(self): """Test comprehensive preset configuration.""" - comprehensive = PRESETS['comprehensive'] - assert comprehensive.name == 'Comprehensive' - assert comprehensive.depth == 'full' + comprehensive = PRESETS["comprehensive"] + assert comprehensive.name == "Comprehensive" + assert comprehensive.depth == "full" assert comprehensive.enhance_level == 3 - assert comprehensive.estimated_time == '20-60 minutes' - assert comprehensive.icon == '🚀' + assert comprehensive.estimated_time == "20-60 minutes" + assert comprehensive.icon == "🚀" # Comprehensive should enable ALL features assert all(comprehensive.features.values()) @@ -81,44 +81,44 @@ class TestPresetManager: def test_get_preset(self): """Test PresetManager.get_preset().""" - quick = PresetManager.get_preset('quick') + quick = PresetManager.get_preset("quick") assert quick is not None - assert quick.name == 'Quick' - assert quick.depth == 'surface' + assert quick.name == "Quick" + assert quick.depth == "surface" # Case insensitive - standard = PresetManager.get_preset('STANDARD') + standard = PresetManager.get_preset("STANDARD") assert standard is not None - assert standard.name == 'Standard' + assert standard.name == "Standard" def test_get_preset_invalid(self): """Test PresetManager.get_preset() with invalid name.""" - invalid = PresetManager.get_preset('nonexistent') + invalid = PresetManager.get_preset("nonexistent") assert invalid is None def test_list_presets(self): """Test PresetManager.list_presets().""" presets = PresetManager.list_presets() assert len(presets) == 3 - assert 'quick' in presets - assert 'standard' in presets - assert 'comprehensive' in presets + assert "quick" in presets + assert "standard" in presets + assert "comprehensive" in presets def test_format_preset_help(self): """Test PresetManager.format_preset_help().""" help_text = PresetManager.format_preset_help() - assert 'Available presets:' in help_text - assert '⚡ quick' in help_text - assert '🎯 standard' in help_text - assert '🚀 comprehensive' in help_text - assert '1-2 minutes' in help_text - assert '5-10 minutes' in help_text - assert '20-60 minutes' in help_text + assert "Available presets:" in help_text + assert "⚡ quick" in help_text + assert "🎯 standard" in help_text + assert "🚀 comprehensive" in help_text + assert "1-2 minutes" in help_text + assert "5-10 minutes" in help_text + assert "20-60 minutes" in help_text def test_get_default_preset(self): """Test PresetManager.get_default_preset().""" default = PresetManager.get_default_preset() - assert default == 'standard' + assert default == "standard" class TestPresetApplication: @@ -126,85 +126,85 @@ class TestPresetApplication: def test_apply_preset_quick(self): """Test applying quick preset.""" - args = {'directory': '/tmp/test'} - updated = PresetManager.apply_preset('quick', args) + args = {"directory": "/tmp/test"} + updated = PresetManager.apply_preset("quick", args) - assert updated['depth'] == 'surface' - assert updated['enhance_level'] == 0 - assert updated['skip_patterns'] # Quick disables patterns - assert updated['skip_dependency_graph'] # Quick disables dep graph - assert updated['skip_test_examples'] # Quick disables tests - assert updated['skip_how_to_guides'] # Quick disables guides - assert not updated['skip_api_reference'] # Quick enables API ref - assert not updated['skip_docs'] # Quick enables docs + assert updated["depth"] == "surface" + assert updated["enhance_level"] == 0 + assert updated["skip_patterns"] # Quick disables patterns + assert updated["skip_dependency_graph"] # Quick disables dep graph + assert updated["skip_test_examples"] # Quick disables tests + assert updated["skip_how_to_guides"] # Quick disables guides + assert not updated["skip_api_reference"] # Quick enables API ref + assert not updated["skip_docs"] # Quick enables docs def test_apply_preset_standard(self): """Test applying standard preset.""" - args = {'directory': '/tmp/test'} - updated = PresetManager.apply_preset('standard', args) + args = {"directory": "/tmp/test"} + updated = PresetManager.apply_preset("standard", args) - assert updated['depth'] == 'deep' - assert updated['enhance_level'] == 1 - assert not updated['skip_patterns'] # Standard enables patterns - assert not updated['skip_dependency_graph'] # Standard enables dep graph - assert not updated['skip_test_examples'] # Standard enables tests - assert updated['skip_how_to_guides'] # Standard disables guides (slow) - assert not updated['skip_api_reference'] # Standard enables API ref - assert not updated['skip_docs'] # Standard enables docs + assert updated["depth"] == "deep" + assert updated["enhance_level"] == 1 + assert not updated["skip_patterns"] # Standard enables patterns + assert not updated["skip_dependency_graph"] # Standard enables dep graph + assert not updated["skip_test_examples"] # Standard enables tests + assert updated["skip_how_to_guides"] # Standard disables guides (slow) + assert not updated["skip_api_reference"] # Standard enables API ref + assert not updated["skip_docs"] # Standard enables docs def test_apply_preset_comprehensive(self): """Test applying comprehensive preset.""" - args = {'directory': '/tmp/test'} - updated = PresetManager.apply_preset('comprehensive', args) + args = {"directory": "/tmp/test"} + updated = PresetManager.apply_preset("comprehensive", args) - assert updated['depth'] == 'full' - assert updated['enhance_level'] == 3 + assert updated["depth"] == "full" + assert updated["enhance_level"] == 3 # Comprehensive enables ALL features - assert not updated['skip_patterns'] - assert not updated['skip_dependency_graph'] - assert not updated['skip_test_examples'] - assert not updated['skip_how_to_guides'] - assert not updated['skip_api_reference'] - assert not updated['skip_config_patterns'] - assert not updated['skip_docs'] + assert not updated["skip_patterns"] + assert not updated["skip_dependency_graph"] + assert not updated["skip_test_examples"] + assert not updated["skip_how_to_guides"] + assert not updated["skip_api_reference"] + assert not updated["skip_config_patterns"] + assert not updated["skip_docs"] def test_cli_overrides_preset(self): """Test that CLI args override preset defaults.""" args = { - 'directory': '/tmp/test', - 'enhance_level': 2, # Override preset default - 'skip_patterns': False # Override preset default + "directory": "/tmp/test", + "enhance_level": 2, # Override preset default + "skip_patterns": False, # Override preset default } - updated = PresetManager.apply_preset('quick', args) + updated = PresetManager.apply_preset("quick", args) # Preset says enhance_level=0, but CLI said 2 - assert updated['enhance_level'] == 2 # CLI wins + assert updated["enhance_level"] == 2 # CLI wins # Preset says skip_patterns=True (disabled), but CLI said False (enabled) - assert not updated['skip_patterns'] # CLI wins + assert not updated["skip_patterns"] # CLI wins def test_apply_preset_preserves_args(self): """Test that apply_preset preserves existing args.""" args = { - 'directory': '/tmp/test', - 'output': 'custom_output/', - 'languages': 'Python,JavaScript' + "directory": "/tmp/test", + "output": "custom_output/", + "languages": "Python,JavaScript", } - updated = PresetManager.apply_preset('standard', args) + updated = PresetManager.apply_preset("standard", args) # Existing args should be preserved - assert updated['directory'] == '/tmp/test' - assert updated['output'] == 'custom_output/' - assert updated['languages'] == 'Python,JavaScript' + assert updated["directory"] == "/tmp/test" + assert updated["output"] == "custom_output/" + assert updated["languages"] == "Python,JavaScript" def test_apply_preset_invalid(self): """Test applying invalid preset raises error.""" - args = {'directory': '/tmp/test'} + args = {"directory": "/tmp/test"} with pytest.raises(ValueError, match="Unknown preset: nonexistent"): - PresetManager.apply_preset('nonexistent', args) + PresetManager.apply_preset("nonexistent", args) class TestDeprecationWarnings: @@ -215,12 +215,7 @@ class TestDeprecationWarnings: from skill_seekers.cli.codebase_scraper import _check_deprecated_flags import argparse - args = argparse.Namespace( - quick=True, - comprehensive=False, - depth=None, - ai_mode='auto' - ) + args = argparse.Namespace(quick=True, comprehensive=False, depth=None, ai_mode="auto") _check_deprecated_flags(args) @@ -235,12 +230,7 @@ class TestDeprecationWarnings: from skill_seekers.cli.codebase_scraper import _check_deprecated_flags import argparse - args = argparse.Namespace( - quick=False, - comprehensive=True, - depth=None, - ai_mode='auto' - ) + args = argparse.Namespace(quick=False, comprehensive=True, depth=None, ai_mode="auto") _check_deprecated_flags(args) @@ -255,12 +245,7 @@ class TestDeprecationWarnings: from skill_seekers.cli.codebase_scraper import _check_deprecated_flags import argparse - args = argparse.Namespace( - quick=False, - comprehensive=False, - depth='full', - ai_mode='auto' - ) + args = argparse.Namespace(quick=False, comprehensive=False, depth="full", ai_mode="auto") _check_deprecated_flags(args) @@ -275,12 +260,7 @@ class TestDeprecationWarnings: from skill_seekers.cli.codebase_scraper import _check_deprecated_flags import argparse - args = argparse.Namespace( - quick=False, - comprehensive=False, - depth=None, - ai_mode='api' - ) + args = argparse.Namespace(quick=False, comprehensive=False, depth=None, ai_mode="api") _check_deprecated_flags(args) @@ -295,12 +275,7 @@ class TestDeprecationWarnings: from skill_seekers.cli.codebase_scraper import _check_deprecated_flags import argparse - args = argparse.Namespace( - quick=True, - comprehensive=False, - depth='surface', - ai_mode='local' - ) + args = argparse.Namespace(quick=True, comprehensive=False, depth="surface", ai_mode="local") _check_deprecated_flags(args) @@ -317,12 +292,7 @@ class TestDeprecationWarnings: from skill_seekers.cli.codebase_scraper import _check_deprecated_flags import argparse - args = argparse.Namespace( - quick=False, - comprehensive=False, - depth=None, - ai_mode='auto' - ) + args = argparse.Namespace(quick=False, comprehensive=False, depth=None, ai_mode="auto") _check_deprecated_flags(args) @@ -337,31 +307,31 @@ class TestBackwardCompatibility: def test_old_flags_still_work(self): """Test that old flags still work (with warnings).""" # --quick flag - args = {'quick': True} - updated = PresetManager.apply_preset('quick', args) - assert updated['depth'] == 'surface' + args = {"quick": True} + updated = PresetManager.apply_preset("quick", args) + assert updated["depth"] == "surface" # --comprehensive flag - args = {'comprehensive': True} - updated = PresetManager.apply_preset('comprehensive', args) - assert updated['depth'] == 'full' + args = {"comprehensive": True} + updated = PresetManager.apply_preset("comprehensive", args) + assert updated["depth"] == "full" def test_preset_flag_preferred(self): """Test that --preset flag is the recommended way.""" # Using --preset quick - args = {'preset': 'quick'} - updated = PresetManager.apply_preset('quick', args) - assert updated['depth'] == 'surface' + args = {"preset": "quick"} + updated = PresetManager.apply_preset("quick", args) + assert updated["depth"] == "surface" # Using --preset standard - args = {'preset': 'standard'} - updated = PresetManager.apply_preset('standard', args) - assert updated['depth'] == 'deep' + args = {"preset": "standard"} + updated = PresetManager.apply_preset("standard", args) + assert updated["depth"] == "deep" # Using --preset comprehensive - args = {'preset': 'comprehensive'} - updated = PresetManager.apply_preset('comprehensive', args) - assert updated['depth'] == 'full' + args = {"preset": "comprehensive"} + updated = PresetManager.apply_preset("comprehensive", args) + assert updated["depth"] == "full" if __name__ == "__main__": diff --git a/tests/test_quality_metrics.py b/tests/test_quality_metrics.py index 25e4b7f..714510f 100644 --- a/tests/test_quality_metrics.py +++ b/tests/test_quality_metrics.py @@ -19,10 +19,7 @@ import tempfile # Add src to path sys.path.insert(0, str(Path(__file__).parent.parent / "src")) -from skill_seekers.cli.quality_metrics import ( - QualityAnalyzer, - MetricLevel -) +from skill_seekers.cli.quality_metrics import QualityAnalyzer, MetricLevel @pytest.fixture @@ -176,9 +173,9 @@ def test_calculate_statistics(complete_skill_dir): analyzer = QualityAnalyzer(complete_skill_dir) stats = analyzer.calculate_statistics() - assert stats['total_files'] > 0 - assert stats['markdown_files'] > 0 - assert stats['total_words'] > 0 + assert stats["total_files"] > 0 + assert stats["markdown_files"] > 0 + assert stats["total_words"] > 0 def test_overall_score_calculation(): @@ -197,9 +194,7 @@ def test_overall_score_calculation(): coverage = 70.0 health = 85.0 - overall = analyzer.calculate_overall_score( - completeness, accuracy, coverage, health - ) + overall = analyzer.calculate_overall_score(completeness, accuracy, coverage, health) assert overall.completeness == 80.0 assert overall.accuracy == 90.0 @@ -218,13 +213,13 @@ def test_grade_assignment(): # Test various scores score_95 = analyzer.calculate_overall_score(95, 95, 95, 95) - assert score_95.grade == 'A+' + assert score_95.grade == "A+" score_85 = analyzer.calculate_overall_score(85, 85, 85, 85) - assert score_85.grade in ['A-', 'B+'] + assert score_85.grade in ["A-", "B+"] score_70 = analyzer.calculate_overall_score(70, 70, 70, 70) - assert score_70.grade in ['B-', 'C+', 'C'] + assert score_70.grade in ["B-", "C+", "C"] def test_generate_recommendations(): @@ -240,7 +235,7 @@ def test_generate_recommendations(): recommendations = analyzer.generate_recommendations(score) assert len(recommendations) > 0 - assert any('completeness' in r.lower() for r in recommendations) + assert any("completeness" in r.lower() for r in recommendations) def test_generate_report(complete_skill_dir): diff --git a/tests/test_rag_chunker.py b/tests/test_rag_chunker.py index 7ce7282..ae674ef 100644 --- a/tests/test_rag_chunker.py +++ b/tests/test_rag_chunker.py @@ -28,7 +28,7 @@ class TestRAGChunker: chunk_overlap=100, preserve_code_blocks=False, preserve_paragraphs=False, - min_chunk_size=50 + min_chunk_size=50, ) assert chunker.chunk_size == 1024 @@ -180,13 +180,17 @@ class TestRAGChunker: # Create SKILL.md skill_md = skill_dir / "SKILL.md" - skill_md.write_text("# Main Skill\n\nThis is the main skill content.\n\nWith multiple paragraphs.") + skill_md.write_text( + "# Main Skill\n\nThis is the main skill content.\n\nWith multiple paragraphs." + ) # Create references directory with files references_dir = skill_dir / "references" references_dir.mkdir() - (references_dir / "getting_started.md").write_text("# Getting Started\n\nQuick start guide.") + (references_dir / "getting_started.md").write_text( + "# Getting Started\n\nQuick start guide." + ) (references_dir / "api.md").write_text("# API Reference\n\nAPI documentation.") # Chunk skill @@ -209,7 +213,7 @@ class TestRAGChunker: { "chunk_id": "test_0", "page_content": "Test content", - "metadata": {"source": "test", "chunk_index": 0} + "metadata": {"source": "test", "chunk_index": 0}, } ] @@ -340,7 +344,7 @@ class TestRAGChunker: metadata = { "source": "react-docs", "category": "hooks", - "url": "https://react.dev/reference/react" + "url": "https://react.dev/reference/react", } chunks = chunker.chunk_document(text, metadata) @@ -379,10 +383,7 @@ class TestRAGChunkerIntegration: # Convert to LangChain Documents docs = [ - Document( - page_content=chunk["page_content"], - metadata=chunk["metadata"] - ) + Document(page_content=chunk["page_content"], metadata=chunk["metadata"]) for chunk in chunks ] @@ -407,11 +408,7 @@ class TestRAGChunkerIntegration: # Convert to LlamaIndex TextNodes nodes = [ - TextNode( - text=chunk["page_content"], - metadata=chunk["metadata"], - id_=chunk["chunk_id"] - ) + TextNode(text=chunk["page_content"], metadata=chunk["metadata"], id_=chunk["chunk_id"]) for chunk in chunks ] diff --git a/tests/test_server_fastmcp_http.py b/tests/test_server_fastmcp_http.py index d093066..91ada46 100644 --- a/tests/test_server_fastmcp_http.py +++ b/tests/test_server_fastmcp_http.py @@ -13,6 +13,7 @@ pytest.importorskip("mcp.server") # Check if starlette is available try: from starlette.testclient import TestClient + STARLETTE_AVAILABLE = True except ImportError: STARLETTE_AVAILABLE = False @@ -21,8 +22,7 @@ from skill_seekers.mcp.server_fastmcp import mcp # Skip all tests if starlette is not installed pytestmark = pytest.mark.skipif( - not STARLETTE_AVAILABLE, - reason="starlette not installed (pip install starlette httpx)" + not STARLETTE_AVAILABLE, reason="starlette not installed (pip install starlette httpx)" ) diff --git a/tests/test_streaming_ingestion.py b/tests/test_streaming_ingestion.py index 7360b6b..dc5f3d7 100644 --- a/tests/test_streaming_ingestion.py +++ b/tests/test_streaming_ingestion.py @@ -18,10 +18,7 @@ import tempfile # Add src to path sys.path.insert(0, str(Path(__file__).parent.parent / "src")) -from skill_seekers.cli.streaming_ingest import ( - StreamingIngester, - IngestionProgress -) +from skill_seekers.cli.streaming_ingest import StreamingIngester, IngestionProgress @pytest.fixture @@ -158,11 +155,13 @@ def test_progress_tracking(temp_skill_dir): progress_updates = [] def callback(progress: IngestionProgress): - progress_updates.append({ - "processed_docs": progress.processed_documents, - "processed_chunks": progress.processed_chunks, - "percent": progress.progress_percent - }) + progress_updates.append( + { + "processed_docs": progress.processed_documents, + "processed_chunks": progress.processed_chunks, + "percent": progress.progress_percent, + } + ) list(ingester.stream_skill_directory(temp_skill_dir, callback=callback)) @@ -171,7 +170,9 @@ def test_progress_tracking(temp_skill_dir): # Progress should increase for i in range(len(progress_updates) - 1): - assert progress_updates[i + 1]["processed_chunks"] >= progress_updates[i]["processed_chunks"] + assert ( + progress_updates[i + 1]["processed_chunks"] >= progress_updates[i]["processed_chunks"] + ) def test_checkpoint_save_load(): @@ -189,7 +190,7 @@ def test_checkpoint_save_load(): processed_chunks=50, failed_chunks=2, bytes_processed=10000, - start_time=1234567890.0 + start_time=1234567890.0, ) # Save checkpoint @@ -215,7 +216,7 @@ def test_format_progress(): processed_chunks=50, failed_chunks=0, bytes_processed=10000, - start_time=0.0 + start_time=0.0, ) progress_str = ingester.format_progress() @@ -245,17 +246,19 @@ def test_chunk_size_validation(): # Small chunks ingester_small = StreamingIngester(chunk_size=100, chunk_overlap=10) - chunks_small = list(ingester_small.chunk_document( - content, - {"source": "test", "file": "test.md", "category": "test"} - )) + chunks_small = list( + ingester_small.chunk_document( + content, {"source": "test", "file": "test.md", "category": "test"} + ) + ) # Large chunks ingester_large = StreamingIngester(chunk_size=500, chunk_overlap=50) - chunks_large = list(ingester_large.chunk_document( - content, - {"source": "test", "file": "test.md", "category": "test"} - )) + chunks_large = list( + ingester_large.chunk_document( + content, {"source": "test", "file": "test.md", "category": "test"} + ) + ) # Smaller chunk size should create more chunks assert len(chunks_small) > len(chunks_large) diff --git a/tests/test_upload_integration.py b/tests/test_upload_integration.py index 6c1eaac..5d69ee2 100644 --- a/tests/test_upload_integration.py +++ b/tests/test_upload_integration.py @@ -21,9 +21,9 @@ def sample_chroma_package(tmp_path): "metadatas": [ {"source": "test", "category": "overview", "file": "SKILL.md"}, {"source": "test", "category": "api", "file": "API.md"}, - {"source": "test", "category": "guide", "file": "GUIDE.md"} + {"source": "test", "category": "guide", "file": "GUIDE.md"}, ], - "ids": ["id1", "id2", "id3"] + "ids": ["id1", "id2", "id3"], } package_path = tmp_path / "test-chroma.json" @@ -43,8 +43,8 @@ def sample_weaviate_package(tmp_path): "properties": [ {"name": "content", "dataType": ["text"]}, {"name": "source", "dataType": ["string"]}, - {"name": "category", "dataType": ["string"]} - ] + {"name": "category", "dataType": ["string"]}, + ], }, "objects": [ { @@ -52,18 +52,14 @@ def sample_weaviate_package(tmp_path): "properties": { "content": "Test content 1", "source": "test", - "category": "overview" - } + "category": "overview", + }, }, { "id": "00000000-0000-0000-0000-000000000002", - "properties": { - "content": "Test content 2", - "source": "test", - "category": "api" - } - } - ] + "properties": {"content": "Test content 2", "source": "test", "category": "api"}, + }, + ], } package_path = tmp_path / "test-weaviate.json" @@ -76,40 +72,41 @@ class TestChromaUploadBasics: def test_chroma_adaptor_exists(self): """Test that ChromaDB adaptor can be loaded.""" - adaptor = get_adaptor('chroma') + adaptor = get_adaptor("chroma") assert adaptor is not None - assert adaptor.PLATFORM == 'chroma' + assert adaptor.PLATFORM == "chroma" def test_chroma_upload_without_chromadb_installed(self, sample_chroma_package): """Test upload fails gracefully without chromadb installed.""" - adaptor = get_adaptor('chroma') + adaptor = get_adaptor("chroma") # Temporarily remove chromadb if it exists import sys - chromadb_backup = sys.modules.get('chromadb') - if 'chromadb' in sys.modules: - del sys.modules['chromadb'] + + chromadb_backup = sys.modules.get("chromadb") + if "chromadb" in sys.modules: + del sys.modules["chromadb"] try: result = adaptor.upload(sample_chroma_package) - assert result['success'] is False - assert 'chromadb not installed' in result['message'] - assert 'pip install chromadb' in result['message'] + assert result["success"] is False + assert "chromadb not installed" in result["message"] + assert "pip install chromadb" in result["message"] finally: if chromadb_backup: - sys.modules['chromadb'] = chromadb_backup + sys.modules["chromadb"] = chromadb_backup def test_chroma_upload_api_signature(self, sample_chroma_package): """Test ChromaDB upload has correct API signature.""" - adaptor = get_adaptor('chroma') + adaptor = get_adaptor("chroma") # Verify upload method exists and accepts kwargs - assert hasattr(adaptor, 'upload') + assert hasattr(adaptor, "upload") assert callable(adaptor.upload) # Verify adaptor methods exist - assert hasattr(adaptor, '_generate_openai_embeddings') + assert hasattr(adaptor, "_generate_openai_embeddings") class TestWeaviateUploadBasics: @@ -117,40 +114,41 @@ class TestWeaviateUploadBasics: def test_weaviate_adaptor_exists(self): """Test that Weaviate adaptor can be loaded.""" - adaptor = get_adaptor('weaviate') + adaptor = get_adaptor("weaviate") assert adaptor is not None - assert adaptor.PLATFORM == 'weaviate' + assert adaptor.PLATFORM == "weaviate" def test_weaviate_upload_without_weaviate_installed(self, sample_weaviate_package): """Test upload fails gracefully without weaviate-client installed.""" - adaptor = get_adaptor('weaviate') + adaptor = get_adaptor("weaviate") # Temporarily remove weaviate if it exists import sys - weaviate_backup = sys.modules.get('weaviate') - if 'weaviate' in sys.modules: - del sys.modules['weaviate'] + + weaviate_backup = sys.modules.get("weaviate") + if "weaviate" in sys.modules: + del sys.modules["weaviate"] try: result = adaptor.upload(sample_weaviate_package) - assert result['success'] is False - assert 'weaviate-client not installed' in result['message'] - assert 'pip install weaviate-client' in result['message'] + assert result["success"] is False + assert "weaviate-client not installed" in result["message"] + assert "pip install weaviate-client" in result["message"] finally: if weaviate_backup: - sys.modules['weaviate'] = weaviate_backup + sys.modules["weaviate"] = weaviate_backup def test_weaviate_upload_api_signature(self, sample_weaviate_package): """Test Weaviate upload has correct API signature.""" - adaptor = get_adaptor('weaviate') + adaptor = get_adaptor("weaviate") # Verify upload method exists and accepts kwargs - assert hasattr(adaptor, 'upload') + assert hasattr(adaptor, "upload") assert callable(adaptor.upload) # Verify adaptor methods exist - assert hasattr(adaptor, '_generate_openai_embeddings') + assert hasattr(adaptor, "_generate_openai_embeddings") class TestPackageStructure: @@ -161,30 +159,30 @@ class TestPackageStructure: with open(sample_chroma_package) as f: data = json.load(f) - assert 'collection_name' in data - assert 'documents' in data - assert 'metadatas' in data - assert 'ids' in data - assert len(data['documents']) == len(data['metadatas']) == len(data['ids']) + assert "collection_name" in data + assert "documents" in data + assert "metadatas" in data + assert "ids" in data + assert len(data["documents"]) == len(data["metadatas"]) == len(data["ids"]) def test_weaviate_package_structure(self, sample_weaviate_package): """Test Weaviate package has required fields.""" with open(sample_weaviate_package) as f: data = json.load(f) - assert 'class_name' in data - assert 'schema' in data - assert 'objects' in data - assert len(data['objects']) == 2 + assert "class_name" in data + assert "schema" in data + assert "objects" in data + assert len(data["objects"]) == 2 # Verify schema structure - assert 'class' in data['schema'] - assert 'properties' in data['schema'] + assert "class" in data["schema"] + assert "properties" in data["schema"] # Verify object structure - for obj in data['objects']: - assert 'id' in obj - assert 'properties' in obj + for obj in data["objects"]: + assert "id" in obj + assert "properties" in obj class TestUploadCommandIntegration: @@ -199,25 +197,26 @@ class TestUploadCommandIntegration: # Verify it accepts kwargs for vector DBs import inspect + sig = inspect.signature(upload_skill_api) params = list(sig.parameters.keys()) - assert 'package_path' in params - assert 'target' in params - assert 'api_key' in params - assert 'kwargs' in params # For platform-specific options + assert "package_path" in params + assert "target" in params + assert "api_key" in params + assert "kwargs" in params # For platform-specific options def test_upload_command_supports_chroma(self): """Test upload command recognizes chroma as target.""" # This should not raise ValueError - adaptor = get_adaptor('chroma') + adaptor = get_adaptor("chroma") assert adaptor is not None def test_upload_command_supports_weaviate(self): """Test upload command recognizes weaviate as target.""" # This should not raise ValueError - adaptor = get_adaptor('weaviate') + adaptor = get_adaptor("weaviate") assert adaptor is not None @@ -226,7 +225,7 @@ class TestErrorHandling: def test_chroma_handles_missing_file(self, tmp_path): """Test ChromaDB upload handles missing files gracefully.""" - adaptor = get_adaptor('chroma') + adaptor = get_adaptor("chroma") missing_file = tmp_path / "nonexistent.json" @@ -234,14 +233,14 @@ class TestErrorHandling: try: result = adaptor.upload(missing_file) # If it returns a dict, it should indicate failure - assert result['success'] is False + assert result["success"] is False except FileNotFoundError: # This is also acceptable pass def test_weaviate_handles_missing_file(self, tmp_path): """Test Weaviate upload handles missing files gracefully.""" - adaptor = get_adaptor('weaviate') + adaptor = get_adaptor("weaviate") missing_file = tmp_path / "nonexistent.json" @@ -249,14 +248,14 @@ class TestErrorHandling: try: result = adaptor.upload(missing_file) # If it returns a dict, it should indicate failure - assert result['success'] is False + assert result["success"] is False except FileNotFoundError: # This is also acceptable pass def test_chroma_handles_invalid_json(self, tmp_path): """Test ChromaDB upload handles invalid JSON gracefully.""" - adaptor = get_adaptor('chroma') + adaptor = get_adaptor("chroma") invalid_file = tmp_path / "invalid.json" invalid_file.write_text("not valid json{") @@ -265,14 +264,14 @@ class TestErrorHandling: try: result = adaptor.upload(invalid_file) # If it returns a dict, it should indicate failure - assert result['success'] is False + assert result["success"] is False except json.JSONDecodeError: # This is also acceptable pass def test_weaviate_handles_invalid_json(self, tmp_path): """Test Weaviate upload handles invalid JSON gracefully.""" - adaptor = get_adaptor('weaviate') + adaptor = get_adaptor("weaviate") invalid_file = tmp_path / "invalid.json" invalid_file.write_text("not valid json{") @@ -281,7 +280,7 @@ class TestErrorHandling: try: result = adaptor.upload(invalid_file) # If it returns a dict, it should indicate failure - assert result['success'] is False + assert result["success"] is False except json.JSONDecodeError: # This is also acceptable pass diff --git a/tests/test_url_conversion.py b/tests/test_url_conversion.py index 3eb31bd..e37a540 100644 --- a/tests/test_url_conversion.py +++ b/tests/test_url_conversion.py @@ -155,13 +155,9 @@ class TestConvertToMdUrls(unittest.TestCase): # Should deduplicate to 3 unique base URLs self.assertEqual(len(result), 3) - self.assertIn( - "https://mikro-orm.io/docs/quick-start/index.html.md", result - ) + self.assertIn("https://mikro-orm.io/docs/quick-start/index.html.md", result) self.assertIn("https://mikro-orm.io/docs/propagation/index.html.md", result) - self.assertIn( - "https://mikro-orm.io/docs/defining-entities/index.html.md", result - ) + self.assertIn("https://mikro-orm.io/docs/defining-entities/index.html.md", result) # Should NOT contain any URLs with anchor fragments for url in result: