#!/usr/bin/env python3
"""
Performance Benchmarks for Platform Adaptors

Measures:
- format_skill_md() performance across all adaptors
- Complete package operation performance
- Scaling behavior with increasing reference count
- Output file sizes

Usage:
    # Run all benchmarks
    pytest tests/test_adaptor_benchmarks.py -v

    # Run with benchmark marker
    pytest tests/test_adaptor_benchmarks.py -v -m benchmark

    # Generate detailed output
    pytest tests/test_adaptor_benchmarks.py -v -s
"""

import json
import tempfile
import time
import unittest
from pathlib import Path

import pytest

from skill_seekers.cli.adaptors import get_adaptor
from skill_seekers.cli.adaptors.base import SkillMetadata


@pytest.mark.benchmark
class TestAdaptorBenchmarks(unittest.TestCase):
    """Performance benchmark suite for adaptors"""

    def setUp(self):
        """Set up test environment"""
        self.temp_dir = tempfile.TemporaryDirectory()
        self.output_dir = Path(self.temp_dir.name) / "output"
        self.output_dir.mkdir()

    def tearDown(self):
        """Clean up"""
        self.temp_dir.cleanup()

    def _create_skill_with_n_references(self, n: int, skill_name: str = "benchmark") -> Path:
        """
        Create a skill directory with N reference files.

        Args:
            n: Number of reference files to create
            skill_name: Name of the skill

        Returns:
            Path to skill directory
        """
        skill_dir = Path(self.temp_dir.name) / f"skill_{n}_refs"
        skill_dir.mkdir(exist_ok=True)

        # Create SKILL.md (~14 KB of filler text: 28 chars x 500 repetitions)
        skill_content = f"# {skill_name.title()} Skill\n\n" + "Lorem ipsum dolor sit amet. " * 500
        (skill_dir / "SKILL.md").write_text(skill_content)

        # Create N reference files (~13 KB each)
        refs_dir = skill_dir / "references"
        refs_dir.mkdir(exist_ok=True)
        for i in range(n):
            content = f"# Reference {i}\n\n" + f"Content for reference {i}. " * 500
            (refs_dir / f"ref_{i:03d}.md").write_text(content)

        return skill_dir

    def test_benchmark_format_skill_md_all_adaptors(self):
        """Benchmark format_skill_md across all adaptors"""
        print("\n" + "=" * 80)
        print("BENCHMARK: format_skill_md() - All Adaptors")
        print("=" * 80)

        # Create test skill (10 references)
        skill_dir = self._create_skill_with_n_references(10)
        metadata = SkillMetadata(name="benchmark", description="Benchmark test")

        # Platforms to benchmark
        platforms = [
            "claude", "gemini", "openai", "markdown",  # IDE integrations
            "langchain", "llama-index", "haystack",  # RAG frameworks
            "weaviate", "chroma", "faiss", "qdrant",  # Vector DBs
        ]

        results = {}

        for platform in platforms:
            adaptor = get_adaptor(platform)

            # Warm up (1 iteration)
            adaptor.format_skill_md(skill_dir, metadata)

            # Benchmark (5 iterations)
            times = []
            for _ in range(5):
                start = time.perf_counter()
                formatted = adaptor.format_skill_md(skill_dir, metadata)
                end = time.perf_counter()
                times.append(end - start)

            # Validate output
            self.assertIsInstance(formatted, str)
            self.assertGreater(len(formatted), 0)

            # Calculate statistics
            avg_time = sum(times) / len(times)
            min_time = min(times)
            max_time = max(times)

            results[platform] = {"avg": avg_time, "min": min_time, "max": max_time}

            print(
                f"{platform:15} - Avg: {avg_time * 1000:6.2f}ms | "
                f"Min: {min_time * 1000:6.2f}ms | Max: {max_time * 1000:6.2f}ms"
            )

        # Performance assertions (should complete in reasonable time)
        for platform, metrics in results.items():
            self.assertLess(
                metrics["avg"],
                0.5,  # Should average < 500ms
                f"{platform} format_skill_md too slow: {metrics['avg'] * 1000:.2f}ms",
            )
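
    # The manual perf_counter loops in this suite keep dependencies minimal.
    # A roughly equivalent measurement, as a sketch using only the standard
    # library's timeit module (not wired into the tests, shown for
    # illustration), would be:
    #
    #     import timeit
    #
    #     times = timeit.repeat(
    #         lambda: adaptor.format_skill_md(skill_dir, metadata),
    #         repeat=5,
    #         number=1,
    #     )
    #     avg_time = sum(times) / len(times)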

    def test_benchmark_package_operations(self):
        """Benchmark complete package operation"""
        print("\n" + "=" * 80)
        print("BENCHMARK: package() - Complete Operation")
        print("=" * 80)

        # Create test skill (10 references)
        skill_dir = self._create_skill_with_n_references(10)

        # Benchmark subset of platforms (representative sample)
        platforms = ["claude", "langchain", "chroma", "weaviate", "faiss"]

        results = {}

        for platform in platforms:
            adaptor = get_adaptor(platform)

            # Benchmark packaging
            start = time.perf_counter()
            package_path = adaptor.package(skill_dir, self.output_dir)
            end = time.perf_counter()

            elapsed = end - start

            # Get file size
            file_size_kb = package_path.stat().st_size / 1024

            results[platform] = {"time": elapsed, "size_kb": file_size_kb}

            print(f"{platform:15} - Time: {elapsed * 1000:7.2f}ms | Size: {file_size_kb:7.1f} KB")

            # Validate output
            self.assertTrue(package_path.exists())

        # Performance assertions
        for platform, metrics in results.items():
            self.assertLess(
                metrics["time"],
                1.0,  # Should complete < 1 second
                f"{platform} packaging too slow: {metrics['time'] * 1000:.2f}ms",
            )
            self.assertLess(
                metrics["size_kb"],
                1000,  # Should be < 1MB for 10 refs
                f"{platform} package too large: {metrics['size_kb']:.1f}KB",
            )

    def test_benchmark_scaling_with_reference_count(self):
        """Test how performance scales with reference count"""
        print("\n" + "=" * 80)
        print("BENCHMARK: Scaling with Reference Count")
        print("=" * 80)

        # Test with LangChain (representative RAG adaptor)
        adaptor = get_adaptor("langchain")
        metadata = SkillMetadata(name="scaling_test", description="Scaling benchmark test")

        reference_counts = [1, 5, 10, 25, 50]
        results = []

        print(f"\n{'Refs':>4} | {'Time (ms)':>10} | {'Time/Ref':>10} | {'Size (KB)':>10}")
        print("-" * 50)

        for ref_count in reference_counts:
            skill_dir = self._create_skill_with_n_references(ref_count)

            # Benchmark format_skill_md
            start = time.perf_counter()
            formatted = adaptor.format_skill_md(skill_dir, metadata)
            end = time.perf_counter()

            elapsed = end - start
            time_per_ref = elapsed / ref_count

            # Validate that the output parses as JSON, then measure its size
            json.loads(formatted)
            size_kb = len(formatted) / 1024

            results.append(
                {
                    "count": ref_count,
                    "time": elapsed,
                    "time_per_ref": time_per_ref,
                    "size_kb": size_kb,
                }
            )

            print(
                f"{ref_count:4} | {elapsed * 1000:10.2f} | {time_per_ref * 1000:10.3f} | {size_kb:10.1f}"
            )

        # Analyze scaling behavior
        # Time per ref should not increase significantly (linear scaling)
        first_per_ref = results[0]["time_per_ref"]
        last_per_ref = results[-1]["time_per_ref"]
        scaling_factor = last_per_ref / first_per_ref

        print(f"\nScaling Factor: {scaling_factor:.2f}x")
        print("(Time per ref at 50 refs / Time per ref at 1 ref)")

        # Assert linear or sub-linear scaling (not exponential)
        self.assertLess(scaling_factor, 3.0, f"Non-linear scaling detected: {scaling_factor:.2f}x")
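
    # A stricter scaling check could fit a line to the measurements instead
    # of comparing only the two endpoints. A sketch, assuming Python 3.10+
    # for statistics.linear_regression (not used by the test above):
    #
    #     from statistics import linear_regression
    #
    #     counts = [r["count"] for r in results]
    #     times = [r["time"] for r in results]
    #     slope, intercept = linear_regression(counts, times)
    #     # slope approximates the marginal cost per reference; a stable
    #     # slope with a small intercept indicates linear scaling.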

    def test_benchmark_json_vs_zip_size_comparison(self):
        """Compare output sizes: JSON vs ZIP/tar.gz"""
        print("\n" + "=" * 80)
        print("BENCHMARK: Output Size Comparison")
        print("=" * 80)

        # Create test skill (10 references)
        skill_dir = self._create_skill_with_n_references(10)

        # Package with different formats
        formats = {
            "claude": ("ZIP", ".zip"),
            "gemini": ("tar.gz", ".tar.gz"),
            "langchain": ("JSON", ".json"),
            "weaviate": ("JSON", ".json"),
        }

        results = {}

        print(f"\n{'Platform':15} | {'Format':8} | {'Size (KB)':>10}")
        print("-" * 50)

        for platform, (format_name, _ext) in formats.items():
            adaptor = get_adaptor(platform)
            package_path = adaptor.package(skill_dir, self.output_dir)

            size_kb = package_path.stat().st_size / 1024
            results[platform] = {"format": format_name, "size_kb": size_kb}

            print(f"{platform:15} | {format_name:8} | {size_kb:10.1f}")

        # Analyze results
        json_sizes = [v["size_kb"] for v in results.values() if v["format"] == "JSON"]
        compressed_sizes = [
            v["size_kb"] for v in results.values() if v["format"] in ["ZIP", "tar.gz"]
        ]

        if json_sizes and compressed_sizes:
            avg_json = sum(json_sizes) / len(json_sizes)
            avg_compressed = sum(compressed_sizes) / len(compressed_sizes)

            print(f"\nAverage JSON size: {avg_json:.1f} KB")
            print(f"Average compressed size: {avg_compressed:.1f} KB")
            print(f"Compression ratio: {avg_json / avg_compressed:.2f}x")

    def test_benchmark_metadata_overhead(self):
        """Measure metadata processing overhead"""
        print("\n" + "=" * 80)
        print("BENCHMARK: Metadata Processing Overhead")
        print("=" * 80)

        skill_dir = self._create_skill_with_n_references(10)

        # Minimal metadata
        minimal_meta = SkillMetadata(name="test", description="Test")

        # Rich metadata
        rich_meta = SkillMetadata(
            name="test",
            description="A comprehensive test skill for benchmarking purposes",
            version="2.5.0",
            author="Benchmark Suite",
            tags=["test", "benchmark", "performance", "validation", "quality"],
        )

        adaptor = get_adaptor("langchain")
        iterations = 20  # Enough iterations to average out CI timing noise

        # Warm-up run (filesystem caches, JIT, etc.)
        adaptor.format_skill_md(skill_dir, minimal_meta)
        adaptor.format_skill_md(skill_dir, rich_meta)

        # Benchmark with minimal metadata
        times_minimal = []
        for _ in range(iterations):
            start = time.perf_counter()
            adaptor.format_skill_md(skill_dir, minimal_meta)
            end = time.perf_counter()
            times_minimal.append(end - start)

        # Benchmark with rich metadata
        times_rich = []
        for _ in range(iterations):
            start = time.perf_counter()
            adaptor.format_skill_md(skill_dir, rich_meta)
            end = time.perf_counter()
            times_rich.append(end - start)

        # Use median instead of mean to reduce outlier impact
        times_minimal.sort()
        times_rich.sort()
        med_minimal = times_minimal[len(times_minimal) // 2]
        med_rich = times_rich[len(times_rich) // 2]

        overhead = med_rich - med_minimal
        overhead_pct = (overhead / med_minimal) * 100 if med_minimal > 0 else 0.0

        print(f"\nMinimal metadata (median): {med_minimal * 1000:.2f}ms")
        print(f"Rich metadata (median): {med_rich * 1000:.2f}ms")
        print(f"Overhead: {overhead * 1000:.2f}ms ({overhead_pct:.1f}%)")

        # Rich metadata should not cause catastrophic overhead.
        # On noisy CI machines, microsecond-level operations can show high
        # percentage variance, so we use a generous threshold.
        self.assertLess(overhead_pct, 200.0, f"Metadata overhead too high: {overhead_pct:.1f}%")

    def test_benchmark_empty_vs_full_skill(self):
        """Compare performance: empty skill vs full skill"""
        print("\n" + "=" * 80)
        print("BENCHMARK: Empty vs Full Skill")
        print("=" * 80)

        adaptor = get_adaptor("chroma")
        metadata = SkillMetadata(name="test", description="Test benchmark")

        # Empty skill
        empty_dir = Path(self.temp_dir.name) / "empty"
        empty_dir.mkdir()

        start = time.perf_counter()
        adaptor.format_skill_md(empty_dir, metadata)
        empty_time = time.perf_counter() - start

        # Full skill (50 references)
        full_dir = self._create_skill_with_n_references(50)

        start = time.perf_counter()
        adaptor.format_skill_md(full_dir, metadata)
        full_time = time.perf_counter() - start

        print(f"\nEmpty skill: {empty_time * 1000:.2f}ms")
        print(f"Full skill (50 refs): {full_time * 1000:.2f}ms")
        print(f"Ratio: {full_time / empty_time:.1f}x")

        # Empty should be very fast
        self.assertLess(empty_time, 0.01, "Empty skill processing too slow")

        # Full should scale reasonably
        self.assertLess(full_time, 0.5, "Full skill processing too slow")


if __name__ == "__main__":
    # Run benchmarks
    unittest.main(verbosity=2)
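
# For longitudinal regression tracking, these tests could be adapted to a
# dedicated plugin such as pytest-benchmark (an optional integration, not
# configured here), which can persist and compare results across runs, e.g.:
#
#     pytest tests/test_adaptor_benchmarks.py --benchmark-only --benchmark-json=benchmarks.json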