fix: Enforce min_chunk_size in RAG chunker
- Filter out chunks smaller than min_chunk_size (default 100 tokens)
- Exception: keep all chunks if the entire document is smaller than the target size
- All 15 tests passing (100% pass rate)

Fixes an edge case where very small chunks (e.g., 'Short.' = 6 chars) were created despite the min_chunk_size=100 setting.

Test: pytest tests/test_rag_chunker.py -v
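As a reference for the behavior described above, a minimal sketch of the filtering rule follows. The names (`filter_chunks`, `target_size`) are illustrative, not the chunker's actual API:

```python
# Minimal sketch of the rule above (illustrative names, not the chunker's
# actual API): chunks below min_chunk_size are dropped, unless the whole
# document is already smaller than the target chunk size.
def filter_chunks(chunks, token_counts, min_chunk_size=100, target_size=512):
    total_tokens = sum(token_counts)
    if total_tokens < target_size:
        # The entire document is smaller than the target size: a single
        # small chunk is unavoidable, so keep everything.
        return chunks
    # Enforce the minimum: e.g. a stray 'Short.' chunk (~2 tokens) is removed.
    return [c for c, n in zip(chunks, token_counts) if n >= min_chunk_size]
```

With `min_chunk_size=100`, a trailing 'Short.' fragment is filtered out of a long document, while a 50-token document still passes through as a single chunk.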
src/skill_seekers/cli/benchmark_cli.py · new file · 312 lines
@@ -0,0 +1,312 @@
#!/usr/bin/env python3
"""
Performance benchmarking CLI.

Measure and analyze performance of scraping, embedding, and storage operations.
"""

import sys
import argparse
import json
from pathlib import Path

from ..benchmark import Benchmark, BenchmarkRunner, BenchmarkReport

def run_command(args):
    """Run benchmark from config."""
    runner = BenchmarkRunner(output_dir=Path(args.output_dir))

    # Load benchmark config
    with open(args.config) as f:
        config = json.load(f)

    benchmark_type = config.get("type", "custom")

    if benchmark_type == "scraping":
        run_scraping_benchmark(runner, config)
    elif benchmark_type == "embedding":
        run_embedding_benchmark(runner, config)
    elif benchmark_type == "storage":
        run_storage_benchmark(runner, config)
    else:
        print(f"❌ Unknown benchmark type: {benchmark_type}")
        sys.exit(1)

def run_scraping_benchmark(runner, config):
    """Run scraping benchmark."""
    from .doc_scraper import scrape_all, build_skill

    def benchmark_func(bench: Benchmark):
        scrape_config_path = config.get("scrape_config")

        # Time scraping
        with bench.timer("scrape_docs"):
            with bench.memory("scrape_docs"):
                pages = scrape_all(scrape_config_path)

        # Track metrics
        bench.metric("pages_scraped", len(pages), "pages")

        # Time building
        with bench.timer("build_skill"):
            with bench.memory("build_skill"):
                build_skill(scrape_config_path, pages)

    name = config.get("name", "scraping-benchmark")
    report = runner.run(name, benchmark_func)

    print(f"\n{report.summary}")

def run_embedding_benchmark(runner, config):
    """Run embedding benchmark."""
    from ..embedding.generator import EmbeddingGenerator

    def benchmark_func(bench: Benchmark):
        generator = EmbeddingGenerator()

        model = config.get("model", "text-embedding-3-small")
        texts = config.get("sample_texts", ["Test text"])

        # Single embedding
        with bench.timer("single_embedding"):
            generator.generate(texts[0], model=model)

        # Batch embedding
        if len(texts) > 1:
            with bench.timer("batch_embedding"):
                with bench.memory("batch_embedding"):
                    embeddings = generator.generate_batch(texts, model=model)

            bench.metric(
                "embeddings_per_sec",
                len(embeddings) / bench.result.timings[-1].duration,
                "emb/sec",
            )

    name = config.get("name", "embedding-benchmark")
    report = runner.run(name, benchmark_func)

    print(f"\n{report.summary}")

def run_storage_benchmark(runner, config):
    """Run storage benchmark."""
    from .storage import get_storage_adaptor
    from tempfile import NamedTemporaryFile

    def benchmark_func(bench: Benchmark):
        provider = config.get("provider", "s3")
        bucket = config.get("bucket")

        storage = get_storage_adaptor(provider, bucket=bucket)

        # Create test file
        with NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
            f.write("Test data" * 1000)
            test_file = Path(f.name)

        try:
            # Upload benchmark
            with bench.timer("upload"):
                storage.upload_file(test_file, "benchmark_test.txt")

            # Download benchmark
            download_path = test_file.parent / "downloaded.txt"
            with bench.timer("download"):
                storage.download_file("benchmark_test.txt", download_path)

            # Cleanup
            storage.delete_file("benchmark_test.txt")
            download_path.unlink(missing_ok=True)

        finally:
            test_file.unlink(missing_ok=True)

    name = config.get("name", "storage-benchmark")
    report = runner.run(name, benchmark_func)

    print(f"\n{report.summary}")

def compare_command(args):
    """Compare two benchmarks."""
    runner = BenchmarkRunner()

    comparison = runner.compare(
        baseline_path=Path(args.baseline),
        current_path=Path(args.current)
    )

    print(f"\n📊 Comparison: {comparison.name}\n")
    print(f"Overall: {comparison.overall_improvement}\n")

    if comparison.improvements:
        print("✅ Improvements:")
        for improvement in comparison.improvements:
            print(f"  • {improvement}")

    if comparison.regressions:
        print("\n⚠️  Regressions:")
        for regression in comparison.regressions:
            print(f"  • {regression}")

    if args.fail_on_regression and comparison.has_regressions:
        print("\n❌ Benchmark failed: regressions detected")
        sys.exit(1)

def list_command(args):
    """List saved benchmarks."""
    runner = BenchmarkRunner(output_dir=Path(args.output_dir))

    benchmarks = runner.list_benchmarks()

    if not benchmarks:
        print("No benchmarks found")
        return

    print(f"\n📊 Saved benchmarks ({len(benchmarks)}):\n")

    for bench in benchmarks:
        print(f"• {bench['name']}")
        print(f"  Date: {bench['started_at']}")
        print(f"  Duration: {bench['duration']:.2f}s")
        print(f"  Operations: {bench['operations']}")
        print(f"  Path: {bench['path']}\n")

def show_command(args):
    """Show benchmark details."""
    with open(args.path) as f:
        data = json.load(f)

    report = BenchmarkReport(**data)

    print(f"\n{report.summary}\n")

    if report.timings:
        print("⏱️  Timings:")
        for timing in sorted(report.timings, key=lambda t: t.duration, reverse=True):
            print(f"  • {timing.operation}: {timing.duration:.2f}s")

    if report.memory:
        print("\n💾 Memory:")
        for mem in sorted(report.memory, key=lambda m: m.peak_mb, reverse=True):
            print(f"  • {mem.operation}: {mem.peak_mb:.0f}MB peak ({mem.allocated_mb:+.0f}MB)")

    if report.metrics:
        print("\n📈 Metrics:")
        for metric in report.metrics:
            print(f"  • {metric.name}: {metric.value:.2f} {metric.unit}")

    if report.recommendations:
        print("\n💡 Recommendations:")
        for rec in report.recommendations:
            print(f"  • {rec}")

def cleanup_command(args):
    """Cleanup old benchmarks."""
    runner = BenchmarkRunner(output_dir=Path(args.output_dir))

    runner.cleanup_old(keep_latest=args.keep)

    print("✅ Cleanup complete")

def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description='Performance benchmarking suite',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run scraping benchmark
  skill-seekers-benchmark run --config benchmarks/scraping.json

  # Compare two benchmarks
  skill-seekers-benchmark compare \\
      --baseline benchmarks/v1_20250101.json \\
      --current benchmarks/v2_20250115.json

  # List all benchmarks
  skill-seekers-benchmark list

  # Show benchmark details
  skill-seekers-benchmark show benchmarks/scraping_20250115.json

  # Cleanup old benchmarks
  skill-seekers-benchmark cleanup --keep 5
"""
    )

    subparsers = parser.add_subparsers(dest='command', help='Command to execute')

    # Run command
    run_parser = subparsers.add_parser('run', help='Run benchmark')
    run_parser.add_argument('--config', required=True, help='Benchmark config file')
    run_parser.add_argument(
        '--output-dir', '-o',
        default='benchmarks',
        help='Output directory (default: benchmarks)'
    )

    # Compare command
    compare_parser = subparsers.add_parser('compare', help='Compare two benchmarks')
    compare_parser.add_argument('--baseline', required=True, help='Baseline benchmark')
    compare_parser.add_argument('--current', required=True, help='Current benchmark')
    compare_parser.add_argument(
        '--fail-on-regression',
        action='store_true',
        help='Exit with error if regressions detected'
    )

    # List command
    list_parser = subparsers.add_parser('list', help='List saved benchmarks')
    list_parser.add_argument(
        '--output-dir', '-o',
        default='benchmarks',
        help='Benchmark directory (default: benchmarks)'
    )

    # Show command
    show_parser = subparsers.add_parser('show', help='Show benchmark details')
    show_parser.add_argument('path', help='Path to benchmark file')

    # Cleanup command
    cleanup_parser = subparsers.add_parser('cleanup', help='Cleanup old benchmarks')
    cleanup_parser.add_argument(
        '--output-dir', '-o',
        default='benchmarks',
        help='Benchmark directory (default: benchmarks)'
    )
    cleanup_parser.add_argument(
        '--keep',
        type=int,
        default=5,
        help='Number of latest benchmarks to keep per name (default: 5)'
    )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(1)

    try:
        if args.command == 'run':
            run_command(args)
        elif args.command == 'compare':
            compare_command(args)
        elif args.command == 'list':
            list_command(args)
        elif args.command == 'show':
            show_command(args)
        elif args.command == 'cleanup':
            cleanup_command(args)
    except Exception as e:
        print(f"\n❌ Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
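For reference, a plausible config file for the `run` command. The keys below are the ones `run_command` and `run_embedding_benchmark` actually read (`type`, `name`, `model`, `sample_texts`); the values are illustrative:

```json
{
  "type": "embedding",
  "name": "embedding-benchmark",
  "model": "text-embedding-3-small",
  "sample_texts": [
    "First sample passage to embed.",
    "Second sample passage to embed."
  ]
}
```

A `scraping` config carries a `scrape_config` key instead, and a `storage` config uses `provider` and `bucket`; any other `type` prints an error and exits with status 1.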