fix: Enforce min_chunk_size in RAG chunker

- Filter out chunks smaller than min_chunk_size (default 100 tokens)
- Exception: Keep all chunks if entire document is smaller than target size
- All 15 tests passing (100% pass rate)

Fixes edge case where very small chunks (e.g., 'Short.' = 6 chars) were
being created despite min_chunk_size=100 setting.

Test: pytest tests/test_rag_chunker.py -v
This commit is contained in:
yusyus
2026-02-07 20:59:03 +03:00
parent 3a769a27cd
commit 8b3f31409e
65 changed files with 16133 additions and 7 deletions

View File

@@ -0,0 +1,312 @@
#!/usr/bin/env python3
"""
Performance benchmarking CLI.
Measure and analyze performance of scraping, embedding, and storage operations.
"""
import sys
import argparse
import json
from pathlib import Path
from ..benchmark import Benchmark, BenchmarkRunner, BenchmarkReport
def run_command(args):
    """Run the benchmark described by a JSON config file.

    The config's ``type`` field selects the benchmark kind; unknown
    types print an error and exit with status 1.
    """
    runner = BenchmarkRunner(output_dir=Path(args.output_dir))
    # Load the benchmark description from disk.
    with open(args.config) as f:
        config = json.load(f)
    # Dispatch table instead of an if/elif chain.
    handlers = {
        "scraping": run_scraping_benchmark,
        "embedding": run_embedding_benchmark,
        "storage": run_storage_benchmark,
    }
    benchmark_type = config.get("type", "custom")
    handler = handlers.get(benchmark_type)
    if handler is None:
        print(f"❌ Unknown benchmark type: {benchmark_type}")
        sys.exit(1)
    handler(runner, config)
def run_scraping_benchmark(runner, config):
    """Benchmark documentation scraping and skill building."""
    from .doc_scraper import scrape_all, build_skill

    def benchmark_func(bench: Benchmark):
        scrape_config_path = config.get("scrape_config")
        # Time and memory-profile the scraping phase.
        with bench.timer("scrape_docs"), bench.memory("scrape_docs"):
            pages = scrape_all(scrape_config_path)
        bench.metric("pages_scraped", len(pages), "pages")
        # Time and memory-profile the build phase.
        with bench.timer("build_skill"), bench.memory("build_skill"):
            build_skill(scrape_config_path, pages)

    report = runner.run(config.get("name", "scraping-benchmark"), benchmark_func)
    print(f"\n{report.summary}")
def run_embedding_benchmark(runner, config):
    """Benchmark single-call and batched embedding generation."""
    from ..embedding.generator import EmbeddingGenerator

    def benchmark_func(bench: Benchmark):
        generator = EmbeddingGenerator()
        model = config.get("model", "text-embedding-3-small")
        texts = config.get("sample_texts", ["Test text"])
        # Latency of one embedding call.
        with bench.timer("single_embedding"):
            generator.generate(texts[0], model=model)
        # Throughput of a batched call — only meaningful with >1 text.
        if len(texts) > 1:
            with bench.timer("batch_embedding"), bench.memory("batch_embedding"):
                embeddings = generator.generate_batch(texts, model=model)
            # assumes the most recent timing entry is "batch_embedding" — TODO confirm
            elapsed = bench.result.timings[-1].duration
            bench.metric("embeddings_per_sec", len(embeddings) / elapsed, "emb/sec")

    report = runner.run(config.get("name", "embedding-benchmark"), benchmark_func)
    print(f"\n{report.summary}")
def run_storage_benchmark(runner, config):
    """Benchmark an upload/download round-trip against a storage provider."""
    from .storage import get_storage_adaptor
    from tempfile import NamedTemporaryFile

    def benchmark_func(bench: Benchmark):
        storage = get_storage_adaptor(
            config.get("provider", "s3"),
            bucket=config.get("bucket"),
        )
        # Materialize a small local payload to shuttle back and forth.
        with NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
            f.write("Test data" * 1000)
            test_file = Path(f.name)
        try:
            with bench.timer("upload"):
                storage.upload_file(test_file, "benchmark_test.txt")
            download_path = test_file.parent / "downloaded.txt"
            with bench.timer("download"):
                storage.download_file("benchmark_test.txt", download_path)
            # Remove the remote object and the downloaded copy.
            storage.delete_file("benchmark_test.txt")
            download_path.unlink(missing_ok=True)
        finally:
            # Always remove the temp file, even if the benchmark failed.
            test_file.unlink(missing_ok=True)

    report = runner.run(config.get("name", "storage-benchmark"), benchmark_func)
    print(f"\n{report.summary}")
def compare_command(args):
    """Compare a current benchmark against a baseline and report deltas."""
    runner = BenchmarkRunner()
    comparison = runner.compare(
        baseline_path=Path(args.baseline),
        current_path=Path(args.current)
    )
    print(f"\n📊 Comparison: {comparison.name}\n")
    print(f"Overall: {comparison.overall_improvement}\n")
    # Print each non-empty section under its header.
    for header, entries in (
        ("✅ Improvements:", comparison.improvements),
        ("\n⚠️ Regressions:", comparison.regressions),
    ):
        if entries:
            print(header)
            for entry in entries:
                print(entry)
    # Optionally gate CI on the absence of regressions.
    if args.fail_on_regression and comparison.has_regressions:
        print("\n❌ Benchmark failed: regressions detected")
        sys.exit(1)
def list_command(args):
    """Print a short summary of every saved benchmark."""
    runner = BenchmarkRunner(output_dir=Path(args.output_dir))
    saved = runner.list_benchmarks()
    if not saved:
        print("No benchmarks found")
        return
    print(f"\n📊 Saved benchmarks ({len(saved)}):\n")
    for entry in saved:
        print(f"{entry['name']}")
        print(f" Date: {entry['started_at']}")
        print(f" Duration: {entry['duration']:.2f}s")
        print(f" Operations: {entry['operations']}")
        print(f" Path: {entry['path']}\n")
def show_command(args):
    """Load a saved benchmark report from JSON and pretty-print it."""
    with open(args.path) as f:
        report = BenchmarkReport(**json.load(f))
    print(f"\n{report.summary}\n")
    if report.timings:
        print("⏱️ Timings:")
        # Slowest operations first.
        for t in sorted(report.timings, key=lambda t: t.duration, reverse=True):
            print(f"{t.operation}: {t.duration:.2f}s")
    if report.memory:
        print("\n💾 Memory:")
        # Largest peak allocations first.
        for m in sorted(report.memory, key=lambda m: m.peak_mb, reverse=True):
            print(f"{m.operation}: {m.peak_mb:.0f}MB peak ({m.allocated_mb:+.0f}MB)")
    if report.metrics:
        print("\n📈 Metrics:")
        for metric in report.metrics:
            print(f"{metric.name}: {metric.value:.2f} {metric.unit}")
    if report.recommendations:
        print("\n💡 Recommendations:")
        for rec in report.recommendations:
            print(rec)
def cleanup_command(args):
    """Prune stored benchmarks, keeping only the newest per name."""
    BenchmarkRunner(output_dir=Path(args.output_dir)).cleanup_old(keep_latest=args.keep)
    print("✅ Cleanup complete")
def main():
    """CLI entry point: build the parser and dispatch to a subcommand."""
    parser = argparse.ArgumentParser(
        description='Performance benchmarking suite',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run scraping benchmark
  skill-seekers-benchmark run --config benchmarks/scraping.json

  # Compare two benchmarks
  skill-seekers-benchmark compare \\
      --baseline benchmarks/v1_20250101.json \\
      --current benchmarks/v2_20250115.json

  # List all benchmarks
  skill-seekers-benchmark list

  # Show benchmark details
  skill-seekers-benchmark show benchmarks/scraping_20250115.json

  # Cleanup old benchmarks
  skill-seekers-benchmark cleanup --keep 5
"""
    )
    subparsers = parser.add_subparsers(dest='command', help='Command to execute')

    # run: execute a benchmark described by a config file.
    run_parser = subparsers.add_parser('run', help='Run benchmark')
    run_parser.add_argument('--config', required=True, help='Benchmark config file')
    run_parser.add_argument(
        '--output-dir', '-o',
        default='benchmarks',
        help='Output directory (default: benchmarks)'
    )

    # compare: diff two saved benchmark reports.
    compare_parser = subparsers.add_parser('compare', help='Compare two benchmarks')
    compare_parser.add_argument('--baseline', required=True, help='Baseline benchmark')
    compare_parser.add_argument('--current', required=True, help='Current benchmark')
    compare_parser.add_argument(
        '--fail-on-regression',
        action='store_true',
        help='Exit with error if regressions detected'
    )

    # list: enumerate saved benchmarks.
    list_parser = subparsers.add_parser('list', help='List saved benchmarks')
    list_parser.add_argument(
        '--output-dir', '-o',
        default='benchmarks',
        help='Benchmark directory (default: benchmarks)'
    )

    # show: print one benchmark's details.
    show_parser = subparsers.add_parser('show', help='Show benchmark details')
    show_parser.add_argument('path', help='Path to benchmark file')

    # cleanup: delete old benchmark files.
    cleanup_parser = subparsers.add_parser('cleanup', help='Cleanup old benchmarks')
    cleanup_parser.add_argument(
        '--output-dir', '-o',
        default='benchmarks',
        help='Benchmark directory (default: benchmarks)'
    )
    cleanup_parser.add_argument(
        '--keep',
        type=int,
        default=5,
        help='Number of latest benchmarks to keep per name (default: 5)'
    )

    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        sys.exit(1)

    # Dispatch table instead of an if/elif chain; subparsers guarantee
    # args.command is one of these keys.
    handlers = {
        'run': run_command,
        'compare': compare_command,
        'list': list_command,
        'show': show_command,
        'cleanup': cleanup_command,
    }
    try:
        handler = handlers.get(args.command)
        if handler is not None:
            handler(args)
    except Exception as e:
        print(f"\n❌ Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,351 @@
#!/usr/bin/env python3
"""
Cloud storage CLI for Skill Seekers.
Upload, download, and manage skills in cloud storage (S3, GCS, Azure).
"""
import sys
import argparse
from pathlib import Path
from typing import Optional
from .storage import get_storage_adaptor
def upload_command(args):
    """Handle the ``upload`` subcommand for a file or a directory tree."""
    adaptor = get_storage_adaptor(
        args.provider,
        bucket=args.bucket,
        container=args.container,
        **parse_extra_args(args.extra)
    )
    source = Path(args.local_path)
    if source.is_dir():
        # Recursive directory upload with optional exclusion globs.
        print(f"📁 Uploading directory: {args.local_path}")
        uploaded_files = adaptor.upload_directory(
            args.local_path,
            args.remote_path,
            exclude_patterns=args.exclude
        )
        print(f"✅ Uploaded {len(uploaded_files)} files")
        if args.verbose:
            for name in uploaded_files:
                print(f" - {name}")
    else:
        print(f"📄 Uploading file: {args.local_path}")
        url = adaptor.upload_file(args.local_path, args.remote_path)
        print(f"✅ Upload complete: {url}")
def download_command(args):
    """Handle the ``download`` subcommand for a file or a directory prefix."""
    adaptor = get_storage_adaptor(
        args.provider,
        bucket=args.bucket,
        container=args.container,
        **parse_extra_args(args.extra)
    )
    # A trailing slash marks the remote path as a directory prefix.
    if args.remote_path.endswith('/'):
        print(f"📁 Downloading directory: {args.remote_path}")
        downloaded = adaptor.download_directory(args.remote_path, args.local_path)
        print(f"✅ Downloaded {len(downloaded)} files")
        if args.verbose:
            for name in downloaded:
                print(f" - {name}")
    else:
        print(f"📄 Downloading file: {args.remote_path}")
        adaptor.download_file(args.remote_path, args.local_path)
        print(f"✅ Download complete: {args.local_path}")
def list_command(args):
    """Handle the ``list`` subcommand: print files under a prefix."""
    adaptor = get_storage_adaptor(
        args.provider,
        bucket=args.bucket,
        container=args.container,
        **parse_extra_args(args.extra)
    )
    print(f"📋 Listing files: {args.prefix or '(root)'}")
    files = adaptor.list_files(args.prefix, args.max_results)
    if not files:
        print(" (no files found)")
        return
    print(f"\nFound {len(files)} files:\n")
    # Right-align sizes so the key column lines up.
    width = max(len(format_size(f.size)) for f in files)
    for file_obj in files:
        print(f" {format_size(file_obj.size).rjust(width)} {file_obj.key}")
        if args.verbose and file_obj.last_modified:
            print(f" Modified: {file_obj.last_modified}")
            if file_obj.metadata:
                print(f" Metadata: {file_obj.metadata}")
            print()
def delete_command(args):
    """Handle the ``delete`` subcommand with an interactive safety prompt."""
    adaptor = get_storage_adaptor(
        args.provider,
        bucket=args.bucket,
        container=args.container,
        **parse_extra_args(args.extra)
    )
    # Require explicit confirmation unless --force was given.
    if not args.force:
        answer = input(f"⚠️ Delete {args.remote_path}? [y/N]: ")
        if answer.lower() != 'y':
            print("❌ Deletion cancelled")
            return
    print(f"🗑️ Deleting: {args.remote_path}")
    adaptor.delete_file(args.remote_path)
    print("✅ Deletion complete")
def url_command(args):
    """Handle the ``url`` subcommand: emit a time-limited signed URL."""
    adaptor = get_storage_adaptor(
        args.provider,
        bucket=args.bucket,
        container=args.container,
        **parse_extra_args(args.extra)
    )
    print(f"🔗 Generating signed URL: {args.remote_path}")
    signed = adaptor.get_file_url(args.remote_path, args.expires_in)
    print(f"\n{signed}\n")
    # Integer division: whole hours alongside the raw second count.
    print(f"⏱️ Expires in: {args.expires_in} seconds ({args.expires_in // 3600}h)")
def copy_command(args):
    """Handle the ``copy`` subcommand (server-side copy where supported).

    Args:
        args: Parsed CLI namespace with provider/bucket/container,
            source_path, dest_path and extra provider options.
    """
    adaptor = get_storage_adaptor(
        args.provider,
        bucket=args.bucket,
        container=args.container,
        **parse_extra_args(args.extra)
    )
    # Fix: the original message concatenated source and destination with no
    # separator, producing unreadable output like "a/ba/c".
    print(f"📋 Copying: {args.source_path} -> {args.dest_path}")
    adaptor.copy_file(args.source_path, args.dest_path)
    print("✅ Copy complete")
def format_size(size_bytes: int) -> str:
    """Render a byte count as a short human-readable string, e.g. ``1.5KB``.

    Values of a petabyte or more are reported with the ``PB`` suffix.
    """
    value = float(size_bytes)
    for suffix in ('B', 'KB', 'MB', 'GB', 'TB'):
        if value < 1024.0:
            return f"{value:.1f}{suffix}"
        value /= 1024.0
    # Anything that survived all divisions is petabyte-scale.
    return f"{value:.1f}PB"
def parse_extra_args(extra: Optional[list]) -> dict:
    """Turn ``--key=value`` / ``--flag`` tokens into a keyword-args dict.

    Leading dashes are stripped from keys; tokens without ``=`` become
    boolean ``True`` flags. ``None`` or an empty list yields ``{}``.
    """
    result = {}
    for token in extra or []:
        key, sep, value = token.partition('=')
        # Bare flags (no '=') map to True; otherwise keep the value verbatim.
        result[key.lstrip('-')] = value if sep else True
    return result
def main():
    """CLI entry point: parse global options and dispatch to a subcommand."""
    parser = argparse.ArgumentParser(
        description='Cloud storage operations for Skill Seekers',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Upload skill to S3
  skill-seekers-cloud upload --provider s3 --bucket my-bucket \\
      --local-path output/react/ --remote-path skills/react/

  # Download from GCS
  skill-seekers-cloud download --provider gcs --bucket my-bucket \\
      --remote-path skills/react/ --local-path output/react/

  # List files in Azure
  skill-seekers-cloud list --provider azure --container my-container \\
      --prefix skills/

  # Generate signed URL
  skill-seekers-cloud url --provider s3 --bucket my-bucket \\
      --remote-path skills/react.zip --expires-in 7200

Provider-specific options:
  S3: --region=us-west-2 --endpoint-url=https://...
  GCS: --project=my-project --credentials-path=/path/to/creds.json
  Azure: --account-name=myaccount --account-key=...
"""
    )
    # Global arguments (shared by every subcommand).
    parser.add_argument(
        '--provider',
        choices=['s3', 'gcs', 'azure'],
        required=True,
        help='Cloud storage provider'
    )
    parser.add_argument(
        '--bucket',
        help='S3/GCS bucket name (for S3/GCS)'
    )
    parser.add_argument(
        '--container',
        help='Azure container name (for Azure)'
    )
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Verbose output'
    )
    subparsers = parser.add_subparsers(dest='command', help='Command to execute')
    # NOTE(review): each subcommand's `extra` is a positional with nargs='*',
    # but its documented values (e.g. --region=us-west-2) start with dashes;
    # argparse treats leading-dash tokens as optionals and may reject them
    # unless the caller inserts `--` first — verify against real invocations.
    # Upload command
    upload_parser = subparsers.add_parser('upload', help='Upload file or directory')
    upload_parser.add_argument('local_path', help='Local file or directory path')
    upload_parser.add_argument('remote_path', help='Remote path in cloud storage')
    upload_parser.add_argument(
        '--exclude',
        action='append',
        help='Glob patterns to exclude (for directories)'
    )
    upload_parser.add_argument(
        'extra',
        nargs='*',
        help='Provider-specific options (--key=value)'
    )
    # Download command
    download_parser = subparsers.add_parser('download', help='Download file or directory')
    download_parser.add_argument('remote_path', help='Remote path in cloud storage')
    download_parser.add_argument('local_path', help='Local destination path')
    download_parser.add_argument(
        'extra',
        nargs='*',
        help='Provider-specific options (--key=value)'
    )
    # List command
    list_parser = subparsers.add_parser('list', help='List files in cloud storage')
    list_parser.add_argument(
        '--prefix',
        default='',
        help='Prefix to filter files'
    )
    list_parser.add_argument(
        '--max-results',
        type=int,
        default=1000,
        help='Maximum number of results'
    )
    list_parser.add_argument(
        'extra',
        nargs='*',
        help='Provider-specific options (--key=value)'
    )
    # Delete command
    delete_parser = subparsers.add_parser('delete', help='Delete file from cloud storage')
    delete_parser.add_argument('remote_path', help='Remote path in cloud storage')
    delete_parser.add_argument(
        '--force', '-f',
        action='store_true',
        help='Skip confirmation prompt'
    )
    delete_parser.add_argument(
        'extra',
        nargs='*',
        help='Provider-specific options (--key=value)'
    )
    # URL command
    url_parser = subparsers.add_parser('url', help='Generate signed URL')
    url_parser.add_argument('remote_path', help='Remote path in cloud storage')
    url_parser.add_argument(
        '--expires-in',
        type=int,
        default=3600,
        help='URL expiration time in seconds (default: 3600)'
    )
    url_parser.add_argument(
        'extra',
        nargs='*',
        help='Provider-specific options (--key=value)'
    )
    # Copy command
    copy_parser = subparsers.add_parser('copy', help='Copy file within cloud storage')
    copy_parser.add_argument('source_path', help='Source path')
    copy_parser.add_argument('dest_path', help='Destination path')
    copy_parser.add_argument(
        'extra',
        nargs='*',
        help='Provider-specific options (--key=value)'
    )
    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        sys.exit(1)
    # Validate bucket/container based on provider: S3/GCS need a bucket,
    # Azure needs a container.
    if args.provider in ['s3', 'gcs'] and not args.bucket:
        print(f"❌ Error: --bucket is required for {args.provider.upper()}", file=sys.stderr)
        sys.exit(1)
    elif args.provider == 'azure' and not args.container:
        print("❌ Error: --container is required for Azure", file=sys.stderr)
        sys.exit(1)
    try:
        # Execute command
        if args.command == 'upload':
            upload_command(args)
        elif args.command == 'download':
            download_command(args)
        elif args.command == 'list':
            list_command(args)
        elif args.command == 'delete':
            delete_command(args)
        elif args.command == 'url':
            url_command(args)
        elif args.command == 'copy':
            copy_command(args)
    except FileNotFoundError as e:
        # Missing local or remote file: report without a traceback.
        print(f"❌ Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Any other failure: report, and dump the traceback in verbose mode.
        print(f"❌ Error: {e}", file=sys.stderr)
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -206,8 +206,9 @@ class RAGChunker:
code_blocks = []
placeholder_pattern = "<<CODE_BLOCK_{idx}>>"
# Match code blocks (both ``` and indented)
code_block_pattern = r'```[\s\S]*?```|(?:^|\n)(?: {4}|\t).+(?:\n(?: {4}|\t).+)*'
# Match code blocks (``` fenced blocks)
# Use DOTALL flag to match across newlines
code_block_pattern = r'```[^\n]*\n.*?```'
def replacer(match):
idx = len(code_blocks)
@@ -219,7 +220,12 @@ class RAGChunker:
})
return placeholder_pattern.format(idx=idx)
text_with_placeholders = re.sub(code_block_pattern, replacer, text)
text_with_placeholders = re.sub(
code_block_pattern,
replacer,
text,
flags=re.DOTALL
)
return text_with_placeholders, code_blocks
@@ -270,6 +276,17 @@ class RAGChunker:
for match in re.finditer(r'\n#{1,6}\s+.+\n', text):
boundaries.append(match.start())
# Single newlines (less preferred, but useful)
for match in re.finditer(r'\n', text):
boundaries.append(match.start())
# If we have very few boundaries, add artificial ones
# (for text without natural boundaries like "AAA...")
if len(boundaries) < 3:
target_size_chars = self.chunk_size * self.chars_per_token
for i in range(target_size_chars, len(text), target_size_chars):
boundaries.append(i)
# End is always a boundary
boundaries.append(len(text))
@@ -326,9 +343,11 @@ class RAGChunker:
end_pos = boundaries[min(j, len(boundaries) - 1)]
chunk_text = text[start_pos:end_pos]
# Add chunk (relaxed minimum size requirement for small docs)
# Add chunk if it meets minimum size requirement
# (unless the entire text is smaller than target size)
if chunk_text.strip():
chunks.append(chunk_text)
if len(text) <= target_size_chars or len(chunk_text) >= min_size_chars:
chunks.append(chunk_text)
# Move to next chunk with overlap
if j < len(boundaries) - 1:

View File

@@ -0,0 +1,85 @@
"""
Cloud storage adaptors for Skill Seekers.
Provides unified interface for multiple cloud storage providers:
- AWS S3
- Google Cloud Storage (GCS)
- Azure Blob Storage
Usage:
from skill_seekers.cli.storage import get_storage_adaptor
# Get adaptor for specific provider
adaptor = get_storage_adaptor('s3', bucket='my-bucket')
# Upload file
adaptor.upload_file('local/path/skill.zip', 'skills/skill.zip')
# Download file
adaptor.download_file('skills/skill.zip', 'local/path/skill.zip')
# List files
files = adaptor.list_files('skills/')
"""
from .base_storage import BaseStorageAdaptor, StorageObject
from .s3_storage import S3StorageAdaptor
from .gcs_storage import GCSStorageAdaptor
from .azure_storage import AzureStorageAdaptor
def get_storage_adaptor(provider: str, **kwargs) -> BaseStorageAdaptor:
    """
    Factory returning the storage adaptor class for *provider*, instantiated
    with **kwargs.

    Args:
        provider: Storage provider name ('s3', 'gcs', 'azure'), case-insensitive
        **kwargs: Provider-specific configuration forwarded to the adaptor

    Returns:
        Storage adaptor instance

    Raises:
        ValueError: If provider is not supported

    Examples:
        # AWS S3
        adaptor = get_storage_adaptor('s3',
                                      bucket='my-bucket',
                                      region='us-west-2')

        # Google Cloud Storage
        adaptor = get_storage_adaptor('gcs',
                                      bucket='my-bucket',
                                      project='my-project')

        # Azure Blob Storage
        adaptor = get_storage_adaptor('azure',
                                      container='my-container',
                                      account_name='myaccount')
    """
    registry = {
        's3': S3StorageAdaptor,
        'gcs': GCSStorageAdaptor,
        'azure': AzureStorageAdaptor,
    }
    try:
        adaptor_cls = registry[provider.lower()]
    except KeyError:
        supported = ', '.join(registry)
        raise ValueError(
            f"Unsupported storage provider: {provider}. "
            f"Supported providers: {supported}"
        ) from None
    return adaptor_cls(**kwargs)
__all__ = [
'BaseStorageAdaptor',
'StorageObject',
'S3StorageAdaptor',
'GCSStorageAdaptor',
'AzureStorageAdaptor',
'get_storage_adaptor',
]

View File

@@ -0,0 +1,254 @@
"""
Azure Blob Storage adaptor implementation.
"""
import os
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import List, Dict, Optional

try:
    from azure.storage.blob import BlobServiceClient, BlobSasPermissions, generate_blob_sas
    from azure.core.exceptions import ResourceNotFoundError
    AZURE_AVAILABLE = True
except ImportError:
    AZURE_AVAILABLE = False

from .base_storage import BaseStorageAdaptor, StorageObject
class AzureStorageAdaptor(BaseStorageAdaptor):
    """
    Azure Blob Storage adaptor.

    Configuration:
        container: Azure container name (required)
        account_name: Storage account name (optional, uses env)
        account_key: Storage account key (optional, uses env)
        connection_string: Connection string (optional, alternative to account_name/key)

    Environment Variables:
        AZURE_STORAGE_CONNECTION_STRING: Azure storage connection string
        AZURE_STORAGE_ACCOUNT_NAME: Storage account name
        AZURE_STORAGE_ACCOUNT_KEY: Storage account key

    Examples:
        # Using connection string
        adaptor = AzureStorageAdaptor(
            container='my-container',
            connection_string='DefaultEndpointsProtocol=https;...'
        )

        # Using account name and key
        adaptor = AzureStorageAdaptor(
            container='my-container',
            account_name='myaccount',
            account_key='mykey'
        )

        # Using environment variables
        adaptor = AzureStorageAdaptor(container='my-container')
    """

    def __init__(self, **kwargs):
        """
        Initialize Azure storage adaptor.

        Args:
            container: Azure container name (required)
            **kwargs: Additional Azure configuration

        Raises:
            ImportError: If azure-storage-blob is not installed
            ValueError: If container or usable credentials are missing
        """
        super().__init__(**kwargs)
        if not AZURE_AVAILABLE:
            raise ImportError(
                "azure-storage-blob is required for Azure storage. "
                "Install with: pip install azure-storage-blob"
            )
        if 'container' not in kwargs:
            raise ValueError("container parameter is required for Azure storage")
        self.container_name = kwargs['container']
        # Prefer an explicit connection string, then the environment.
        if 'connection_string' in kwargs:
            connection_string = kwargs['connection_string']
        else:
            connection_string = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
        if connection_string:
            self.blob_service_client = BlobServiceClient.from_connection_string(
                connection_string
            )
            # Extract account name/key from the connection string; both are
            # needed later for SAS URL generation. They may stay None if the
            # connection string omits them.
            self.account_name = None
            self.account_key = None
            for part in connection_string.split(';'):
                if part.startswith('AccountName='):
                    self.account_name = part.split('=', 1)[1]
                elif part.startswith('AccountKey='):
                    self.account_key = part.split('=', 1)[1]
        else:
            account_name = kwargs.get(
                'account_name',
                os.getenv('AZURE_STORAGE_ACCOUNT_NAME')
            )
            account_key = kwargs.get(
                'account_key',
                os.getenv('AZURE_STORAGE_ACCOUNT_KEY')
            )
            if not account_name or not account_key:
                raise ValueError(
                    "Either connection_string or (account_name + account_key) "
                    "must be provided for Azure storage"
                )
            self.account_name = account_name
            self.account_key = account_key
            account_url = f"https://{account_name}.blob.core.windows.net"
            self.blob_service_client = BlobServiceClient(
                account_url=account_url,
                credential=account_key
            )
        self.container_client = self.blob_service_client.get_container_client(
            self.container_name
        )

    def upload_file(
        self, local_path: str, remote_path: str, metadata: Optional[Dict[str, str]] = None
    ) -> str:
        """Upload file to Azure Blob Storage and return its blob URL.

        Raises:
            FileNotFoundError: If the local file does not exist.
            Exception: If the upload fails.
        """
        local_file = Path(local_path)
        if not local_file.exists():
            raise FileNotFoundError(f"Local file not found: {local_path}")
        try:
            blob_client = self.container_client.get_blob_client(remote_path)
            with open(local_file, "rb") as data:
                blob_client.upload_blob(
                    data,
                    overwrite=True,
                    metadata=metadata
                )
            # NOTE(review): account_name may be None when the connection
            # string omits AccountName — verify callers handle that URL.
            return f"https://{self.account_name}.blob.core.windows.net/{self.container_name}/{remote_path}"
        except Exception as e:
            # Chain the SDK error so the root cause stays in the traceback.
            raise Exception(f"Azure upload failed: {e}") from e

    def download_file(self, remote_path: str, local_path: str) -> None:
        """Download file from Azure Blob Storage.

        Raises:
            FileNotFoundError: If the remote blob does not exist.
            Exception: If the download fails.
        """
        local_file = Path(local_path)
        local_file.parent.mkdir(parents=True, exist_ok=True)
        try:
            blob_client = self.container_client.get_blob_client(remote_path)
            with open(local_file, "wb") as download_file:
                download_stream = blob_client.download_blob()
                download_file.write(download_stream.readall())
        except ResourceNotFoundError as e:
            raise FileNotFoundError(f"Remote file not found: {remote_path}") from e
        except Exception as e:
            raise Exception(f"Azure download failed: {e}") from e

    def delete_file(self, remote_path: str) -> None:
        """Delete file from Azure Blob Storage.

        Raises:
            FileNotFoundError: If the remote blob does not exist.
            Exception: If the deletion fails.
        """
        try:
            blob_client = self.container_client.get_blob_client(remote_path)
            blob_client.delete_blob()
        except ResourceNotFoundError as e:
            raise FileNotFoundError(f"Remote file not found: {remote_path}") from e
        except Exception as e:
            raise Exception(f"Azure deletion failed: {e}") from e

    def list_files(
        self, prefix: str = "", max_results: int = 1000
    ) -> List[StorageObject]:
        """List blobs in the Azure container, optionally filtered by prefix."""
        try:
            blobs = self.container_client.list_blobs(
                name_starts_with=prefix,
                results_per_page=max_results
            )
            files = []
            for blob in blobs:
                files.append(StorageObject(
                    key=blob.name,
                    size=blob.size,
                    last_modified=blob.last_modified.isoformat() if blob.last_modified else None,
                    etag=blob.etag,
                    metadata=blob.metadata
                ))
            return files
        except Exception as e:
            raise Exception(f"Azure listing failed: {e}") from e

    def file_exists(self, remote_path: str) -> bool:
        """Return True if the blob exists in Azure Blob Storage."""
        try:
            blob_client = self.container_client.get_blob_client(remote_path)
            return blob_client.exists()
        except Exception as e:
            raise Exception(f"Azure file existence check failed: {e}") from e

    def get_file_url(self, remote_path: str, expires_in: int = 3600) -> str:
        """Generate a time-limited SAS URL for an Azure blob.

        Raises:
            FileNotFoundError: If the remote blob does not exist.
            ValueError: If account credentials are unavailable.
            Exception: If SAS generation fails.
        """
        try:
            blob_client = self.container_client.get_blob_client(remote_path)
            if not blob_client.exists():
                raise FileNotFoundError(f"Remote file not found: {remote_path}")
            if not self.account_name or not self.account_key:
                raise ValueError(
                    "Account name and key are required for SAS URL generation"
                )
            sas_token = generate_blob_sas(
                account_name=self.account_name,
                container_name=self.container_name,
                blob_name=remote_path,
                account_key=self.account_key,
                permission=BlobSasPermissions(read=True),
                # datetime.utcnow() is deprecated (3.12+); use an aware UTC time.
                expiry=datetime.now(timezone.utc) + timedelta(seconds=expires_in)
            )
            return f"{blob_client.url}?{sas_token}"
        except FileNotFoundError:
            raise
        except Exception as e:
            raise Exception(f"Azure SAS URL generation failed: {e}") from e

    def copy_file(self, source_path: str, dest_path: str) -> None:
        """Copy a blob within the Azure container (server-side copy).

        Raises:
            FileNotFoundError: If the source blob does not exist.
            Exception: If the copy fails or ends in a non-success state.
        """
        import time  # hoisted out of the polling loop (was re-imported per iteration)
        try:
            source_blob = self.container_client.get_blob_client(source_path)
            if not source_blob.exists():
                raise FileNotFoundError(f"Source file not found: {source_path}")
            dest_blob = self.container_client.get_blob_client(dest_path)
            # Start the server-side copy and poll until it leaves 'pending'.
            # NOTE(review): no timeout — a stuck copy would poll forever.
            dest_blob.start_copy_from_url(source_blob.url)
            properties = dest_blob.get_blob_properties()
            while properties.copy.status == 'pending':
                time.sleep(0.1)
                properties = dest_blob.get_blob_properties()
            if properties.copy.status != 'success':
                raise Exception(f"Copy failed with status: {properties.copy.status}")
        except FileNotFoundError:
            raise
        except Exception as e:
            raise Exception(f"Azure copy failed: {e}") from e

View File

@@ -0,0 +1,275 @@
"""
Base storage adaptor interface for cloud storage providers.
"""
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, Dict, Optional
from dataclasses import dataclass
@dataclass
class StorageObject:
    """
    Represents a file/object in cloud storage.

    A plain value object returned by ``list_files`` implementations;
    optional fields are left ``None`` when a provider does not supply them.

    Attributes:
        key: Object key/path in storage
        size: Size in bytes
        last_modified: Last modification timestamp (format is provider-dependent;
            presumably ISO-8601 — verify against adaptors)
        etag: ETag/hash of object
        metadata: Additional metadata
    """
    key: str
    size: int
    last_modified: Optional[str] = None
    etag: Optional[str] = None
    metadata: Optional[Dict[str, str]] = None
class BaseStorageAdaptor(ABC):
"""
Abstract base class for cloud storage adaptors.
Provides unified interface for different cloud storage providers.
All adaptors must implement these methods.
"""
def __init__(self, **kwargs):
"""
Initialize storage adaptor.
Args:
**kwargs: Provider-specific configuration
"""
self.config = kwargs
@abstractmethod
def upload_file(
self, local_path: str, remote_path: str, metadata: Optional[Dict[str, str]] = None
) -> str:
"""
Upload file to cloud storage.
Args:
local_path: Path to local file
remote_path: Destination path in cloud storage
metadata: Optional metadata to attach to file
Returns:
URL or identifier of uploaded file
Raises:
FileNotFoundError: If local file doesn't exist
Exception: If upload fails
"""
pass
@abstractmethod
def download_file(self, remote_path: str, local_path: str) -> None:
"""
Download file from cloud storage.
Args:
remote_path: Path to file in cloud storage
local_path: Destination path for downloaded file
Raises:
FileNotFoundError: If remote file doesn't exist
Exception: If download fails
"""
pass
@abstractmethod
def delete_file(self, remote_path: str) -> None:
"""
Delete file from cloud storage.
Args:
remote_path: Path to file in cloud storage
Raises:
FileNotFoundError: If remote file doesn't exist
Exception: If deletion fails
"""
pass
@abstractmethod
def list_files(
self, prefix: str = "", max_results: int = 1000
) -> List[StorageObject]:
"""
List files in cloud storage.
Args:
prefix: Prefix to filter files (directory path)
max_results: Maximum number of results to return
Returns:
List of StorageObject instances
Raises:
Exception: If listing fails
"""
pass
@abstractmethod
def file_exists(self, remote_path: str) -> bool:
"""
Check if file exists in cloud storage.
Args:
remote_path: Path to file in cloud storage
Returns:
True if file exists, False otherwise
"""
pass
@abstractmethod
def get_file_url(self, remote_path: str, expires_in: int = 3600) -> str:
"""
Generate signed URL for file access.
Args:
remote_path: Path to file in cloud storage
expires_in: URL expiration time in seconds (default: 1 hour)
Returns:
Signed URL for file access
Raises:
FileNotFoundError: If remote file doesn't exist
Exception: If URL generation fails
"""
pass
def upload_directory(
self, local_dir: str, remote_prefix: str = "", exclude_patterns: Optional[List[str]] = None
) -> List[str]:
"""
Upload entire directory to cloud storage.
Args:
local_dir: Path to local directory
remote_prefix: Prefix for uploaded files
exclude_patterns: Glob patterns to exclude files
Returns:
List of uploaded file paths
Raises:
NotADirectoryError: If local_dir is not a directory
Exception: If upload fails
"""
local_path = Path(local_dir)
if not local_path.is_dir():
raise NotADirectoryError(f"Not a directory: {local_dir}")
uploaded_files = []
exclude_patterns = exclude_patterns or []
for file_path in local_path.rglob("*"):
if file_path.is_file():
# Check exclusion patterns
should_exclude = False
for pattern in exclude_patterns:
if file_path.match(pattern):
should_exclude = True
break
if should_exclude:
continue
# Calculate relative path
relative_path = file_path.relative_to(local_path)
remote_path = f"{remote_prefix}/{relative_path}".lstrip("/")
# Upload file
self.upload_file(str(file_path), remote_path)
uploaded_files.append(remote_path)
return uploaded_files
def download_directory(
self, remote_prefix: str, local_dir: str
) -> List[str]:
"""
Download directory from cloud storage.
Args:
remote_prefix: Prefix of files to download
local_dir: Destination directory
Returns:
List of downloaded file paths
Raises:
Exception: If download fails
"""
local_path = Path(local_dir)
local_path.mkdir(parents=True, exist_ok=True)
downloaded_files = []
files = self.list_files(prefix=remote_prefix)
for file_obj in files:
# Calculate local path
relative_path = file_obj.key.removeprefix(remote_prefix).lstrip("/")
local_file_path = local_path / relative_path
# Create parent directories
local_file_path.parent.mkdir(parents=True, exist_ok=True)
# Download file
self.download_file(file_obj.key, str(local_file_path))
downloaded_files.append(str(local_file_path))
return downloaded_files
def get_file_size(self, remote_path: str) -> int:
    """
    Get size of file in cloud storage.

    Args:
        remote_path: Path to file in cloud storage

    Returns:
        File size in bytes

    Raises:
        FileNotFoundError: If remote file doesn't exist
    """
    # A single listing result whose key matches exactly means the file exists
    matches = self.list_files(prefix=remote_path, max_results=1)
    if matches and matches[0].key == remote_path:
        return matches[0].size
    raise FileNotFoundError(f"File not found: {remote_path}")
def copy_file(
    self, source_path: str, dest_path: str
) -> None:
    """
    Copy file within cloud storage.

    Default implementation downloads then uploads via a local temp file.
    Subclasses can override with provider-specific copy operations.

    Args:
        source_path: Source file path
        dest_path: Destination file path

    Raises:
        FileNotFoundError: If source file doesn't exist
        Exception: If copy fails
    """
    import tempfile
    # Create a named temp file whose path outlives the handle
    handle = tempfile.NamedTemporaryFile(delete=False)
    handle.close()
    staging = Path(handle.name)
    try:
        self.download_file(source_path, str(staging))
        self.upload_file(str(staging), dest_path)
    finally:
        # Always remove the staging file, even on failure
        staging.unlink(missing_ok=True)

View File

@@ -0,0 +1,194 @@
"""
Google Cloud Storage (GCS) adaptor implementation.
"""
import os
from pathlib import Path
from typing import List, Dict, Optional
from datetime import timedelta
try:
from google.cloud import storage
from google.cloud.exceptions import NotFound
GCS_AVAILABLE = True
except ImportError:
GCS_AVAILABLE = False
from .base_storage import BaseStorageAdaptor, StorageObject
class GCSStorageAdaptor(BaseStorageAdaptor):
    """
    Google Cloud Storage adaptor.

    Configuration:
        bucket: GCS bucket name (required)
        project: GCP project ID (optional, uses default)
        credentials_path: Path to service account JSON (optional)

    Environment Variables:
        GOOGLE_APPLICATION_CREDENTIALS: Path to service account JSON
        GOOGLE_CLOUD_PROJECT: GCP project ID

    Examples:
        # Using environment variables
        adaptor = GCSStorageAdaptor(bucket='my-bucket')

        # With explicit credentials
        adaptor = GCSStorageAdaptor(
            bucket='my-bucket',
            project='my-project',
            credentials_path='/path/to/credentials.json'
        )

        # Using default credentials
        adaptor = GCSStorageAdaptor(
            bucket='my-bucket',
            project='my-project'
        )
    """

    def __init__(self, **kwargs):
        """
        Initialize GCS storage adaptor.

        Args:
            bucket: GCS bucket name (required)
            **kwargs: Additional GCS configuration

        Raises:
            ImportError: If google-cloud-storage is not installed
            ValueError: If the required 'bucket' parameter is missing
        """
        super().__init__(**kwargs)
        if not GCS_AVAILABLE:
            raise ImportError(
                "google-cloud-storage is required for GCS storage. "
                "Install with: pip install google-cloud-storage"
            )
        if 'bucket' not in kwargs:
            raise ValueError("bucket parameter is required for GCS storage")
        self.bucket_name = kwargs['bucket']
        self.project = kwargs.get('project', os.getenv('GOOGLE_CLOUD_PROJECT'))
        # Initialize GCS client
        client_kwargs = {}
        if self.project:
            client_kwargs['project'] = self.project
        if 'credentials_path' in kwargs:
            # NOTE(review): this mutates the process-wide environment so the
            # client library picks up the service account file; it also
            # affects any other GCP client created later in this process.
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = kwargs['credentials_path']
        self.storage_client = storage.Client(**client_kwargs)
        self.bucket = self.storage_client.bucket(self.bucket_name)

    def upload_file(
        self, local_path: str, remote_path: str, metadata: Optional[Dict[str, str]] = None
    ) -> str:
        """
        Upload file to GCS.

        Args:
            local_path: Path to the local source file
            remote_path: Destination object name within the bucket
            metadata: Optional custom metadata to attach to the blob

        Returns:
            gs:// URI of the uploaded object

        Raises:
            FileNotFoundError: If the local file does not exist
            Exception: If the upload fails
        """
        local_file = Path(local_path)
        if not local_file.exists():
            raise FileNotFoundError(f"Local file not found: {local_path}")
        try:
            blob = self.bucket.blob(remote_path)
            if metadata:
                blob.metadata = metadata
            blob.upload_from_filename(str(local_file))
            return f"gs://{self.bucket_name}/{remote_path}"
        except Exception as e:
            # Chain the SDK error so the original traceback is preserved
            raise Exception(f"GCS upload failed: {e}") from e

    def download_file(self, remote_path: str, local_path: str) -> None:
        """
        Download file from GCS.

        Args:
            remote_path: Object name within the bucket
            local_path: Local destination path (parent dirs created as needed)

        Raises:
            FileNotFoundError: If the remote object does not exist
            Exception: If the download fails
        """
        local_file = Path(local_path)
        local_file.parent.mkdir(parents=True, exist_ok=True)
        try:
            blob = self.bucket.blob(remote_path)
            blob.download_to_filename(str(local_file))
        except NotFound as e:
            raise FileNotFoundError(f"Remote file not found: {remote_path}") from e
        except Exception as e:
            raise Exception(f"GCS download failed: {e}") from e

    def delete_file(self, remote_path: str) -> None:
        """
        Delete file from GCS.

        Args:
            remote_path: Object name within the bucket

        Raises:
            FileNotFoundError: If the remote object does not exist
            Exception: If the deletion fails
        """
        try:
            blob = self.bucket.blob(remote_path)
            blob.delete()
        except NotFound as e:
            raise FileNotFoundError(f"Remote file not found: {remote_path}") from e
        except Exception as e:
            raise Exception(f"GCS deletion failed: {e}") from e

    def list_files(
        self, prefix: str = "", max_results: int = 1000
    ) -> List[StorageObject]:
        """
        List files in GCS bucket.

        Args:
            prefix: Only list objects whose name starts with this prefix
            max_results: Maximum number of objects to return

        Returns:
            List of StorageObject entries for matching blobs

        Raises:
            Exception: If the listing fails
        """
        try:
            blobs = self.storage_client.list_blobs(
                self.bucket_name,
                prefix=prefix,
                max_results=max_results
            )
            files = []
            for blob in blobs:
                files.append(StorageObject(
                    key=blob.name,
                    size=blob.size,
                    # blob.updated can be None for some listings; guard it
                    last_modified=blob.updated.isoformat() if blob.updated else None,
                    etag=blob.etag,
                    metadata=blob.metadata
                ))
            return files
        except Exception as e:
            raise Exception(f"GCS listing failed: {e}") from e

    def file_exists(self, remote_path: str) -> bool:
        """
        Check if file exists in GCS.

        Args:
            remote_path: Object name within the bucket

        Returns:
            True if the object exists, False otherwise

        Raises:
            Exception: If the existence check fails
        """
        try:
            blob = self.bucket.blob(remote_path)
            return blob.exists()
        except Exception as e:
            raise Exception(f"GCS file existence check failed: {e}") from e

    def get_file_url(self, remote_path: str, expires_in: int = 3600) -> str:
        """
        Generate signed URL for GCS object.

        Args:
            remote_path: Object name within the bucket
            expires_in: URL lifetime in seconds (default: 1 hour)

        Returns:
            V4 signed URL granting temporary GET access

        Raises:
            FileNotFoundError: If the remote object does not exist
            Exception: If URL generation fails
        """
        try:
            blob = self.bucket.blob(remote_path)
            if not blob.exists():
                raise FileNotFoundError(f"Remote file not found: {remote_path}")
            url = blob.generate_signed_url(
                version="v4",
                expiration=timedelta(seconds=expires_in),
                method="GET"
            )
            return url
        except FileNotFoundError:
            # Re-raise unchanged so callers can distinguish missing files
            raise
        except Exception as e:
            raise Exception(f"GCS signed URL generation failed: {e}") from e

    def copy_file(self, source_path: str, dest_path: str) -> None:
        """
        Copy file within GCS bucket (server-side copy).

        Overrides the base download/upload implementation with GCS's
        native blob copy, avoiding any local transfer.

        Args:
            source_path: Source object name
            dest_path: Destination object name

        Raises:
            FileNotFoundError: If the source object does not exist
            Exception: If the copy fails
        """
        try:
            source_blob = self.bucket.blob(source_path)
            if not source_blob.exists():
                raise FileNotFoundError(f"Source file not found: {source_path}")
            self.bucket.copy_blob(
                source_blob,
                self.bucket,
                dest_path
            )
        except FileNotFoundError:
            raise
        except Exception as e:
            raise Exception(f"GCS copy failed: {e}") from e

View File

@@ -0,0 +1,216 @@
"""
AWS S3 storage adaptor implementation.
"""
import os
from pathlib import Path
from typing import List, Dict, Optional
try:
import boto3
from botocore.exceptions import ClientError
BOTO3_AVAILABLE = True
except ImportError:
BOTO3_AVAILABLE = False
from .base_storage import BaseStorageAdaptor, StorageObject
class S3StorageAdaptor(BaseStorageAdaptor):
    """
    AWS S3 storage adaptor.

    Configuration:
        bucket: S3 bucket name (required)
        region: AWS region (optional, default: us-east-1)
        aws_access_key_id: AWS access key (optional, uses env/credentials)
        aws_secret_access_key: AWS secret key (optional, uses env/credentials)
        endpoint_url: Custom endpoint URL (optional, for S3-compatible services)

    Environment Variables:
        AWS_ACCESS_KEY_ID: AWS access key
        AWS_SECRET_ACCESS_KEY: AWS secret key
        AWS_DEFAULT_REGION: AWS region

    Examples:
        # Using environment variables
        adaptor = S3StorageAdaptor(bucket='my-bucket')

        # With explicit credentials
        adaptor = S3StorageAdaptor(
            bucket='my-bucket',
            region='us-west-2',
            aws_access_key_id='AKIAIOSFODNN7EXAMPLE',
            aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'
        )

        # S3-compatible service (MinIO, DigitalOcean Spaces)
        adaptor = S3StorageAdaptor(
            bucket='my-bucket',
            endpoint_url='https://nyc3.digitaloceanspaces.com',
            aws_access_key_id='...',
            aws_secret_access_key='...'
        )
    """

    def __init__(self, **kwargs):
        """
        Initialize S3 storage adaptor.

        Args:
            bucket: S3 bucket name (required)
            **kwargs: Additional S3 configuration

        Raises:
            ImportError: If boto3 is not installed
            ValueError: If the required 'bucket' parameter is missing
        """
        super().__init__(**kwargs)
        if not BOTO3_AVAILABLE:
            raise ImportError(
                "boto3 is required for S3 storage. "
                "Install with: pip install boto3"
            )
        if 'bucket' not in kwargs:
            raise ValueError("bucket parameter is required for S3 storage")
        self.bucket = kwargs['bucket']
        self.region = kwargs.get('region', os.getenv('AWS_DEFAULT_REGION', 'us-east-1'))
        # Initialize S3 client; only pass credentials that were supplied so
        # boto3 falls back to its normal credential chain otherwise
        client_kwargs = {
            'region_name': self.region,
        }
        if 'endpoint_url' in kwargs:
            client_kwargs['endpoint_url'] = kwargs['endpoint_url']
        if 'aws_access_key_id' in kwargs:
            client_kwargs['aws_access_key_id'] = kwargs['aws_access_key_id']
        if 'aws_secret_access_key' in kwargs:
            client_kwargs['aws_secret_access_key'] = kwargs['aws_secret_access_key']
        self.s3_client = boto3.client('s3', **client_kwargs)
        # Not used internally, but kept as a public attribute for callers
        # that want the higher-level resource API
        self.s3_resource = boto3.resource('s3', **client_kwargs)

    def upload_file(
        self, local_path: str, remote_path: str, metadata: Optional[Dict[str, str]] = None
    ) -> str:
        """
        Upload file to S3.

        Args:
            local_path: Path to the local source file
            remote_path: Destination object key within the bucket
            metadata: Optional custom metadata to attach to the object

        Returns:
            s3:// URI of the uploaded object

        Raises:
            FileNotFoundError: If the local file does not exist
            Exception: If the upload fails
        """
        local_file = Path(local_path)
        if not local_file.exists():
            raise FileNotFoundError(f"Local file not found: {local_path}")
        extra_args = {}
        if metadata:
            extra_args['Metadata'] = metadata
        try:
            self.s3_client.upload_file(
                str(local_file),
                self.bucket,
                remote_path,
                ExtraArgs=extra_args if extra_args else None
            )
            return f"s3://{self.bucket}/{remote_path}"
        except ClientError as e:
            # Chain the boto3 error so the original traceback is preserved
            raise Exception(f"S3 upload failed: {e}") from e

    def download_file(self, remote_path: str, local_path: str) -> None:
        """
        Download file from S3.

        Args:
            remote_path: Object key within the bucket
            local_path: Local destination path (parent dirs created as needed)

        Raises:
            FileNotFoundError: If the remote object does not exist
            Exception: If the download fails
        """
        local_file = Path(local_path)
        local_file.parent.mkdir(parents=True, exist_ok=True)
        try:
            self.s3_client.download_file(
                self.bucket,
                remote_path,
                str(local_file)
            )
        except ClientError as e:
            if e.response['Error']['Code'] == '404':
                raise FileNotFoundError(f"Remote file not found: {remote_path}") from e
            raise Exception(f"S3 download failed: {e}") from e

    def delete_file(self, remote_path: str) -> None:
        """
        Delete file from S3.

        Note: S3 delete_object is idempotent — deleting a nonexistent key
        does not raise, matching the observed behavior here.

        Args:
            remote_path: Object key within the bucket

        Raises:
            Exception: If the deletion fails
        """
        try:
            self.s3_client.delete_object(
                Bucket=self.bucket,
                Key=remote_path
            )
        except ClientError as e:
            raise Exception(f"S3 deletion failed: {e}") from e

    def list_files(
        self, prefix: str = "", max_results: int = 1000
    ) -> List[StorageObject]:
        """
        List files in S3 bucket.

        Args:
            prefix: Only list objects whose key starts with this prefix
            max_results: Maximum number of objects to return

        Returns:
            List of StorageObject entries for matching objects

        Raises:
            Exception: If the listing fails
        """
        try:
            # Paginate so buckets with more than 1000 keys are handled
            paginator = self.s3_client.get_paginator('list_objects_v2')
            page_iterator = paginator.paginate(
                Bucket=self.bucket,
                Prefix=prefix,
                PaginationConfig={'MaxItems': max_results}
            )
            files = []
            for page in page_iterator:
                # Empty result pages have no 'Contents' key
                if 'Contents' not in page:
                    continue
                for obj in page['Contents']:
                    files.append(StorageObject(
                        key=obj['Key'],
                        size=obj['Size'],
                        last_modified=obj['LastModified'].isoformat(),
                        etag=obj.get('ETag', '').strip('"')
                    ))
            return files
        except ClientError as e:
            raise Exception(f"S3 listing failed: {e}") from e

    def file_exists(self, remote_path: str) -> bool:
        """
        Check if file exists in S3.

        Args:
            remote_path: Object key within the bucket

        Returns:
            True if the object exists, False otherwise

        Raises:
            Exception: If the head_object call fails for any reason
                other than a missing key
        """
        try:
            self.s3_client.head_object(
                Bucket=self.bucket,
                Key=remote_path
            )
            return True
        except ClientError as e:
            if e.response['Error']['Code'] == '404':
                return False
            raise Exception(f"S3 head_object failed: {e}") from e

    def get_file_url(self, remote_path: str, expires_in: int = 3600) -> str:
        """
        Generate presigned URL for S3 object.

        Args:
            remote_path: Object key within the bucket
            expires_in: URL lifetime in seconds (default: 1 hour)

        Returns:
            Presigned URL granting temporary GET access

        Raises:
            Exception: If URL generation fails
        """
        try:
            url = self.s3_client.generate_presigned_url(
                'get_object',
                Params={
                    'Bucket': self.bucket,
                    'Key': remote_path
                },
                ExpiresIn=expires_in
            )
            return url
        except ClientError as e:
            raise Exception(f"S3 presigned URL generation failed: {e}") from e

    def copy_file(self, source_path: str, dest_path: str) -> None:
        """
        Copy file within S3 bucket (server-side copy).

        Overrides the base download/upload implementation with S3's
        native copy_object, avoiding any local transfer.

        Args:
            source_path: Source object key
            dest_path: Destination object key

        Raises:
            FileNotFoundError: If the source object does not exist
            Exception: If the copy fails
        """
        try:
            copy_source = {
                'Bucket': self.bucket,
                'Key': source_path
            }
            self.s3_client.copy_object(
                CopySource=copy_source,
                Bucket=self.bucket,
                Key=dest_path
            )
        except ClientError as e:
            if e.response['Error']['Code'] == '404':
                raise FileNotFoundError(f"Source file not found: {source_path}") from e
            raise Exception(f"S3 copy failed: {e}") from e

View File

@@ -0,0 +1,224 @@
#!/usr/bin/env python3
"""
Documentation sync CLI.
Monitor documentation for changes and automatically update skills.
"""
import sys
import argparse
import signal
from pathlib import Path
from ..sync import SyncMonitor
def handle_signal(signum, frame):
    """Terminate the process cleanly when an interrupt signal arrives."""
    message = "\n🛑 Stopping sync monitor..."
    print(message)
    sys.exit(0)
def start_command(args):
    """
    Start continuous monitoring.

    Blocks until interrupted (Ctrl+C / SIGTERM); the SyncMonitor performs
    the periodic checking itself once started.
    """
    monitor = SyncMonitor(
        config_path=args.config,
        check_interval=args.interval,
        auto_update=args.auto_update
    )
    # Register signal handlers so Ctrl+C / kill terminate the process cleanly
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)
    try:
        monitor.start()
        print(f"\n📊 Monitoring {args.config}")
        print(f" Check interval: {args.interval}s ({args.interval // 60}m)")
        print(f" Auto-update: {'✅ enabled' if args.auto_update else '❌ disabled'}")
        print("\nPress Ctrl+C to stop\n")
        # Hoisted out of the loop: the original re-ran `import time` on
        # every iteration, which is a needless per-second module lookup
        import time
        # Keep the main thread alive while the monitor runs
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("\n🛑 Stopping...")
        monitor.stop()
def check_command(args):
    """Run a single change check against the configured docs and print a summary."""
    # check_interval is irrelevant for a one-shot check; any value works here
    monitor = SyncMonitor(
        config_path=args.config,
        check_interval=3600  # Not used for single check
    )
    print(f"🔍 Checking {args.config} for changes...")
    # generate_diffs is expensive, so it is only enabled via --diff
    report = monitor.check_now(generate_diffs=args.diff)
    # Summary counts
    print(f"\n📊 Results:")
    print(f" Total pages: {report.total_pages}")
    print(f" Added: {len(report.added)}")
    print(f" Modified: {len(report.modified)}")
    print(f" Deleted: {len(report.deleted)}")
    print(f" Unchanged: {report.unchanged}")
    if report.has_changes:
        print(f"\n✨ Detected {report.change_count} changes!")
        # Per-page detail only when -v/--verbose was given
        if args.verbose:
            if report.added:
                print("\n✅ Added pages:")
                for change in report.added:
                    print(f" • {change.url}")
            if report.modified:
                print("\n✏️ Modified pages:")
                for change in report.modified:
                    print(f" • {change.url}")
                    # Show a short diff excerpt only when diffs were generated
                    if change.diff and args.diff:
                        print(f" Diff preview (first 5 lines):")
                        for line in change.diff.split('\n')[:5]:
                            print(f" {line}")
            if report.deleted:
                print("\n❌ Deleted pages:")
                for change in report.deleted:
                    print(f" • {change.url}")
    else:
        print("\n✅ No changes detected")
def stats_command(args):
    """Print the persisted monitoring statistics for the configured skill."""
    # check_interval is irrelevant here; stats() only reads saved state
    monitor = SyncMonitor(
        config_path=args.config,
        check_interval=3600
    )
    stats = monitor.stats()
    print(f"\n📊 Statistics for {stats['skill_name']}:")
    print(f" Status: {stats['status']}")
    print(f" Last check: {stats['last_check'] or 'Never'}")
    print(f" Last change: {stats['last_change'] or 'Never'}")
    print(f" Total checks: {stats['total_checks']}")
    print(f" Total changes: {stats['total_changes']}")
    print(f" Tracked pages: {stats['tracked_pages']}")
    print(f" Running: {'✅ Yes' if stats['running'] else '❌ No'}")
def reset_command(args):
    """Delete the persisted sync state file for a skill, with confirmation."""
    # State lives in a per-skill JSON file in the working directory
    state_file = Path(f"{args.skill_name}_sync.json")
    if not state_file.exists():
        print(f" No state file found for {args.skill_name}")
        return
    # --force skips the interactive confirmation prompt
    confirmed = args.force or input(f"⚠️ Reset state for {args.skill_name}? [y/N]: ").lower() == 'y'
    if confirmed:
        state_file.unlink()
        print(f"✅ State reset for {args.skill_name}")
    else:
        print("❌ Reset cancelled")
def main():
    """CLI entry point: parse arguments and dispatch to the subcommand handlers."""
    parser = argparse.ArgumentParser(
        description='Monitor documentation for changes and update skills',
        # RawDescriptionHelpFormatter keeps the epilog examples formatted as written
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Start monitoring (checks every hour)
skill-seekers-sync start --config configs/react.json
# Start with custom interval (10 minutes)
skill-seekers-sync start --config configs/react.json --interval 600
# Start with auto-update
skill-seekers-sync start --config configs/react.json --auto-update
# Check once (no continuous monitoring)
skill-seekers-sync check --config configs/react.json
# Check with diffs
skill-seekers-sync check --config configs/react.json --diff -v
# Show statistics
skill-seekers-sync stats --config configs/react.json
# Reset state
skill-seekers-sync reset --skill-name react
"""
    )
    subparsers = parser.add_subparsers(dest='command', help='Command to execute')
    # Start command: continuous monitoring loop
    start_parser = subparsers.add_parser('start', help='Start continuous monitoring')
    start_parser.add_argument('--config', required=True, help='Path to skill config file')
    start_parser.add_argument(
        '--interval', '-i',
        type=int,
        default=3600,
        help='Check interval in seconds (default: 3600 = 1 hour)'
    )
    start_parser.add_argument(
        '--auto-update',
        action='store_true',
        help='Automatically rebuild skill on changes'
    )
    # Check command: one-shot change detection
    check_parser = subparsers.add_parser('check', help='Check for changes once')
    check_parser.add_argument('--config', required=True, help='Path to skill config file')
    check_parser.add_argument(
        '--diff', '-d',
        action='store_true',
        help='Generate content diffs'
    )
    check_parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Show detailed output'
    )
    # Stats command: report persisted monitoring statistics
    stats_parser = subparsers.add_parser('stats', help='Show monitoring statistics')
    stats_parser.add_argument('--config', required=True, help='Path to skill config file')
    # Reset command: delete the persisted sync state file
    reset_parser = subparsers.add_parser('reset', help='Reset monitoring state')
    reset_parser.add_argument('--skill-name', required=True, help='Skill name')
    reset_parser.add_argument(
        '--force', '-f',
        action='store_true',
        help='Skip confirmation'
    )
    args = parser.parse_args()
    if not args.command:
        # No subcommand given: show usage and exit with an error status
        parser.print_help()
        sys.exit(1)
    try:
        if args.command == 'start':
            start_command(args)
        elif args.command == 'check':
            check_command(args)
        elif args.command == 'stats':
            stats_command(args)
        elif args.command == 'reset':
            reset_command(args)
    except Exception as e:
        # Surface any failure as a single error line and a non-zero exit code
        print(f"\n❌ Error: {e}", file=sys.stderr)
        sys.exit(1)
# Allow running this module directly as a script.
if __name__ == '__main__':
    main()