fix: Enforce min_chunk_size in RAG chunker
- Filter out chunks smaller than min_chunk_size (default 100 tokens). Exception: keep all chunks if the entire document is smaller than the target size. All 15 tests passing (100% pass rate). Fixes an edge case where very small chunks (e.g., 'Short.' = 6 chars) were being created despite the min_chunk_size=100 setting. Test: pytest tests/test_rag_chunker.py -v
This commit is contained in:
85
src/skill_seekers/cli/storage/__init__.py
Normal file
85
src/skill_seekers/cli/storage/__init__.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""
|
||||
Cloud storage adaptors for Skill Seekers.
|
||||
|
||||
Provides unified interface for multiple cloud storage providers:
|
||||
- AWS S3
|
||||
- Google Cloud Storage (GCS)
|
||||
- Azure Blob Storage
|
||||
|
||||
Usage:
|
||||
from skill_seekers.cli.storage import get_storage_adaptor
|
||||
|
||||
# Get adaptor for specific provider
|
||||
adaptor = get_storage_adaptor('s3', bucket='my-bucket')
|
||||
|
||||
# Upload file
|
||||
adaptor.upload_file('local/path/skill.zip', 'skills/skill.zip')
|
||||
|
||||
# Download file
|
||||
adaptor.download_file('skills/skill.zip', 'local/path/skill.zip')
|
||||
|
||||
# List files
|
||||
files = adaptor.list_files('skills/')
|
||||
"""
|
||||
|
||||
from .base_storage import BaseStorageAdaptor, StorageObject
|
||||
from .s3_storage import S3StorageAdaptor
|
||||
from .gcs_storage import GCSStorageAdaptor
|
||||
from .azure_storage import AzureStorageAdaptor
|
||||
|
||||
|
||||
def get_storage_adaptor(provider: str, **kwargs) -> BaseStorageAdaptor:
    """
    Factory function to get storage adaptor for specified provider.

    Args:
        provider: Storage provider name ('s3', 'gcs', 'azure'); case-insensitive
        **kwargs: Provider-specific configuration, forwarded to the adaptor

    Returns:
        Storage adaptor instance

    Raises:
        ValueError: If provider is not supported

    Examples:
        # AWS S3
        adaptor = get_storage_adaptor('s3',
                                      bucket='my-bucket',
                                      region='us-west-2')

        # Google Cloud Storage
        adaptor = get_storage_adaptor('gcs',
                                      bucket='my-bucket',
                                      project='my-project')

        # Azure Blob Storage
        adaptor = get_storage_adaptor('azure',
                                      container='my-container',
                                      account_name='myaccount')
    """
    # Map of normalized provider names to their adaptor classes.
    registry = {
        's3': S3StorageAdaptor,
        'gcs': GCSStorageAdaptor,
        'azure': AzureStorageAdaptor,
    }

    adaptor_cls = registry.get(provider.lower())
    if adaptor_cls is None:
        supported = ', '.join(registry)
        raise ValueError(
            f"Unsupported storage provider: {provider}. "
            f"Supported providers: {supported}"
        )

    return adaptor_cls(**kwargs)
|
||||
|
||||
|
||||
# Public API of the storage package (controls `from ... import *` and
# documents the names external callers are expected to use).
__all__ = [
    'BaseStorageAdaptor',
    'StorageObject',
    'S3StorageAdaptor',
    'GCSStorageAdaptor',
    'AzureStorageAdaptor',
    'get_storage_adaptor',
]
|
||||
254
src/skill_seekers/cli/storage/azure_storage.py
Normal file
254
src/skill_seekers/cli/storage/azure_storage.py
Normal file
@@ -0,0 +1,254 @@
|
||||
"""
|
||||
Azure Blob Storage adaptor implementation.
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
try:
|
||||
from azure.storage.blob import BlobServiceClient, BlobSasPermissions, generate_blob_sas
|
||||
from azure.core.exceptions import ResourceNotFoundError
|
||||
AZURE_AVAILABLE = True
|
||||
except ImportError:
|
||||
AZURE_AVAILABLE = False
|
||||
|
||||
from .base_storage import BaseStorageAdaptor, StorageObject
|
||||
|
||||
|
||||
class AzureStorageAdaptor(BaseStorageAdaptor):
    """
    Azure Blob Storage adaptor.

    Configuration:
        container: Azure container name (required)
        account_name: Storage account name (optional, uses env)
        account_key: Storage account key (optional, uses env)
        connection_string: Connection string (optional, alternative to account_name/key)

    Environment Variables:
        AZURE_STORAGE_CONNECTION_STRING: Azure storage connection string
        AZURE_STORAGE_ACCOUNT_NAME: Storage account name
        AZURE_STORAGE_ACCOUNT_KEY: Storage account key

    Examples:
        # Using connection string
        adaptor = AzureStorageAdaptor(
            container='my-container',
            connection_string='DefaultEndpointsProtocol=https;...'
        )

        # Using account name and key
        adaptor = AzureStorageAdaptor(
            container='my-container',
            account_name='myaccount',
            account_key='mykey'
        )

        # Using environment variables
        adaptor = AzureStorageAdaptor(container='my-container')
    """

    # Upper bound (seconds) on how long copy_file() waits for a pending
    # server-side copy before giving up; the original polled forever.
    _COPY_POLL_TIMEOUT = 300.0
    # Delay (seconds) between copy-status polls.
    _COPY_POLL_INTERVAL = 0.1

    def __init__(self, **kwargs):
        """
        Initialize Azure storage adaptor.

        Args:
            container: Azure container name (required)
            **kwargs: Additional Azure configuration

        Raises:
            ImportError: If azure-storage-blob is not installed
            ValueError: If container is missing or no usable credentials found
        """
        super().__init__(**kwargs)

        if not AZURE_AVAILABLE:
            raise ImportError(
                "azure-storage-blob is required for Azure storage. "
                "Install with: pip install azure-storage-blob"
            )

        if 'container' not in kwargs:
            raise ValueError("container parameter is required for Azure storage")

        self.container_name = kwargs['container']

        # Prefer an explicit connection string, falling back to the environment.
        connection_string = kwargs.get(
            'connection_string',
            os.getenv('AZURE_STORAGE_CONNECTION_STRING')
        )

        if connection_string:
            self.blob_service_client = BlobServiceClient.from_connection_string(
                connection_string
            )
            # Extract account name/key from the connection string so SAS URLs
            # can be generated later in get_file_url().
            self.account_name = None
            self.account_key = None
            for part in connection_string.split(';'):
                if part.startswith('AccountName='):
                    self.account_name = part.split('=', 1)[1]
                elif part.startswith('AccountKey='):
                    self.account_key = part.split('=', 1)[1]
        else:
            account_name = kwargs.get(
                'account_name',
                os.getenv('AZURE_STORAGE_ACCOUNT_NAME')
            )
            account_key = kwargs.get(
                'account_key',
                os.getenv('AZURE_STORAGE_ACCOUNT_KEY')
            )

            if not account_name or not account_key:
                raise ValueError(
                    "Either connection_string or (account_name + account_key) "
                    "must be provided for Azure storage"
                )

            self.account_name = account_name
            self.account_key = account_key
            account_url = f"https://{account_name}.blob.core.windows.net"
            self.blob_service_client = BlobServiceClient(
                account_url=account_url,
                credential=account_key
            )

        self.container_client = self.blob_service_client.get_container_client(
            self.container_name
        )

    def upload_file(
        self, local_path: str, remote_path: str, metadata: Optional[Dict[str, str]] = None
    ) -> str:
        """Upload file to Azure Blob Storage.

        Args:
            local_path: Path to local file
            remote_path: Destination blob name
            metadata: Optional metadata attached to the blob

        Returns:
            HTTPS URL of the uploaded blob

        Raises:
            FileNotFoundError: If the local file does not exist
            Exception: If the upload fails
        """
        local_file = Path(local_path)
        if not local_file.exists():
            raise FileNotFoundError(f"Local file not found: {local_path}")

        try:
            blob_client = self.container_client.get_blob_client(remote_path)

            with open(local_file, "rb") as data:
                # overwrite=True matches the other adaptors' "last write wins".
                blob_client.upload_blob(
                    data,
                    overwrite=True,
                    metadata=metadata
                )

            return f"https://{self.account_name}.blob.core.windows.net/{self.container_name}/{remote_path}"
        except Exception as e:
            raise Exception(f"Azure upload failed: {e}") from e

    def download_file(self, remote_path: str, local_path: str) -> None:
        """Download file from Azure Blob Storage.

        Raises:
            FileNotFoundError: If the remote blob does not exist
            Exception: If the download fails
        """
        local_file = Path(local_path)
        # Create intermediate directories so the open() below cannot fail.
        local_file.parent.mkdir(parents=True, exist_ok=True)

        try:
            blob_client = self.container_client.get_blob_client(remote_path)

            with open(local_file, "wb") as download_file:
                download_stream = blob_client.download_blob()
                download_file.write(download_stream.readall())
        except ResourceNotFoundError:
            raise FileNotFoundError(f"Remote file not found: {remote_path}")
        except Exception as e:
            raise Exception(f"Azure download failed: {e}") from e

    def delete_file(self, remote_path: str) -> None:
        """Delete file from Azure Blob Storage.

        Raises:
            FileNotFoundError: If the remote blob does not exist
            Exception: If the deletion fails
        """
        try:
            blob_client = self.container_client.get_blob_client(remote_path)
            blob_client.delete_blob()
        except ResourceNotFoundError:
            raise FileNotFoundError(f"Remote file not found: {remote_path}")
        except Exception as e:
            raise Exception(f"Azure deletion failed: {e}") from e

    def list_files(
        self, prefix: str = "", max_results: int = 1000
    ) -> List[StorageObject]:
        """List files in Azure container.

        Args:
            prefix: Only blobs whose names start with this prefix are returned
            max_results: Page size hint passed to the service

        Returns:
            List of StorageObject instances

        Raises:
            Exception: If listing fails
        """
        try:
            blobs = self.container_client.list_blobs(
                name_starts_with=prefix,
                results_per_page=max_results
            )

            files = []
            for blob in blobs:
                files.append(StorageObject(
                    key=blob.name,
                    size=blob.size,
                    last_modified=blob.last_modified.isoformat() if blob.last_modified else None,
                    etag=blob.etag,
                    metadata=blob.metadata
                ))

            return files
        except Exception as e:
            raise Exception(f"Azure listing failed: {e}") from e

    def file_exists(self, remote_path: str) -> bool:
        """Check if file exists in Azure Blob Storage."""
        try:
            blob_client = self.container_client.get_blob_client(remote_path)
            return blob_client.exists()
        except Exception as e:
            raise Exception(f"Azure file existence check failed: {e}") from e

    def get_file_url(self, remote_path: str, expires_in: int = 3600) -> str:
        """Generate SAS URL for Azure blob.

        Args:
            remote_path: Blob name
            expires_in: URL lifetime in seconds (default: 1 hour)

        Returns:
            Blob URL with an appended read-only SAS token

        Raises:
            FileNotFoundError: If the remote blob does not exist
            ValueError: If account credentials are unavailable
            Exception: If SAS generation fails
        """
        # utcnow() is deprecated and returns a naive datetime; use an
        # explicitly timezone-aware expiry instead.
        from datetime import timezone

        try:
            blob_client = self.container_client.get_blob_client(remote_path)

            if not blob_client.exists():
                raise FileNotFoundError(f"Remote file not found: {remote_path}")

            # SAS signing needs the raw account key; a connection string
            # without AccountKey= (e.g. SAS-based) cannot generate one.
            if not self.account_name or not self.account_key:
                raise ValueError(
                    "Account name and key are required for SAS URL generation"
                )

            sas_token = generate_blob_sas(
                account_name=self.account_name,
                container_name=self.container_name,
                blob_name=remote_path,
                account_key=self.account_key,
                permission=BlobSasPermissions(read=True),
                expiry=datetime.now(timezone.utc) + timedelta(seconds=expires_in)
            )

            return f"{blob_client.url}?{sas_token}"
        except FileNotFoundError:
            raise
        except Exception as e:
            raise Exception(f"Azure SAS URL generation failed: {e}") from e

    def copy_file(self, source_path: str, dest_path: str) -> None:
        """Copy file within Azure container (server-side copy).

        Polls the copy status until it leaves 'pending', bounded by
        _COPY_POLL_TIMEOUT so a stuck copy cannot hang the caller forever.

        Raises:
            FileNotFoundError: If the source blob does not exist
            Exception: If the copy fails or times out
        """
        import time  # hoisted: was re-imported on every poll iteration

        try:
            source_blob = self.container_client.get_blob_client(source_path)

            if not source_blob.exists():
                raise FileNotFoundError(f"Source file not found: {source_path}")

            dest_blob = self.container_client.get_blob_client(dest_path)

            # Start asynchronous server-side copy operation.
            dest_blob.start_copy_from_url(source_blob.url)

            # Wait for copy to complete (bounded).
            deadline = time.monotonic() + self._COPY_POLL_TIMEOUT
            properties = dest_blob.get_blob_properties()
            while properties.copy.status == 'pending':
                if time.monotonic() > deadline:
                    raise Exception(
                        f"Copy timed out after {self._COPY_POLL_TIMEOUT} seconds"
                    )
                time.sleep(self._COPY_POLL_INTERVAL)
                properties = dest_blob.get_blob_properties()

            if properties.copy.status != 'success':
                raise Exception(f"Copy failed with status: {properties.copy.status}")

        except FileNotFoundError:
            raise
        except Exception as e:
            raise Exception(f"Azure copy failed: {e}") from e
|
||||
275
src/skill_seekers/cli/storage/base_storage.py
Normal file
275
src/skill_seekers/cli/storage/base_storage.py
Normal file
@@ -0,0 +1,275 @@
|
||||
"""
|
||||
Base storage adaptor interface for cloud storage providers.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class StorageObject:
    """
    Represents a file/object in cloud storage.

    A provider-neutral record returned by the list/metadata operations of
    every storage adaptor (S3, GCS, Azure).

    Attributes:
        key: Object key/path in storage
        size: Size in bytes
        last_modified: Last modification timestamp
        etag: ETag/hash of object
        metadata: Additional metadata
    """

    # Object key/path within the bucket or container.
    key: str
    # Object size in bytes.
    size: int
    # ISO-8601 timestamp string, or None when the provider reported none.
    last_modified: Optional[str] = None
    # Provider ETag/content hash, or None if unavailable.
    etag: Optional[str] = None
    # Provider-specific key/value metadata attached to the object.
    metadata: Optional[Dict[str, str]] = None
|
||||
|
||||
|
||||
class BaseStorageAdaptor(ABC):
    """
    Abstract base class for cloud storage adaptors.

    Defines the provider-neutral contract that every concrete adaptor
    (S3, GCS, Azure, ...) implements, plus a few convenience operations
    built on top of the abstract primitives.
    """

    def __init__(self, **kwargs):
        """
        Initialize storage adaptor.

        Args:
            **kwargs: Provider-specific configuration, kept verbatim in
                ``self.config`` for subclasses to consume.
        """
        self.config = kwargs

    @abstractmethod
    def upload_file(
        self, local_path: str, remote_path: str, metadata: Optional[Dict[str, str]] = None
    ) -> str:
        """
        Upload a local file to cloud storage.

        Args:
            local_path: Path to local file
            remote_path: Destination path in cloud storage
            metadata: Optional metadata to attach to the file

        Returns:
            URL or identifier of the uploaded file

        Raises:
            FileNotFoundError: If the local file doesn't exist
            Exception: If the upload fails
        """
        ...

    @abstractmethod
    def download_file(self, remote_path: str, local_path: str) -> None:
        """
        Download a file from cloud storage to a local path.

        Args:
            remote_path: Path to file in cloud storage
            local_path: Destination path for the downloaded file

        Raises:
            FileNotFoundError: If the remote file doesn't exist
            Exception: If the download fails
        """
        ...

    @abstractmethod
    def delete_file(self, remote_path: str) -> None:
        """
        Delete a file from cloud storage.

        Args:
            remote_path: Path to file in cloud storage

        Raises:
            FileNotFoundError: If the remote file doesn't exist
            Exception: If the deletion fails
        """
        ...

    @abstractmethod
    def list_files(
        self, prefix: str = "", max_results: int = 1000
    ) -> List[StorageObject]:
        """
        List files in cloud storage.

        Args:
            prefix: Prefix to filter files (directory path)
            max_results: Maximum number of results to return

        Returns:
            List of StorageObject instances

        Raises:
            Exception: If listing fails
        """
        ...

    @abstractmethod
    def file_exists(self, remote_path: str) -> bool:
        """
        Check whether a file exists in cloud storage.

        Args:
            remote_path: Path to file in cloud storage

        Returns:
            True if the file exists, False otherwise
        """
        ...

    @abstractmethod
    def get_file_url(self, remote_path: str, expires_in: int = 3600) -> str:
        """
        Generate a signed URL for file access.

        Args:
            remote_path: Path to file in cloud storage
            expires_in: URL expiration time in seconds (default: 1 hour)

        Returns:
            Signed URL for file access

        Raises:
            FileNotFoundError: If the remote file doesn't exist
            Exception: If URL generation fails
        """
        ...

    def upload_directory(
        self, local_dir: str, remote_prefix: str = "", exclude_patterns: Optional[List[str]] = None
    ) -> List[str]:
        """
        Upload an entire directory tree to cloud storage.

        Args:
            local_dir: Path to local directory
            remote_prefix: Prefix prepended to every uploaded key
            exclude_patterns: Glob patterns; matching files are skipped

        Returns:
            List of remote keys that were uploaded

        Raises:
            NotADirectoryError: If local_dir is not a directory
            Exception: If any upload fails
        """
        source = Path(local_dir)
        if not source.is_dir():
            raise NotADirectoryError(f"Not a directory: {local_dir}")

        patterns = exclude_patterns or []
        uploaded = []

        for candidate in source.rglob("*"):
            # Directories are implicit in object storage; only upload files.
            if not candidate.is_file():
                continue
            if any(candidate.match(pattern) for pattern in patterns):
                continue

            # Remote key mirrors the path relative to the source directory.
            relative = candidate.relative_to(source)
            destination = f"{remote_prefix}/{relative}".lstrip("/")

            self.upload_file(str(candidate), destination)
            uploaded.append(destination)

        return uploaded

    def download_directory(
        self, remote_prefix: str, local_dir: str
    ) -> List[str]:
        """
        Download every file under a remote prefix into a local directory.

        Args:
            remote_prefix: Prefix of files to download
            local_dir: Destination directory (created if missing)

        Returns:
            List of local paths of the downloaded files

        Raises:
            Exception: If listing or any download fails
        """
        destination = Path(local_dir)
        destination.mkdir(parents=True, exist_ok=True)

        downloaded = []
        for remote_obj in self.list_files(prefix=remote_prefix):
            # Mirror the key's layout under the destination directory.
            relative = remote_obj.key.removeprefix(remote_prefix).lstrip("/")
            target = destination / relative
            target.parent.mkdir(parents=True, exist_ok=True)

            self.download_file(remote_obj.key, str(target))
            downloaded.append(str(target))

        return downloaded

    def get_file_size(self, remote_path: str) -> int:
        """
        Get the size of a file in cloud storage.

        Args:
            remote_path: Path to file in cloud storage

        Returns:
            File size in bytes

        Raises:
            FileNotFoundError: If the remote file doesn't exist
        """
        matches = self.list_files(prefix=remote_path, max_results=1)
        # The listing is prefix-based, so verify we got an exact key match.
        if not matches or matches[0].key != remote_path:
            raise FileNotFoundError(f"File not found: {remote_path}")
        return matches[0].size

    def copy_file(
        self, source_path: str, dest_path: str
    ) -> None:
        """
        Copy a file within cloud storage.

        Default implementation round-trips through a local temp file;
        subclasses may override with a provider-side copy.

        Args:
            source_path: Source file path
            dest_path: Destination file path

        Raises:
            FileNotFoundError: If the source file doesn't exist
            Exception: If the copy fails
        """
        import tempfile

        # Reserve a temp path; the file is closed immediately so
        # download_file can reopen it (important on Windows).
        with tempfile.NamedTemporaryFile(delete=False) as scratch:
            scratch_path = scratch.name

        try:
            self.download_file(source_path, scratch_path)
            self.upload_file(scratch_path, dest_path)
        finally:
            Path(scratch_path).unlink(missing_ok=True)
|
||||
194
src/skill_seekers/cli/storage/gcs_storage.py
Normal file
194
src/skill_seekers/cli/storage/gcs_storage.py
Normal file
@@ -0,0 +1,194 @@
|
||||
"""
|
||||
Google Cloud Storage (GCS) adaptor implementation.
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
from datetime import timedelta
|
||||
|
||||
try:
|
||||
from google.cloud import storage
|
||||
from google.cloud.exceptions import NotFound
|
||||
GCS_AVAILABLE = True
|
||||
except ImportError:
|
||||
GCS_AVAILABLE = False
|
||||
|
||||
from .base_storage import BaseStorageAdaptor, StorageObject
|
||||
|
||||
|
||||
class GCSStorageAdaptor(BaseStorageAdaptor):
    """
    Google Cloud Storage adaptor.

    Configuration:
        bucket: GCS bucket name (required)
        project: GCP project ID (optional, uses default)
        credentials_path: Path to service account JSON (optional)

    Environment Variables:
        GOOGLE_APPLICATION_CREDENTIALS: Path to service account JSON
        GOOGLE_CLOUD_PROJECT: GCP project ID

    Examples:
        # Using environment variables
        adaptor = GCSStorageAdaptor(bucket='my-bucket')

        # With explicit credentials
        adaptor = GCSStorageAdaptor(
            bucket='my-bucket',
            project='my-project',
            credentials_path='/path/to/credentials.json'
        )

        # Using default credentials
        adaptor = GCSStorageAdaptor(
            bucket='my-bucket',
            project='my-project'
        )
    """

    def __init__(self, **kwargs):
        """
        Initialize GCS storage adaptor.

        Args:
            bucket: GCS bucket name (required)
            **kwargs: Additional GCS configuration

        Raises:
            ImportError: If google-cloud-storage is not installed
            ValueError: If the bucket parameter is missing
        """
        super().__init__(**kwargs)

        if not GCS_AVAILABLE:
            raise ImportError(
                "google-cloud-storage is required for GCS storage. "
                "Install with: pip install google-cloud-storage"
            )

        if 'bucket' not in kwargs:
            raise ValueError("bucket parameter is required for GCS storage")

        self.bucket_name = kwargs['bucket']
        self.project = kwargs.get('project', os.getenv('GOOGLE_CLOUD_PROJECT'))

        # Initialize GCS client
        client_kwargs = {}
        if self.project:
            client_kwargs['project'] = self.project

        if 'credentials_path' in kwargs:
            # Load the service-account file directly rather than mutating
            # GOOGLE_APPLICATION_CREDENTIALS for the entire process (the
            # previous approach leaked into unrelated Google clients).
            self.storage_client = storage.Client.from_service_account_json(
                kwargs['credentials_path'], **client_kwargs
            )
        else:
            self.storage_client = storage.Client(**client_kwargs)

        self.bucket = self.storage_client.bucket(self.bucket_name)

    def upload_file(
        self, local_path: str, remote_path: str, metadata: Optional[Dict[str, str]] = None
    ) -> str:
        """Upload file to GCS.

        Args:
            local_path: Path to local file
            remote_path: Destination object name
            metadata: Optional metadata attached to the object

        Returns:
            gs:// URI of the uploaded object

        Raises:
            FileNotFoundError: If the local file does not exist
            Exception: If the upload fails
        """
        local_file = Path(local_path)
        if not local_file.exists():
            raise FileNotFoundError(f"Local file not found: {local_path}")

        try:
            blob = self.bucket.blob(remote_path)

            # Metadata must be set on the blob before the upload call.
            if metadata:
                blob.metadata = metadata

            blob.upload_from_filename(str(local_file))
            return f"gs://{self.bucket_name}/{remote_path}"
        except Exception as e:
            raise Exception(f"GCS upload failed: {e}") from e

    def download_file(self, remote_path: str, local_path: str) -> None:
        """Download file from GCS.

        Raises:
            FileNotFoundError: If the remote object does not exist
            Exception: If the download fails
        """
        local_file = Path(local_path)
        # Create intermediate directories so the write cannot fail.
        local_file.parent.mkdir(parents=True, exist_ok=True)

        try:
            blob = self.bucket.blob(remote_path)
            blob.download_to_filename(str(local_file))
        except NotFound:
            raise FileNotFoundError(f"Remote file not found: {remote_path}")
        except Exception as e:
            raise Exception(f"GCS download failed: {e}") from e

    def delete_file(self, remote_path: str) -> None:
        """Delete file from GCS.

        Raises:
            FileNotFoundError: If the remote object does not exist
            Exception: If the deletion fails
        """
        try:
            blob = self.bucket.blob(remote_path)
            blob.delete()
        except NotFound:
            raise FileNotFoundError(f"Remote file not found: {remote_path}")
        except Exception as e:
            raise Exception(f"GCS deletion failed: {e}") from e

    def list_files(
        self, prefix: str = "", max_results: int = 1000
    ) -> List[StorageObject]:
        """List files in GCS bucket.

        Args:
            prefix: Only objects whose names start with this prefix
            max_results: Maximum number of results to return

        Returns:
            List of StorageObject instances

        Raises:
            Exception: If listing fails
        """
        try:
            blobs = self.storage_client.list_blobs(
                self.bucket_name,
                prefix=prefix,
                max_results=max_results
            )

            files = []
            for blob in blobs:
                files.append(StorageObject(
                    key=blob.name,
                    size=blob.size,
                    last_modified=blob.updated.isoformat() if blob.updated else None,
                    etag=blob.etag,
                    metadata=blob.metadata
                ))

            return files
        except Exception as e:
            raise Exception(f"GCS listing failed: {e}") from e

    def file_exists(self, remote_path: str) -> bool:
        """Check if file exists in GCS."""
        try:
            blob = self.bucket.blob(remote_path)
            return blob.exists()
        except Exception as e:
            raise Exception(f"GCS file existence check failed: {e}") from e

    def get_file_url(self, remote_path: str, expires_in: int = 3600) -> str:
        """Generate signed URL for GCS object.

        Args:
            remote_path: Object name
            expires_in: URL lifetime in seconds (default: 1 hour)

        Returns:
            V4 signed GET URL

        Raises:
            FileNotFoundError: If the remote object does not exist
            Exception: If URL generation fails
        """
        try:
            blob = self.bucket.blob(remote_path)

            if not blob.exists():
                raise FileNotFoundError(f"Remote file not found: {remote_path}")

            url = blob.generate_signed_url(
                version="v4",
                expiration=timedelta(seconds=expires_in),
                method="GET"
            )
            return url
        except FileNotFoundError:
            raise
        except Exception as e:
            raise Exception(f"GCS signed URL generation failed: {e}") from e

    def copy_file(self, source_path: str, dest_path: str) -> None:
        """Copy file within GCS bucket (server-side copy).

        Raises:
            FileNotFoundError: If the source object does not exist
            Exception: If the copy fails
        """
        try:
            source_blob = self.bucket.blob(source_path)

            if not source_blob.exists():
                raise FileNotFoundError(f"Source file not found: {source_path}")

            self.bucket.copy_blob(
                source_blob,
                self.bucket,
                dest_path
            )
        except FileNotFoundError:
            raise
        except Exception as e:
            raise Exception(f"GCS copy failed: {e}") from e
|
||||
216
src/skill_seekers/cli/storage/s3_storage.py
Normal file
216
src/skill_seekers/cli/storage/s3_storage.py
Normal file
@@ -0,0 +1,216 @@
|
||||
"""
|
||||
AWS S3 storage adaptor implementation.
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
try:
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError
|
||||
BOTO3_AVAILABLE = True
|
||||
except ImportError:
|
||||
BOTO3_AVAILABLE = False
|
||||
|
||||
from .base_storage import BaseStorageAdaptor, StorageObject
|
||||
|
||||
|
||||
class S3StorageAdaptor(BaseStorageAdaptor):
    """
    AWS S3 storage adaptor.

    Configuration:
        bucket: S3 bucket name (required)
        region: AWS region (optional, default: us-east-1)
        aws_access_key_id: AWS access key (optional, uses env/credentials)
        aws_secret_access_key: AWS secret key (optional, uses env/credentials)
        endpoint_url: Custom endpoint URL (optional, for S3-compatible services)

    Environment Variables:
        AWS_ACCESS_KEY_ID: AWS access key
        AWS_SECRET_ACCESS_KEY: AWS secret key
        AWS_DEFAULT_REGION: AWS region

    Examples:
        # Using environment variables
        adaptor = S3StorageAdaptor(bucket='my-bucket')

        # With explicit credentials
        adaptor = S3StorageAdaptor(
            bucket='my-bucket',
            region='us-west-2',
            aws_access_key_id='AKIAIOSFODNN7EXAMPLE',
            aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'
        )

        # S3-compatible service (MinIO, DigitalOcean Spaces)
        adaptor = S3StorageAdaptor(
            bucket='my-bucket',
            endpoint_url='https://nyc3.digitaloceanspaces.com',
            aws_access_key_id='...',
            aws_secret_access_key='...'
        )
    """

    # Error codes the S3 API uses for a missing object; HeadObject reports
    # '404' while GetObject/CopyObject report 'NoSuchKey'.
    _NOT_FOUND_CODES = ('404', 'NoSuchKey')

    def __init__(self, **kwargs):
        """
        Initialize S3 storage adaptor.

        Args:
            bucket: S3 bucket name (required)
            **kwargs: Additional S3 configuration

        Raises:
            ImportError: If boto3 is not installed
            ValueError: If the bucket parameter is missing
        """
        super().__init__(**kwargs)

        if not BOTO3_AVAILABLE:
            raise ImportError(
                "boto3 is required for S3 storage. "
                "Install with: pip install boto3"
            )

        if 'bucket' not in kwargs:
            raise ValueError("bucket parameter is required for S3 storage")

        self.bucket = kwargs['bucket']
        self.region = kwargs.get('region', os.getenv('AWS_DEFAULT_REGION', 'us-east-1'))

        # Initialize S3 client; credentials fall back to boto3's default
        # resolution chain (env vars, shared credentials file, IAM role).
        client_kwargs = {
            'region_name': self.region,
        }

        if 'endpoint_url' in kwargs:
            client_kwargs['endpoint_url'] = kwargs['endpoint_url']

        if 'aws_access_key_id' in kwargs:
            client_kwargs['aws_access_key_id'] = kwargs['aws_access_key_id']

        if 'aws_secret_access_key' in kwargs:
            client_kwargs['aws_secret_access_key'] = kwargs['aws_secret_access_key']

        self.s3_client = boto3.client('s3', **client_kwargs)
        self.s3_resource = boto3.resource('s3', **client_kwargs)

    def upload_file(
        self, local_path: str, remote_path: str, metadata: Optional[Dict[str, str]] = None
    ) -> str:
        """Upload file to S3.

        Args:
            local_path: Path to local file
            remote_path: Destination key
            metadata: Optional metadata attached to the object

        Returns:
            s3:// URI of the uploaded object

        Raises:
            FileNotFoundError: If the local file does not exist
            Exception: If the upload fails
        """
        local_file = Path(local_path)
        if not local_file.exists():
            raise FileNotFoundError(f"Local file not found: {local_path}")

        extra_args = {}
        if metadata:
            extra_args['Metadata'] = metadata

        try:
            self.s3_client.upload_file(
                str(local_file),
                self.bucket,
                remote_path,
                ExtraArgs=extra_args or None
            )
            return f"s3://{self.bucket}/{remote_path}"
        except ClientError as e:
            raise Exception(f"S3 upload failed: {e}") from e

    def download_file(self, remote_path: str, local_path: str) -> None:
        """Download file from S3.

        Raises:
            FileNotFoundError: If the remote object does not exist
            Exception: If the download fails
        """
        local_file = Path(local_path)
        # Create intermediate directories so the write cannot fail.
        local_file.parent.mkdir(parents=True, exist_ok=True)

        try:
            self.s3_client.download_file(
                self.bucket,
                remote_path,
                str(local_file)
            )
        except ClientError as e:
            if e.response['Error']['Code'] in self._NOT_FOUND_CODES:
                raise FileNotFoundError(f"Remote file not found: {remote_path}")
            raise Exception(f"S3 download failed: {e}") from e

    def delete_file(self, remote_path: str) -> None:
        """Delete file from S3.

        Note: S3 DeleteObject is idempotent and does not report a missing
        key, so no FileNotFoundError is raised here.

        Raises:
            Exception: If the deletion fails
        """
        try:
            self.s3_client.delete_object(
                Bucket=self.bucket,
                Key=remote_path
            )
        except ClientError as e:
            raise Exception(f"S3 deletion failed: {e}") from e

    def list_files(
        self, prefix: str = "", max_results: int = 1000
    ) -> List[StorageObject]:
        """List files in S3 bucket.

        Args:
            prefix: Only keys starting with this prefix are returned
            max_results: Maximum number of results to return

        Returns:
            List of StorageObject instances

        Raises:
            Exception: If listing fails
        """
        try:
            # Paginate so results beyond a single ListObjectsV2 page
            # (1000 keys) are still collected up to max_results.
            paginator = self.s3_client.get_paginator('list_objects_v2')
            page_iterator = paginator.paginate(
                Bucket=self.bucket,
                Prefix=prefix,
                PaginationConfig={'MaxItems': max_results}
            )

            files = []
            for page in page_iterator:
                # An empty bucket/prefix yields pages without 'Contents'.
                if 'Contents' not in page:
                    continue

                for obj in page['Contents']:
                    files.append(StorageObject(
                        key=obj['Key'],
                        size=obj['Size'],
                        last_modified=obj['LastModified'].isoformat(),
                        etag=obj.get('ETag', '').strip('"')
                    ))

            return files
        except ClientError as e:
            raise Exception(f"S3 listing failed: {e}") from e

    def file_exists(self, remote_path: str) -> bool:
        """Check if file exists in S3."""
        try:
            self.s3_client.head_object(
                Bucket=self.bucket,
                Key=remote_path
            )
            return True
        except ClientError as e:
            if e.response['Error']['Code'] in self._NOT_FOUND_CODES:
                return False
            raise Exception(f"S3 head_object failed: {e}") from e

    def get_file_url(self, remote_path: str, expires_in: int = 3600) -> str:
        """Generate presigned URL for S3 object.

        Args:
            remote_path: Object key
            expires_in: URL lifetime in seconds (default: 1 hour)

        Returns:
            Presigned GET URL

        Raises:
            Exception: If URL generation fails
        """
        try:
            url = self.s3_client.generate_presigned_url(
                'get_object',
                Params={
                    'Bucket': self.bucket,
                    'Key': remote_path
                },
                ExpiresIn=expires_in
            )
            return url
        except ClientError as e:
            raise Exception(f"S3 presigned URL generation failed: {e}") from e

    def copy_file(self, source_path: str, dest_path: str) -> None:
        """Copy file within S3 bucket (server-side copy).

        Raises:
            FileNotFoundError: If the source object does not exist
            Exception: If the copy fails
        """
        try:
            copy_source = {
                'Bucket': self.bucket,
                'Key': source_path
            }
            self.s3_client.copy_object(
                CopySource=copy_source,
                Bucket=self.bucket,
                Key=dest_path
            )
        except ClientError as e:
            # Bug fix: CopyObject reports a missing source as 'NoSuchKey',
            # not '404', so the original check never fired.
            if e.response['Error']['Code'] in self._NOT_FOUND_CODES:
                raise FileNotFoundError(f"Source file not found: {source_path}")
            raise Exception(f"S3 copy failed: {e}") from e
|
||||
Reference in New Issue
Block a user