fix: Enforce min_chunk_size in RAG chunker

- Filter out chunks smaller than min_chunk_size (default 100 tokens)
- Exception: Keep all chunks if entire document is smaller than target size
- All 15 tests passing (100% pass rate)

Fixes edge case where very small chunks (e.g., 'Short.' = 6 chars) were
being created despite min_chunk_size=100 setting.

Test: pytest tests/test_rag_chunker.py -v
This commit is contained in:
yusyus
2026-02-07 20:59:03 +03:00
parent 3a769a27cd
commit 8b3f31409e
65 changed files with 16133 additions and 7 deletions

View File

@@ -0,0 +1,85 @@
"""
Cloud storage adaptors for Skill Seekers.
Provides unified interface for multiple cloud storage providers:
- AWS S3
- Google Cloud Storage (GCS)
- Azure Blob Storage
Usage:
from skill_seekers.cli.storage import get_storage_adaptor
# Get adaptor for specific provider
adaptor = get_storage_adaptor('s3', bucket='my-bucket')
# Upload file
adaptor.upload_file('local/path/skill.zip', 'skills/skill.zip')
# Download file
adaptor.download_file('skills/skill.zip', 'local/path/skill.zip')
# List files
files = adaptor.list_files('skills/')
"""
from .base_storage import BaseStorageAdaptor, StorageObject
from .s3_storage import S3StorageAdaptor
from .gcs_storage import GCSStorageAdaptor
from .azure_storage import AzureStorageAdaptor
def get_storage_adaptor(provider: str, **kwargs) -> BaseStorageAdaptor:
    """
    Factory returning the storage adaptor class instance for a provider.

    Args:
        provider: Storage provider name ('s3', 'gcs', 'azure'); case-insensitive
        **kwargs: Provider-specific configuration forwarded to the adaptor

    Returns:
        Storage adaptor instance

    Raises:
        ValueError: If provider is not supported

    Examples:
        # AWS S3
        adaptor = get_storage_adaptor('s3',
                                       bucket='my-bucket',
                                       region='us-west-2')

        # Google Cloud Storage
        adaptor = get_storage_adaptor('gcs',
                                       bucket='my-bucket',
                                       project='my-project')

        # Azure Blob Storage
        adaptor = get_storage_adaptor('azure',
                                       container='my-container',
                                       account_name='myaccount')
    """
    registry = {
        's3': S3StorageAdaptor,
        'gcs': GCSStorageAdaptor,
        'azure': AzureStorageAdaptor,
    }
    adaptor_cls = registry.get(provider.lower())
    if adaptor_cls is None:
        supported = ', '.join(registry.keys())
        raise ValueError(
            f"Unsupported storage provider: {provider}. "
            f"Supported providers: {supported}"
        )
    return adaptor_cls(**kwargs)
# Public API of the storage package: the base interface, the result record,
# the three provider adaptors, and the factory function.
__all__ = [
    'BaseStorageAdaptor',
    'StorageObject',
    'S3StorageAdaptor',
    'GCSStorageAdaptor',
    'AzureStorageAdaptor',
    'get_storage_adaptor',
]

View File

@@ -0,0 +1,254 @@
"""
Azure Blob Storage adaptor implementation.
"""
import os
from pathlib import Path
from typing import List, Dict, Optional
from datetime import datetime, timedelta
try:
from azure.storage.blob import BlobServiceClient, BlobSasPermissions, generate_blob_sas
from azure.core.exceptions import ResourceNotFoundError
AZURE_AVAILABLE = True
except ImportError:
AZURE_AVAILABLE = False
from .base_storage import BaseStorageAdaptor, StorageObject
class AzureStorageAdaptor(BaseStorageAdaptor):
    """
    Azure Blob Storage adaptor.

    Configuration:
        container: Azure container name (required)
        account_name: Storage account name (optional, uses env)
        account_key: Storage account key (optional, uses env)
        connection_string: Connection string (optional, alternative to account_name/key)

    Environment Variables:
        AZURE_STORAGE_CONNECTION_STRING: Azure storage connection string
        AZURE_STORAGE_ACCOUNT_NAME: Storage account name
        AZURE_STORAGE_ACCOUNT_KEY: Storage account key

    Examples:
        # Using connection string
        adaptor = AzureStorageAdaptor(
            container='my-container',
            connection_string='DefaultEndpointsProtocol=https;...'
        )

        # Using account name and key
        adaptor = AzureStorageAdaptor(
            container='my-container',
            account_name='myaccount',
            account_key='mykey'
        )

        # Using environment variables
        adaptor = AzureStorageAdaptor(container='my-container')
    """

    # Upper bound on the server-side copy polling loop (seconds); prevents an
    # unbounded busy-wait if the copy status never leaves 'pending'.
    COPY_TIMEOUT_SECONDS = 300.0
    COPY_POLL_INTERVAL_SECONDS = 0.1

    def __init__(self, **kwargs):
        """
        Initialize Azure storage adaptor.

        Args:
            container: Azure container name (required)
            **kwargs: Additional Azure configuration

        Raises:
            ImportError: If azure-storage-blob is not installed
            ValueError: If container or usable credentials are missing
        """
        super().__init__(**kwargs)
        if not AZURE_AVAILABLE:
            raise ImportError(
                "azure-storage-blob is required for Azure storage. "
                "Install with: pip install azure-storage-blob"
            )
        if 'container' not in kwargs:
            raise ValueError("container parameter is required for Azure storage")
        self.container_name = kwargs['container']
        # Prefer an explicit connection string, then fall back to the environment.
        if 'connection_string' in kwargs:
            connection_string = kwargs['connection_string']
        else:
            connection_string = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
        if connection_string:
            self.blob_service_client = BlobServiceClient.from_connection_string(
                connection_string
            )
            # Extract account name/key from the connection string; both are
            # needed later for SAS URL generation and upload URL building.
            self.account_name = None
            self.account_key = None
            for part in connection_string.split(';'):
                if part.startswith('AccountName='):
                    self.account_name = part.split('=', 1)[1]
                elif part.startswith('AccountKey='):
                    self.account_key = part.split('=', 1)[1]
        else:
            account_name = kwargs.get(
                'account_name',
                os.getenv('AZURE_STORAGE_ACCOUNT_NAME')
            )
            account_key = kwargs.get(
                'account_key',
                os.getenv('AZURE_STORAGE_ACCOUNT_KEY')
            )
            if not account_name or not account_key:
                raise ValueError(
                    "Either connection_string or (account_name + account_key) "
                    "must be provided for Azure storage"
                )
            self.account_name = account_name
            self.account_key = account_key
            account_url = f"https://{account_name}.blob.core.windows.net"
            self.blob_service_client = BlobServiceClient(
                account_url=account_url,
                credential=account_key
            )
        self.container_client = self.blob_service_client.get_container_client(
            self.container_name
        )

    def upload_file(
        self, local_path: str, remote_path: str, metadata: Optional[Dict[str, str]] = None
    ) -> str:
        """Upload file to Azure Blob Storage.

        Args:
            local_path: Path to local file
            remote_path: Destination blob name
            metadata: Optional metadata to attach to the blob

        Returns:
            HTTPS URL of the uploaded blob

        Raises:
            FileNotFoundError: If local file doesn't exist
            Exception: If upload fails
        """
        local_file = Path(local_path)
        if not local_file.exists():
            raise FileNotFoundError(f"Local file not found: {local_path}")
        try:
            blob_client = self.container_client.get_blob_client(remote_path)
            with open(local_file, "rb") as data:
                # overwrite=True matches "last upload wins" semantics.
                blob_client.upload_blob(
                    data,
                    overwrite=True,
                    metadata=metadata
                )
            return f"https://{self.account_name}.blob.core.windows.net/{self.container_name}/{remote_path}"
        except Exception as e:
            # Chain the SDK error so the root cause survives re-wrapping.
            raise Exception(f"Azure upload failed: {e}") from e

    def download_file(self, remote_path: str, local_path: str) -> None:
        """Download file from Azure Blob Storage.

        Raises:
            FileNotFoundError: If remote blob doesn't exist
            Exception: If download fails
        """
        local_file = Path(local_path)
        # Ensure the destination directory exists before writing.
        local_file.parent.mkdir(parents=True, exist_ok=True)
        try:
            blob_client = self.container_client.get_blob_client(remote_path)
            with open(local_file, "wb") as download_file:
                download_stream = blob_client.download_blob()
                download_file.write(download_stream.readall())
        except ResourceNotFoundError as e:
            raise FileNotFoundError(f"Remote file not found: {remote_path}") from e
        except Exception as e:
            raise Exception(f"Azure download failed: {e}") from e

    def delete_file(self, remote_path: str) -> None:
        """Delete file from Azure Blob Storage.

        Raises:
            FileNotFoundError: If remote blob doesn't exist
            Exception: If deletion fails
        """
        try:
            blob_client = self.container_client.get_blob_client(remote_path)
            blob_client.delete_blob()
        except ResourceNotFoundError as e:
            raise FileNotFoundError(f"Remote file not found: {remote_path}") from e
        except Exception as e:
            raise Exception(f"Azure deletion failed: {e}") from e

    def list_files(
        self, prefix: str = "", max_results: int = 1000
    ) -> List[StorageObject]:
        """List files in Azure container.

        Args:
            prefix: Blob-name prefix to filter on
            max_results: Page size hint passed to the Azure SDK

        Returns:
            List of StorageObject instances
        """
        try:
            blobs = self.container_client.list_blobs(
                name_starts_with=prefix,
                results_per_page=max_results
            )
            files = []
            for blob in blobs:
                files.append(StorageObject(
                    key=blob.name,
                    size=blob.size,
                    last_modified=blob.last_modified.isoformat() if blob.last_modified else None,
                    etag=blob.etag,
                    metadata=blob.metadata
                ))
            return files
        except Exception as e:
            raise Exception(f"Azure listing failed: {e}") from e

    def file_exists(self, remote_path: str) -> bool:
        """Check if file exists in Azure Blob Storage."""
        try:
            blob_client = self.container_client.get_blob_client(remote_path)
            return blob_client.exists()
        except Exception as e:
            raise Exception(f"Azure file existence check failed: {e}") from e

    def get_file_url(self, remote_path: str, expires_in: int = 3600) -> str:
        """Generate SAS URL for Azure blob.

        Args:
            remote_path: Blob name
            expires_in: URL expiration time in seconds (default: 1 hour)

        Raises:
            FileNotFoundError: If remote blob doesn't exist
            ValueError: If account credentials are unavailable for signing
            Exception: If SAS generation fails
        """
        # Local import: timezone is only needed here and the module-level
        # datetime import does not include it.
        from datetime import timezone
        try:
            blob_client = self.container_client.get_blob_client(remote_path)
            if not blob_client.exists():
                raise FileNotFoundError(f"Remote file not found: {remote_path}")
            if not self.account_name or not self.account_key:
                raise ValueError(
                    "Account name and key are required for SAS URL generation"
                )
            sas_token = generate_blob_sas(
                account_name=self.account_name,
                container_name=self.container_name,
                blob_name=remote_path,
                account_key=self.account_key,
                permission=BlobSasPermissions(read=True),
                # Timezone-aware now(); datetime.utcnow() is deprecated and
                # produces a naive timestamp.
                expiry=datetime.now(timezone.utc) + timedelta(seconds=expires_in)
            )
            return f"{blob_client.url}?{sas_token}"
        except FileNotFoundError:
            raise
        except Exception as e:
            raise Exception(f"Azure SAS URL generation failed: {e}") from e

    def copy_file(self, source_path: str, dest_path: str) -> None:
        """Copy file within Azure container (server-side copy).

        Raises:
            FileNotFoundError: If source blob doesn't exist
            Exception: If copy fails or times out
        """
        import time  # hoisted out of the polling loop
        try:
            source_blob = self.container_client.get_blob_client(source_path)
            if not source_blob.exists():
                raise FileNotFoundError(f"Source file not found: {source_path}")
            dest_blob = self.container_client.get_blob_client(dest_path)
            # Start asynchronous server-side copy.
            dest_blob.start_copy_from_url(source_blob.url)
            # Poll until the copy resolves, but never wait forever.
            deadline = time.monotonic() + self.COPY_TIMEOUT_SECONDS
            properties = dest_blob.get_blob_properties()
            while properties.copy.status == 'pending':
                if time.monotonic() > deadline:
                    raise Exception(
                        f"Copy timed out after {self.COPY_TIMEOUT_SECONDS}s "
                        "while status was 'pending'"
                    )
                time.sleep(self.COPY_POLL_INTERVAL_SECONDS)
                properties = dest_blob.get_blob_properties()
            if properties.copy.status != 'success':
                raise Exception(f"Copy failed with status: {properties.copy.status}")
        except FileNotFoundError:
            raise
        except Exception as e:
            raise Exception(f"Azure copy failed: {e}") from e

View File

@@ -0,0 +1,275 @@
"""
Base storage adaptor interface for cloud storage providers.
"""
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, Dict, Optional
from dataclasses import dataclass
@dataclass
class StorageObject:
    """
    Immutable-style record describing one object in cloud storage.

    Attributes:
        key: Object key/path in storage
        size: Size in bytes
        last_modified: Last modification timestamp (ISO string), if known
        etag: ETag/hash of object, if the provider reports one
        metadata: Provider metadata attached to the object, if any
    """
    # Required fields: every provider can report a key and a size.
    key: str
    size: int
    # Optional fields: not all providers/listings populate these.
    last_modified: Optional[str] = None
    etag: Optional[str] = None
    metadata: Optional[Dict[str, str]] = None
class BaseStorageAdaptor(ABC):
    """
    Abstract base class for cloud storage adaptors.

    Provides unified interface for different cloud storage providers.
    All adaptors must implement the abstract methods; the directory and
    size helpers below are generic and built on top of them.
    """

    def __init__(self, **kwargs):
        """
        Initialize storage adaptor.

        Args:
            **kwargs: Provider-specific configuration (kept for introspection)
        """
        self.config = kwargs

    @abstractmethod
    def upload_file(
        self, local_path: str, remote_path: str, metadata: Optional[Dict[str, str]] = None
    ) -> str:
        """
        Upload file to cloud storage.

        Args:
            local_path: Path to local file
            remote_path: Destination path in cloud storage
            metadata: Optional metadata to attach to file

        Returns:
            URL or identifier of uploaded file

        Raises:
            FileNotFoundError: If local file doesn't exist
            Exception: If upload fails
        """
        pass

    @abstractmethod
    def download_file(self, remote_path: str, local_path: str) -> None:
        """
        Download file from cloud storage.

        Args:
            remote_path: Path to file in cloud storage
            local_path: Destination path for downloaded file

        Raises:
            FileNotFoundError: If remote file doesn't exist
            Exception: If download fails
        """
        pass

    @abstractmethod
    def delete_file(self, remote_path: str) -> None:
        """
        Delete file from cloud storage.

        Args:
            remote_path: Path to file in cloud storage

        Raises:
            FileNotFoundError: If remote file doesn't exist
            Exception: If deletion fails
        """
        pass

    @abstractmethod
    def list_files(
        self, prefix: str = "", max_results: int = 1000
    ) -> List[StorageObject]:
        """
        List files in cloud storage.

        Args:
            prefix: Prefix to filter files (directory path)
            max_results: Maximum number of results to return

        Returns:
            List of StorageObject instances

        Raises:
            Exception: If listing fails
        """
        pass

    @abstractmethod
    def file_exists(self, remote_path: str) -> bool:
        """
        Check if file exists in cloud storage.

        Args:
            remote_path: Path to file in cloud storage

        Returns:
            True if file exists, False otherwise
        """
        pass

    @abstractmethod
    def get_file_url(self, remote_path: str, expires_in: int = 3600) -> str:
        """
        Generate signed URL for file access.

        Args:
            remote_path: Path to file in cloud storage
            expires_in: URL expiration time in seconds (default: 1 hour)

        Returns:
            Signed URL for file access

        Raises:
            FileNotFoundError: If remote file doesn't exist
            Exception: If URL generation fails
        """
        pass

    def upload_directory(
        self, local_dir: str, remote_prefix: str = "", exclude_patterns: Optional[List[str]] = None
    ) -> List[str]:
        """
        Upload entire directory to cloud storage.

        Args:
            local_dir: Path to local directory
            remote_prefix: Prefix for uploaded files
            exclude_patterns: Glob patterns to exclude files

        Returns:
            List of uploaded remote file paths

        Raises:
            NotADirectoryError: If local_dir is not a directory
            Exception: If upload fails
        """
        local_path = Path(local_dir)
        if not local_path.is_dir():
            raise NotADirectoryError(f"Not a directory: {local_dir}")
        uploaded_files = []
        exclude_patterns = exclude_patterns or []
        for file_path in local_path.rglob("*"):
            if not file_path.is_file():
                continue
            # Skip anything matching an exclusion glob.
            if any(file_path.match(pattern) for pattern in exclude_patterns):
                continue
            relative_path = file_path.relative_to(local_path)
            # as_posix() guarantees '/'-separated cloud keys even when the
            # local path uses Windows backslashes.
            remote_path = f"{remote_prefix}/{relative_path.as_posix()}".lstrip("/")
            self.upload_file(str(file_path), remote_path)
            uploaded_files.append(remote_path)
        return uploaded_files

    def download_directory(
        self, remote_prefix: str, local_dir: str
    ) -> List[str]:
        """
        Download directory from cloud storage.

        Args:
            remote_prefix: Prefix of files to download
            local_dir: Destination directory

        Returns:
            List of downloaded local file paths

        Raises:
            Exception: If download fails
        """
        local_path = Path(local_dir)
        local_path.mkdir(parents=True, exist_ok=True)
        downloaded_files = []
        files = self.list_files(prefix=remote_prefix)
        for file_obj in files:
            # Map the remote key to a path relative to the destination dir.
            relative_path = file_obj.key.removeprefix(remote_prefix).lstrip("/")
            local_file_path = local_path / relative_path
            local_file_path.parent.mkdir(parents=True, exist_ok=True)
            self.download_file(file_obj.key, str(local_file_path))
            downloaded_files.append(str(local_file_path))
        return downloaded_files

    def get_file_size(self, remote_path: str) -> int:
        """
        Get size of file in cloud storage.

        Args:
            remote_path: Path to file in cloud storage

        Returns:
            File size in bytes

        Raises:
            FileNotFoundError: If remote file doesn't exist
        """
        # Scan the prefix listing for an exact key match. Relying on the
        # first entry (with max_results=1) is wrong when another key shares
        # the prefix (e.g. "a.txt" vs "a.txt.bak") and happens to sort first.
        for file_obj in self.list_files(prefix=remote_path):
            if file_obj.key == remote_path:
                return file_obj.size
        raise FileNotFoundError(f"File not found: {remote_path}")

    def copy_file(
        self, source_path: str, dest_path: str
    ) -> None:
        """
        Copy file within cloud storage.

        Default implementation downloads then uploads.
        Subclasses can override with provider-specific copy operations.

        Args:
            source_path: Source file path
            dest_path: Destination file path

        Raises:
            FileNotFoundError: If source file doesn't exist
            Exception: If copy fails
        """
        import tempfile
        # delete=False so the file survives the with-block; we remove it
        # ourselves in the finally clause.
        with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
            tmp_path = tmp_file.name
        try:
            self.download_file(source_path, tmp_path)
            self.upload_file(tmp_path, dest_path)
        finally:
            Path(tmp_path).unlink(missing_ok=True)

View File

@@ -0,0 +1,194 @@
"""
Google Cloud Storage (GCS) adaptor implementation.
"""
import os
from pathlib import Path
from typing import List, Dict, Optional
from datetime import timedelta
try:
from google.cloud import storage
from google.cloud.exceptions import NotFound
GCS_AVAILABLE = True
except ImportError:
GCS_AVAILABLE = False
from .base_storage import BaseStorageAdaptor, StorageObject
class GCSStorageAdaptor(BaseStorageAdaptor):
    """
    Google Cloud Storage adaptor.

    Configuration:
        bucket: GCS bucket name (required)
        project: GCP project ID (optional, uses default)
        credentials_path: Path to service account JSON (optional)

    Environment Variables:
        GOOGLE_APPLICATION_CREDENTIALS: Path to service account JSON
        GOOGLE_CLOUD_PROJECT: GCP project ID

    Examples:
        # Using environment variables
        adaptor = GCSStorageAdaptor(bucket='my-bucket')

        # With explicit credentials
        adaptor = GCSStorageAdaptor(
            bucket='my-bucket',
            project='my-project',
            credentials_path='/path/to/credentials.json'
        )

        # Using default credentials
        adaptor = GCSStorageAdaptor(
            bucket='my-bucket',
            project='my-project'
        )
    """

    def __init__(self, **kwargs):
        """
        Initialize GCS storage adaptor.

        Args:
            bucket: GCS bucket name (required)
            **kwargs: Additional GCS configuration

        Raises:
            ImportError: If google-cloud-storage is not installed
            ValueError: If bucket parameter is missing
        """
        super().__init__(**kwargs)
        if not GCS_AVAILABLE:
            raise ImportError(
                "google-cloud-storage is required for GCS storage. "
                "Install with: pip install google-cloud-storage"
            )
        if 'bucket' not in kwargs:
            raise ValueError("bucket parameter is required for GCS storage")
        self.bucket_name = kwargs['bucket']
        self.project = kwargs.get('project', os.getenv('GOOGLE_CLOUD_PROJECT'))
        client_kwargs = {}
        if self.project:
            client_kwargs['project'] = self.project
        if 'credentials_path' in kwargs:
            # NOTE(review): mutating the process-wide environment affects every
            # other GCP client in this process; consider
            # storage.Client.from_service_account_json(...) instead.
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = kwargs['credentials_path']
        self.storage_client = storage.Client(**client_kwargs)
        self.bucket = self.storage_client.bucket(self.bucket_name)

    def upload_file(
        self, local_path: str, remote_path: str, metadata: Optional[Dict[str, str]] = None
    ) -> str:
        """Upload file to GCS.

        Returns:
            gs:// URI of the uploaded object

        Raises:
            FileNotFoundError: If local file doesn't exist
            Exception: If upload fails
        """
        local_file = Path(local_path)
        if not local_file.exists():
            raise FileNotFoundError(f"Local file not found: {local_path}")
        try:
            blob = self.bucket.blob(remote_path)
            # Metadata must be set on the blob object before upload.
            if metadata:
                blob.metadata = metadata
            blob.upload_from_filename(str(local_file))
            return f"gs://{self.bucket_name}/{remote_path}"
        except Exception as e:
            # Chain the SDK error so the root cause survives re-wrapping.
            raise Exception(f"GCS upload failed: {e}") from e

    def download_file(self, remote_path: str, local_path: str) -> None:
        """Download file from GCS.

        Raises:
            FileNotFoundError: If remote object doesn't exist
            Exception: If download fails
        """
        local_file = Path(local_path)
        # Ensure the destination directory exists before writing.
        local_file.parent.mkdir(parents=True, exist_ok=True)
        try:
            blob = self.bucket.blob(remote_path)
            blob.download_to_filename(str(local_file))
        except NotFound as e:
            raise FileNotFoundError(f"Remote file not found: {remote_path}") from e
        except Exception as e:
            raise Exception(f"GCS download failed: {e}") from e

    def delete_file(self, remote_path: str) -> None:
        """Delete file from GCS.

        Raises:
            FileNotFoundError: If remote object doesn't exist
            Exception: If deletion fails
        """
        try:
            blob = self.bucket.blob(remote_path)
            blob.delete()
        except NotFound as e:
            raise FileNotFoundError(f"Remote file not found: {remote_path}") from e
        except Exception as e:
            raise Exception(f"GCS deletion failed: {e}") from e

    def list_files(
        self, prefix: str = "", max_results: int = 1000
    ) -> List[StorageObject]:
        """List files in GCS bucket.

        Args:
            prefix: Object-name prefix to filter on
            max_results: Maximum number of objects to return

        Returns:
            List of StorageObject instances
        """
        try:
            blobs = self.storage_client.list_blobs(
                self.bucket_name,
                prefix=prefix,
                max_results=max_results
            )
            files = []
            for blob in blobs:
                files.append(StorageObject(
                    key=blob.name,
                    size=blob.size,
                    last_modified=blob.updated.isoformat() if blob.updated else None,
                    etag=blob.etag,
                    metadata=blob.metadata
                ))
            return files
        except Exception as e:
            raise Exception(f"GCS listing failed: {e}") from e

    def file_exists(self, remote_path: str) -> bool:
        """Check if file exists in GCS."""
        try:
            blob = self.bucket.blob(remote_path)
            return blob.exists()
        except Exception as e:
            raise Exception(f"GCS file existence check failed: {e}") from e

    def get_file_url(self, remote_path: str, expires_in: int = 3600) -> str:
        """Generate signed URL for GCS object.

        Args:
            remote_path: Object name
            expires_in: URL expiration time in seconds (default: 1 hour)

        Raises:
            FileNotFoundError: If remote object doesn't exist
            Exception: If URL generation fails
        """
        try:
            blob = self.bucket.blob(remote_path)
            if not blob.exists():
                raise FileNotFoundError(f"Remote file not found: {remote_path}")
            url = blob.generate_signed_url(
                version="v4",
                expiration=timedelta(seconds=expires_in),
                method="GET"
            )
            return url
        except FileNotFoundError:
            raise
        except Exception as e:
            raise Exception(f"GCS signed URL generation failed: {e}") from e

    def copy_file(self, source_path: str, dest_path: str) -> None:
        """Copy file within GCS bucket (server-side copy).

        Raises:
            FileNotFoundError: If source object doesn't exist
            Exception: If copy fails
        """
        try:
            source_blob = self.bucket.blob(source_path)
            if not source_blob.exists():
                raise FileNotFoundError(f"Source file not found: {source_path}")
            self.bucket.copy_blob(
                source_blob,
                self.bucket,
                dest_path
            )
        except FileNotFoundError:
            raise
        except Exception as e:
            raise Exception(f"GCS copy failed: {e}") from e

View File

@@ -0,0 +1,216 @@
"""
AWS S3 storage adaptor implementation.
"""
import os
from pathlib import Path
from typing import List, Dict, Optional
try:
import boto3
from botocore.exceptions import ClientError
BOTO3_AVAILABLE = True
except ImportError:
BOTO3_AVAILABLE = False
from .base_storage import BaseStorageAdaptor, StorageObject
class S3StorageAdaptor(BaseStorageAdaptor):
    """
    AWS S3 storage adaptor.

    Configuration:
        bucket: S3 bucket name (required)
        region: AWS region (optional, default: us-east-1)
        aws_access_key_id: AWS access key (optional, uses env/credentials)
        aws_secret_access_key: AWS secret key (optional, uses env/credentials)
        endpoint_url: Custom endpoint URL (optional, for S3-compatible services)

    Environment Variables:
        AWS_ACCESS_KEY_ID: AWS access key
        AWS_SECRET_ACCESS_KEY: AWS secret key
        AWS_DEFAULT_REGION: AWS region

    Examples:
        # Using environment variables
        adaptor = S3StorageAdaptor(bucket='my-bucket')

        # With explicit credentials
        adaptor = S3StorageAdaptor(
            bucket='my-bucket',
            region='us-west-2',
            aws_access_key_id='AKIAIOSFODNN7EXAMPLE',
            aws_secret_access_key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'
        )

        # S3-compatible service (MinIO, DigitalOcean Spaces)
        adaptor = S3StorageAdaptor(
            bucket='my-bucket',
            endpoint_url='https://nyc3.digitaloceanspaces.com',
            aws_access_key_id='...',
            aws_secret_access_key='...'
        )
    """

    # S3 reports "object does not exist" inconsistently across operations:
    # HEAD surfaces the bare HTTP code '404' while GET/COPY surface the
    # service error code 'NoSuchKey'. Check both everywhere.
    _NOT_FOUND_CODES = ('404', 'NoSuchKey')

    def __init__(self, **kwargs):
        """
        Initialize S3 storage adaptor.

        Args:
            bucket: S3 bucket name (required)
            **kwargs: Additional S3 configuration

        Raises:
            ImportError: If boto3 is not installed
            ValueError: If bucket parameter is missing
        """
        super().__init__(**kwargs)
        if not BOTO3_AVAILABLE:
            raise ImportError(
                "boto3 is required for S3 storage. "
                "Install with: pip install boto3"
            )
        if 'bucket' not in kwargs:
            raise ValueError("bucket parameter is required for S3 storage")
        self.bucket = kwargs['bucket']
        self.region = kwargs.get('region', os.getenv('AWS_DEFAULT_REGION', 'us-east-1'))
        # Build client kwargs; credentials fall back to boto3's default chain
        # (env vars, shared credentials file, instance profile) when omitted.
        client_kwargs = {
            'region_name': self.region,
        }
        if 'endpoint_url' in kwargs:
            client_kwargs['endpoint_url'] = kwargs['endpoint_url']
        if 'aws_access_key_id' in kwargs:
            client_kwargs['aws_access_key_id'] = kwargs['aws_access_key_id']
        if 'aws_secret_access_key' in kwargs:
            client_kwargs['aws_secret_access_key'] = kwargs['aws_secret_access_key']
        self.s3_client = boto3.client('s3', **client_kwargs)
        self.s3_resource = boto3.resource('s3', **client_kwargs)

    def upload_file(
        self, local_path: str, remote_path: str, metadata: Optional[Dict[str, str]] = None
    ) -> str:
        """Upload file to S3.

        Returns:
            s3:// URI of the uploaded object

        Raises:
            FileNotFoundError: If local file doesn't exist
            Exception: If upload fails
        """
        local_file = Path(local_path)
        if not local_file.exists():
            raise FileNotFoundError(f"Local file not found: {local_path}")
        extra_args = {}
        if metadata:
            extra_args['Metadata'] = metadata
        try:
            self.s3_client.upload_file(
                str(local_file),
                self.bucket,
                remote_path,
                ExtraArgs=extra_args if extra_args else None
            )
            return f"s3://{self.bucket}/{remote_path}"
        except ClientError as e:
            # Chain the botocore error so the root cause survives re-wrapping.
            raise Exception(f"S3 upload failed: {e}") from e

    def download_file(self, remote_path: str, local_path: str) -> None:
        """Download file from S3.

        Raises:
            FileNotFoundError: If remote object doesn't exist
            Exception: If download fails
        """
        local_file = Path(local_path)
        # Ensure the destination directory exists before writing.
        local_file.parent.mkdir(parents=True, exist_ok=True)
        try:
            self.s3_client.download_file(
                self.bucket,
                remote_path,
                str(local_file)
            )
        except ClientError as e:
            if e.response['Error']['Code'] in self._NOT_FOUND_CODES:
                raise FileNotFoundError(f"Remote file not found: {remote_path}") from e
            raise Exception(f"S3 download failed: {e}") from e

    def delete_file(self, remote_path: str) -> None:
        """Delete file from S3.

        Raises:
            FileNotFoundError: If remote object doesn't exist
            Exception: If deletion fails
        """
        # S3 DeleteObject is idempotent and succeeds for missing keys, so an
        # explicit existence check is needed to honor the BaseStorageAdaptor
        # contract (and to match the GCS/Azure adaptors).
        if not self.file_exists(remote_path):
            raise FileNotFoundError(f"Remote file not found: {remote_path}")
        try:
            self.s3_client.delete_object(
                Bucket=self.bucket,
                Key=remote_path
            )
        except ClientError as e:
            raise Exception(f"S3 deletion failed: {e}") from e

    def list_files(
        self, prefix: str = "", max_results: int = 1000
    ) -> List[StorageObject]:
        """List files in S3 bucket.

        Args:
            prefix: Key prefix to filter on
            max_results: Maximum number of keys to return (across pages)

        Returns:
            List of StorageObject instances
        """
        try:
            paginator = self.s3_client.get_paginator('list_objects_v2')
            page_iterator = paginator.paginate(
                Bucket=self.bucket,
                Prefix=prefix,
                PaginationConfig={'MaxItems': max_results}
            )
            files = []
            for page in page_iterator:
                # An empty bucket/prefix yields pages without 'Contents'.
                if 'Contents' not in page:
                    continue
                for obj in page['Contents']:
                    files.append(StorageObject(
                        key=obj['Key'],
                        size=obj['Size'],
                        last_modified=obj['LastModified'].isoformat(),
                        etag=obj.get('ETag', '').strip('"')
                    ))
            return files
        except ClientError as e:
            raise Exception(f"S3 listing failed: {e}") from e

    def file_exists(self, remote_path: str) -> bool:
        """Check if file exists in S3."""
        try:
            self.s3_client.head_object(
                Bucket=self.bucket,
                Key=remote_path
            )
            return True
        except ClientError as e:
            if e.response['Error']['Code'] in self._NOT_FOUND_CODES:
                return False
            raise Exception(f"S3 head_object failed: {e}") from e

    def get_file_url(self, remote_path: str, expires_in: int = 3600) -> str:
        """Generate presigned URL for S3 object.

        Args:
            remote_path: Object key
            expires_in: URL expiration time in seconds (default: 1 hour)

        Raises:
            FileNotFoundError: If remote object doesn't exist
            Exception: If URL generation fails
        """
        # Presigning is a purely local signature operation and succeeds even
        # for missing keys; check existence explicitly to honor the
        # BaseStorageAdaptor contract (and to match the GCS/Azure adaptors).
        if not self.file_exists(remote_path):
            raise FileNotFoundError(f"Remote file not found: {remote_path}")
        try:
            url = self.s3_client.generate_presigned_url(
                'get_object',
                Params={
                    'Bucket': self.bucket,
                    'Key': remote_path
                },
                ExpiresIn=expires_in
            )
            return url
        except ClientError as e:
            raise Exception(f"S3 presigned URL generation failed: {e}") from e

    def copy_file(self, source_path: str, dest_path: str) -> None:
        """Copy file within S3 bucket (server-side copy).

        Raises:
            FileNotFoundError: If source object doesn't exist
            Exception: If copy fails
        """
        try:
            copy_source = {
                'Bucket': self.bucket,
                'Key': source_path
            }
            self.s3_client.copy_object(
                CopySource=copy_source,
                Bucket=self.bucket,
                Key=dest_path
            )
        except ClientError as e:
            # CopyObject reports a missing source as 'NoSuchKey', not '404'.
            if e.response['Error']['Code'] in self._NOT_FOUND_CODES:
                raise FileNotFoundError(f"Source file not found: {source_path}") from e
            raise Exception(f"S3 copy failed: {e}") from e