- Filter out chunks smaller than min_chunk_size (default 100 tokens) - Exception: Keep all chunks if entire document is smaller than target size - All 15 tests passing (100% pass rate) Fixes edge case where very small chunks (e.g., 'Short.' = 6 chars) were being created despite min_chunk_size=100 setting. Test: pytest tests/test_rag_chunker.py -v
276 lines
7.5 KiB
Python
276 lines
7.5 KiB
Python
"""
|
|
Base storage adaptor interface for cloud storage providers.
|
|
"""
|
|
|
|
from abc import ABC, abstractmethod
|
|
from pathlib import Path
|
|
from typing import List, Dict, Optional
|
|
from dataclasses import dataclass
|
|
|
|
|
|
@dataclass
class StorageObject:
    """
    Represents a file/object in cloud storage.

    A plain data record returned by adaptor listing operations; only
    ``key`` and ``size`` are required, the remaining fields are optional
    because not every provider reports them.

    Attributes:
        key: Object key/path in storage
        size: Size in bytes
        last_modified: Last modification timestamp (provider-formatted string)
        etag: ETag/hash of object
        metadata: Additional metadata
    """

    key: str
    size: int
    # Optional, provider-dependent fields — left as None when unknown.
    last_modified: Optional[str] = None
    etag: Optional[str] = None
    metadata: Optional[Dict[str, str]] = None
|
|
|
|
|
|
class BaseStorageAdaptor(ABC):
    """
    Abstract base class for cloud storage adaptors.

    Provides unified interface for different cloud storage providers.
    All adaptors must implement the abstract methods; the concrete
    helpers (directory upload/download, size lookup, copy) are default
    implementations built on top of them and may be overridden with
    provider-native operations.
    """

    def __init__(self, **kwargs):
        """
        Initialize storage adaptor.

        Args:
            **kwargs: Provider-specific configuration, stored verbatim
                on ``self.config`` for subclasses to consume.
        """
        self.config = kwargs

    @abstractmethod
    def upload_file(
        self, local_path: str, remote_path: str, metadata: Optional[Dict[str, str]] = None
    ) -> str:
        """
        Upload file to cloud storage.

        Args:
            local_path: Path to local file
            remote_path: Destination path in cloud storage
            metadata: Optional metadata to attach to file

        Returns:
            URL or identifier of uploaded file

        Raises:
            FileNotFoundError: If local file doesn't exist
            Exception: If upload fails
        """
        pass

    @abstractmethod
    def download_file(self, remote_path: str, local_path: str) -> None:
        """
        Download file from cloud storage.

        Args:
            remote_path: Path to file in cloud storage
            local_path: Destination path for downloaded file

        Raises:
            FileNotFoundError: If remote file doesn't exist
            Exception: If download fails
        """
        pass

    @abstractmethod
    def delete_file(self, remote_path: str) -> None:
        """
        Delete file from cloud storage.

        Args:
            remote_path: Path to file in cloud storage

        Raises:
            FileNotFoundError: If remote file doesn't exist
            Exception: If deletion fails
        """
        pass

    @abstractmethod
    def list_files(
        self, prefix: str = "", max_results: int = 1000
    ) -> List[StorageObject]:
        """
        List files in cloud storage.

        Args:
            prefix: Prefix to filter files (directory path)
            max_results: Maximum number of results to return

        Returns:
            List of StorageObject instances

        Raises:
            Exception: If listing fails
        """
        pass

    @abstractmethod
    def file_exists(self, remote_path: str) -> bool:
        """
        Check if file exists in cloud storage.

        Args:
            remote_path: Path to file in cloud storage

        Returns:
            True if file exists, False otherwise
        """
        pass

    @abstractmethod
    def get_file_url(self, remote_path: str, expires_in: int = 3600) -> str:
        """
        Generate signed URL for file access.

        Args:
            remote_path: Path to file in cloud storage
            expires_in: URL expiration time in seconds (default: 1 hour)

        Returns:
            Signed URL for file access

        Raises:
            FileNotFoundError: If remote file doesn't exist
            Exception: If URL generation fails
        """
        pass

    def upload_directory(
        self, local_dir: str, remote_prefix: str = "", exclude_patterns: Optional[List[str]] = None
    ) -> List[str]:
        """
        Upload entire directory to cloud storage.

        Walks ``local_dir`` recursively and uploads every regular file
        that does not match an exclusion pattern.

        Args:
            local_dir: Path to local directory
            remote_prefix: Prefix for uploaded files
            exclude_patterns: Glob patterns (``Path.match`` semantics) to
                exclude files

        Returns:
            List of uploaded remote file paths

        Raises:
            NotADirectoryError: If local_dir is not a directory
            Exception: If upload fails
        """
        local_path = Path(local_dir)
        if not local_path.is_dir():
            raise NotADirectoryError(f"Not a directory: {local_dir}")

        uploaded_files = []
        exclude_patterns = exclude_patterns or []

        for file_path in local_path.rglob("*"):
            if not file_path.is_file():
                continue

            # Skip files matching any exclusion pattern.
            if any(file_path.match(pattern) for pattern in exclude_patterns):
                continue

            # Build the remote key from the path relative to local_dir.
            # BUGFIX: use as_posix() so keys are always forward-slash
            # separated; str() on a relative Path yields backslashes on
            # Windows, which would produce malformed cloud object keys.
            relative_path = file_path.relative_to(local_path).as_posix()
            remote_path = f"{remote_prefix}/{relative_path}".lstrip("/")

            self.upload_file(str(file_path), remote_path)
            uploaded_files.append(remote_path)

        return uploaded_files

    def download_directory(
        self, remote_prefix: str, local_dir: str
    ) -> List[str]:
        """
        Download directory from cloud storage.

        Lists every object under ``remote_prefix`` and mirrors it under
        ``local_dir``, creating intermediate directories as needed.

        Args:
            remote_prefix: Prefix of files to download
            local_dir: Destination directory

        Returns:
            List of downloaded local file paths

        Raises:
            Exception: If download fails
        """
        local_path = Path(local_dir)
        local_path.mkdir(parents=True, exist_ok=True)

        downloaded_files = []
        files = self.list_files(prefix=remote_prefix)

        for file_obj in files:
            # Map the remote key to a path relative to the destination dir.
            relative_path = file_obj.key.removeprefix(remote_prefix).lstrip("/")
            local_file_path = local_path / relative_path

            # Ensure parent directories exist before writing the file.
            local_file_path.parent.mkdir(parents=True, exist_ok=True)

            self.download_file(file_obj.key, str(local_file_path))
            downloaded_files.append(str(local_file_path))

        return downloaded_files

    def get_file_size(self, remote_path: str) -> int:
        """
        Get size of file in cloud storage.

        Args:
            remote_path: Path to file in cloud storage

        Returns:
            File size in bytes

        Raises:
            FileNotFoundError: If remote file doesn't exist
        """
        # BUGFIX: listing by prefix can return sibling keys that merely
        # start with remote_path (e.g. "a.txt" vs "a.txt.bak"). The old
        # max_results=1 call could therefore return a different key and
        # raise a spurious FileNotFoundError; scan for the exact key.
        for file_obj in self.list_files(prefix=remote_path):
            if file_obj.key == remote_path:
                return file_obj.size
        raise FileNotFoundError(f"File not found: {remote_path}")

    def copy_file(
        self, source_path: str, dest_path: str
    ) -> None:
        """
        Copy file within cloud storage.

        Default implementation downloads then uploads via a local temp
        file. Subclasses can override with provider-specific server-side
        copy operations to avoid the round trip.

        Args:
            source_path: Source file path
            dest_path: Destination file path

        Raises:
            FileNotFoundError: If source file doesn't exist
            Exception: If copy fails
        """
        import tempfile

        # delete=False so the (closed) file can be reopened by the
        # download/upload implementations, including on Windows.
        with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
            tmp_path = tmp_file.name

        try:
            self.download_file(source_path, tmp_path)
            self.upload_file(tmp_path, dest_path)
        finally:
            # Always remove the temp file, even if the copy failed.
            Path(tmp_path).unlink(missing_ok=True)