fix: Enforce min_chunk_size in RAG chunker
- Filter out chunks smaller than min_chunk_size (default 100 tokens) - Exception: Keep all chunks if entire document is smaller than target size - All 15 tests passing (100% pass rate) Fixes edge case where very small chunks (e.g., 'Short.' = 6 chars) were being created despite min_chunk_size=100 setting. Test: pytest tests/test_rag_chunker.py -v
This commit is contained in:
275
src/skill_seekers/cli/storage/base_storage.py
Normal file
275
src/skill_seekers/cli/storage/base_storage.py
Normal file
@@ -0,0 +1,275 @@
|
||||
"""
|
||||
Base storage adaptor interface for cloud storage providers.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class StorageObject:
|
||||
"""
|
||||
Represents a file/object in cloud storage.
|
||||
|
||||
Attributes:
|
||||
key: Object key/path in storage
|
||||
size: Size in bytes
|
||||
last_modified: Last modification timestamp
|
||||
etag: ETag/hash of object
|
||||
metadata: Additional metadata
|
||||
"""
|
||||
|
||||
key: str
|
||||
size: int
|
||||
last_modified: Optional[str] = None
|
||||
etag: Optional[str] = None
|
||||
metadata: Optional[Dict[str, str]] = None
|
||||
|
||||
|
||||
class BaseStorageAdaptor(ABC):
|
||||
"""
|
||||
Abstract base class for cloud storage adaptors.
|
||||
|
||||
Provides unified interface for different cloud storage providers.
|
||||
All adaptors must implement these methods.
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
"""
|
||||
Initialize storage adaptor.
|
||||
|
||||
Args:
|
||||
**kwargs: Provider-specific configuration
|
||||
"""
|
||||
self.config = kwargs
|
||||
|
||||
@abstractmethod
|
||||
def upload_file(
|
||||
self, local_path: str, remote_path: str, metadata: Optional[Dict[str, str]] = None
|
||||
) -> str:
|
||||
"""
|
||||
Upload file to cloud storage.
|
||||
|
||||
Args:
|
||||
local_path: Path to local file
|
||||
remote_path: Destination path in cloud storage
|
||||
metadata: Optional metadata to attach to file
|
||||
|
||||
Returns:
|
||||
URL or identifier of uploaded file
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If local file doesn't exist
|
||||
Exception: If upload fails
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def download_file(self, remote_path: str, local_path: str) -> None:
|
||||
"""
|
||||
Download file from cloud storage.
|
||||
|
||||
Args:
|
||||
remote_path: Path to file in cloud storage
|
||||
local_path: Destination path for downloaded file
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If remote file doesn't exist
|
||||
Exception: If download fails
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def delete_file(self, remote_path: str) -> None:
|
||||
"""
|
||||
Delete file from cloud storage.
|
||||
|
||||
Args:
|
||||
remote_path: Path to file in cloud storage
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If remote file doesn't exist
|
||||
Exception: If deletion fails
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def list_files(
|
||||
self, prefix: str = "", max_results: int = 1000
|
||||
) -> List[StorageObject]:
|
||||
"""
|
||||
List files in cloud storage.
|
||||
|
||||
Args:
|
||||
prefix: Prefix to filter files (directory path)
|
||||
max_results: Maximum number of results to return
|
||||
|
||||
Returns:
|
||||
List of StorageObject instances
|
||||
|
||||
Raises:
|
||||
Exception: If listing fails
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def file_exists(self, remote_path: str) -> bool:
|
||||
"""
|
||||
Check if file exists in cloud storage.
|
||||
|
||||
Args:
|
||||
remote_path: Path to file in cloud storage
|
||||
|
||||
Returns:
|
||||
True if file exists, False otherwise
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_file_url(self, remote_path: str, expires_in: int = 3600) -> str:
|
||||
"""
|
||||
Generate signed URL for file access.
|
||||
|
||||
Args:
|
||||
remote_path: Path to file in cloud storage
|
||||
expires_in: URL expiration time in seconds (default: 1 hour)
|
||||
|
||||
Returns:
|
||||
Signed URL for file access
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If remote file doesn't exist
|
||||
Exception: If URL generation fails
|
||||
"""
|
||||
pass
|
||||
|
||||
def upload_directory(
|
||||
self, local_dir: str, remote_prefix: str = "", exclude_patterns: Optional[List[str]] = None
|
||||
) -> List[str]:
|
||||
"""
|
||||
Upload entire directory to cloud storage.
|
||||
|
||||
Args:
|
||||
local_dir: Path to local directory
|
||||
remote_prefix: Prefix for uploaded files
|
||||
exclude_patterns: Glob patterns to exclude files
|
||||
|
||||
Returns:
|
||||
List of uploaded file paths
|
||||
|
||||
Raises:
|
||||
NotADirectoryError: If local_dir is not a directory
|
||||
Exception: If upload fails
|
||||
"""
|
||||
local_path = Path(local_dir)
|
||||
if not local_path.is_dir():
|
||||
raise NotADirectoryError(f"Not a directory: {local_dir}")
|
||||
|
||||
uploaded_files = []
|
||||
exclude_patterns = exclude_patterns or []
|
||||
|
||||
for file_path in local_path.rglob("*"):
|
||||
if file_path.is_file():
|
||||
# Check exclusion patterns
|
||||
should_exclude = False
|
||||
for pattern in exclude_patterns:
|
||||
if file_path.match(pattern):
|
||||
should_exclude = True
|
||||
break
|
||||
|
||||
if should_exclude:
|
||||
continue
|
||||
|
||||
# Calculate relative path
|
||||
relative_path = file_path.relative_to(local_path)
|
||||
remote_path = f"{remote_prefix}/{relative_path}".lstrip("/")
|
||||
|
||||
# Upload file
|
||||
self.upload_file(str(file_path), remote_path)
|
||||
uploaded_files.append(remote_path)
|
||||
|
||||
return uploaded_files
|
||||
|
||||
def download_directory(
|
||||
self, remote_prefix: str, local_dir: str
|
||||
) -> List[str]:
|
||||
"""
|
||||
Download directory from cloud storage.
|
||||
|
||||
Args:
|
||||
remote_prefix: Prefix of files to download
|
||||
local_dir: Destination directory
|
||||
|
||||
Returns:
|
||||
List of downloaded file paths
|
||||
|
||||
Raises:
|
||||
Exception: If download fails
|
||||
"""
|
||||
local_path = Path(local_dir)
|
||||
local_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
downloaded_files = []
|
||||
files = self.list_files(prefix=remote_prefix)
|
||||
|
||||
for file_obj in files:
|
||||
# Calculate local path
|
||||
relative_path = file_obj.key.removeprefix(remote_prefix).lstrip("/")
|
||||
local_file_path = local_path / relative_path
|
||||
|
||||
# Create parent directories
|
||||
local_file_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Download file
|
||||
self.download_file(file_obj.key, str(local_file_path))
|
||||
downloaded_files.append(str(local_file_path))
|
||||
|
||||
return downloaded_files
|
||||
|
||||
def get_file_size(self, remote_path: str) -> int:
|
||||
"""
|
||||
Get size of file in cloud storage.
|
||||
|
||||
Args:
|
||||
remote_path: Path to file in cloud storage
|
||||
|
||||
Returns:
|
||||
File size in bytes
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If remote file doesn't exist
|
||||
"""
|
||||
files = self.list_files(prefix=remote_path, max_results=1)
|
||||
if not files or files[0].key != remote_path:
|
||||
raise FileNotFoundError(f"File not found: {remote_path}")
|
||||
return files[0].size
|
||||
|
||||
def copy_file(
|
||||
self, source_path: str, dest_path: str
|
||||
) -> None:
|
||||
"""
|
||||
Copy file within cloud storage.
|
||||
|
||||
Default implementation downloads then uploads.
|
||||
Subclasses can override with provider-specific copy operations.
|
||||
|
||||
Args:
|
||||
source_path: Source file path
|
||||
dest_path: Destination file path
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If source file doesn't exist
|
||||
Exception: If copy fails
|
||||
"""
|
||||
import tempfile
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
|
||||
tmp_path = tmp_file.name
|
||||
|
||||
try:
|
||||
self.download_file(source_path, tmp_path)
|
||||
self.upload_file(tmp_path, dest_path)
|
||||
finally:
|
||||
Path(tmp_path).unlink(missing_ok=True)
|
||||
Reference in New Issue
Block a user