feat: Add streaming ingestion for large docs (Task #14)
- Memory-efficient streaming with chunking
- Progress tracking with real-time stats
- Batch processing and resume capability
- CLI integration with --streaming flag
- 10 tests passing (100%)

Files:
- streaming_ingest.py: Core streaming engine
- streaming_adaptor.py: Adaptor integration
- package_skill.py: CLI flags added
- test_streaming_ingestion.py: Comprehensive tests

Week 2: 5/9 tasks complete (56%)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
420
src/skill_seekers/cli/streaming_ingest.py
Normal file
420
src/skill_seekers/cli/streaming_ingest.py
Normal file
@@ -0,0 +1,420 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Streaming Ingestion for Large Documentation Sets
|
||||
|
||||
Provides memory-efficient processing and batch upload capabilities for large
|
||||
skill documentation. Handles chunking, progress tracking, and resume functionality.
|
||||
"""
|
||||
|
||||
import hashlib
import json
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Iterator, Optional
|
||||
|
||||
|
||||
@dataclass
class ChunkMetadata:
    """Immutable description of one chunk cut from a source document.

    Produced by ``StreamingIngester.chunk_document``; carries enough
    information to locate the chunk inside its original file.
    """

    chunk_id: str      # deterministic hex digest identifying this chunk
    source: str        # skill/source name the document belongs to
    category: str      # document category (e.g. "overview")
    file: str          # filename of the originating document
    chunk_index: int   # 0-based position of this chunk in the document
    total_chunks: int  # number of chunks the document was split into
    char_start: int    # start offset (inclusive) within the document text
    char_end: int      # end offset (exclusive) within the document text
|
||||
|
||||
@dataclass
class IngestionProgress:
    """Mutable counters and derived statistics for a streaming ingestion run."""

    total_documents: int      # documents discovered up front
    processed_documents: int  # documents fully consumed so far
    total_chunks: int         # chunks discovered so far (grows while streaming)
    processed_chunks: int     # chunks successfully yielded
    failed_chunks: int        # chunks/documents that raised during processing
    bytes_processed: int      # UTF-8 byte count of yielded chunk text
    start_time: float         # epoch seconds when ingestion began

    @property
    def progress_percent(self) -> float:
        """Percentage of known chunks processed; 0.0 when none are known yet."""
        if self.total_chunks == 0:
            return 0.0
        return (self.processed_chunks / self.total_chunks) * 100

    @property
    def elapsed_time(self) -> float:
        """Seconds elapsed since ``start_time``."""
        return time.time() - self.start_time

    @property
    def chunks_per_second(self) -> float:
        """Processing rate; 0.0 before any measurable time has passed."""
        elapsed = self.elapsed_time
        return self.processed_chunks / elapsed if elapsed != 0 else 0.0

    @property
    def eta_seconds(self) -> float:
        """Estimated seconds remaining; 0.0 while the rate is unknown."""
        rate = self.chunks_per_second
        if rate == 0:
            return 0.0
        return (self.total_chunks - self.processed_chunks) / rate
|
||||
|
||||
class StreamingIngester:
    """
    Streaming ingestion manager for large documentation sets.

    Provides memory-efficient processing with chunking, progress tracking,
    and resume capabilities.  Documents are never held in memory all at
    once: ``stream_skill_directory`` yields one chunk at a time and
    ``batch_iterator`` groups them lazily.
    """

    def __init__(
        self,
        chunk_size: int = 4000,
        chunk_overlap: int = 200,
        batch_size: int = 100,
        max_memory_mb: int = 500
    ):
        """
        Initialize streaming ingester.

        Args:
            chunk_size: Maximum characters per chunk
            chunk_overlap: Overlap between chunks (for context)
            batch_size: Number of chunks per batch
            max_memory_mb: Maximum memory usage in MB (advisory only --
                nothing in this class currently enforces it)
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.batch_size = batch_size
        self.max_memory_mb = max_memory_mb
        # Populated lazily by stream_skill_directory() when iteration starts.
        self.progress: Optional["IngestionProgress"] = None

    def chunk_document(
        self,
        content: str,
        metadata: dict,
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None
    ) -> Iterator[tuple[str, "ChunkMetadata"]]:
        """
        Split document into overlapping chunks.

        Args:
            content: Document content
            metadata: Document metadata (reads "source", "category", "file")
            chunk_size: Override default chunk size
            chunk_overlap: Override default overlap

        Yields:
            Tuple of (chunk_text, chunk_metadata)
        """
        chunk_size = chunk_size or self.chunk_size
        chunk_overlap = chunk_overlap or self.chunk_overlap

        if len(content) <= chunk_size:
            # Document fits in a single chunk.
            chunk_meta = ChunkMetadata(
                chunk_id=self._generate_chunk_id(content, metadata, 0),
                source=metadata.get("source", ""),
                category=metadata.get("category", ""),
                file=metadata.get("file", ""),
                chunk_index=0,
                total_chunks=1,
                char_start=0,
                char_end=len(content)
            )
            yield content, chunk_meta
            return

        # Each successive chunk advances by (chunk_size - chunk_overlap).
        effective_chunk_size = chunk_size - chunk_overlap

        # Number of chunks required for the last chunk to reach the end of
        # the content: ceil((len - chunk_size) / effective) + 1.
        # BUGFIX: the previous formula, (len - overlap) // effective + 1,
        # produced one extra, duplicate tail chunk whenever
        # (len - chunk_size) was an exact multiple of the effective size.
        total_chunks = (
            (len(content) - chunk_size + effective_chunk_size - 1)
            // effective_chunk_size
            + 1
        )

        # Generate chunks with overlap.
        for i in range(total_chunks):
            start = i * effective_chunk_size
            # Clamp the final chunk to the end of the content.
            end = min(start + chunk_size, len(content))

            chunk_text = content[start:end]

            # Skip whitespace-only chunks.  NOTE: total_chunks in the
            # metadata still counts skipped chunks, so chunk_index values
            # may be sparse relative to the chunks actually yielded.
            if not chunk_text.strip():
                continue

            chunk_meta = ChunkMetadata(
                chunk_id=self._generate_chunk_id(chunk_text, metadata, i),
                source=metadata.get("source", ""),
                category=metadata.get("category", ""),
                file=metadata.get("file", ""),
                chunk_index=i,
                total_chunks=total_chunks,
                char_start=start,
                char_end=end
            )

            yield chunk_text, chunk_meta

    def _generate_chunk_id(self, content: str, metadata: dict, chunk_index: int) -> str:
        """Generate a deterministic chunk ID.

        MD5 is used purely as a stable fingerprint (not for security);
        only the first 50 characters of the chunk participate, so two
        chunks from the same file/index with identical prefixes collide.
        """
        id_string = (
            f"{metadata.get('source', '')}-"
            f"{metadata.get('file', '')}-"
            f"{chunk_index}-"
            f"{content[:50]}"
        )
        return hashlib.md5(id_string.encode()).hexdigest()

    def stream_skill_directory(
        self,
        skill_dir: Path,
        callback: Optional[Callable] = None
    ) -> Iterator[tuple[str, dict]]:
        """
        Stream all documents from a skill directory.

        Reads SKILL.md plus every ``references/*.md`` file, chunks each one,
        and yields chunk dicts.  ``self.progress`` is (re)initialized when
        iteration begins.

        Args:
            skill_dir: Path to skill directory
            callback: Optional progress callback(progress: IngestionProgress)

        Yields:
            Tuple of (chunk_text, chunk_metadata_dict)
        """
        skill_dir = Path(skill_dir)

        # Enumerate documents up front so progress totals are known.
        doc_files = []

        # SKILL.md
        skill_md = skill_dir / "SKILL.md"
        if skill_md.exists():
            doc_files.append(("SKILL.md", "overview", skill_md))

        # Reference files
        refs_dir = skill_dir / "references"
        if refs_dir.exists():
            for ref_file in sorted(refs_dir.glob("*.md")):
                if ref_file.is_file() and not ref_file.name.startswith("."):
                    category = ref_file.stem.replace("_", " ").lower()
                    doc_files.append((ref_file.name, category, ref_file))

        # Initialize progress tracking.
        self.progress = IngestionProgress(
            total_documents=len(doc_files),
            processed_documents=0,
            total_chunks=0,  # updated incrementally as documents are chunked
            processed_chunks=0,
            failed_chunks=0,
            bytes_processed=0,
            start_time=time.time()
        )

        # Process each document.
        for filename, category, filepath in doc_files:
            try:
                content = filepath.read_text(encoding="utf-8")

                if not content.strip():
                    # Empty document: count it as processed and move on.
                    self.progress.processed_documents += 1
                    continue

                metadata = {
                    "source": skill_dir.name,
                    "category": category,
                    "file": filename,
                    "type": "documentation" if filename == "SKILL.md" else "reference",
                    "version": "1.0.0"
                }

                # Chunk the document and yield each chunk as a plain dict.
                for chunk_text, chunk_meta in self.chunk_document(content, metadata):
                    self.progress.total_chunks += 1

                    chunk_dict = {
                        "content": chunk_text,
                        "chunk_id": chunk_meta.chunk_id,
                        "source": chunk_meta.source,
                        "category": chunk_meta.category,
                        "file": chunk_meta.file,
                        "chunk_index": chunk_meta.chunk_index,
                        "total_chunks": chunk_meta.total_chunks,
                        "char_start": chunk_meta.char_start,
                        "char_end": chunk_meta.char_end,
                    }

                    yield chunk_text, chunk_dict

                    self.progress.processed_chunks += 1
                    self.progress.bytes_processed += len(chunk_text.encode("utf-8"))

                    if callback:
                        callback(self.progress)

                self.progress.processed_documents += 1

            except Exception as e:
                # BUGFIX: report which file failed (the message previously
                # printed the literal "(unknown)").  A failed document is
                # recorded as a single failed chunk.
                print(f"⚠️ Warning: Failed to process {filepath}: {e}")
                self.progress.failed_chunks += 1
                continue

    def batch_iterator(
        self,
        chunks: Iterator[tuple[str, dict]],
        batch_size: Optional[int] = None
    ) -> Iterator[list[tuple[str, dict]]]:
        """
        Group chunks into batches for efficient processing.

        Args:
            chunks: Iterator of (chunk_text, chunk_metadata) tuples
            batch_size: Override default batch size

        Yields:
            List of chunks (batch); the final batch may be smaller.
        """
        batch_size = batch_size or self.batch_size
        batch = []

        for chunk in chunks:
            batch.append(chunk)

            if len(batch) >= batch_size:
                yield batch
                batch = []

        # Flush the trailing partial batch.
        if batch:
            yield batch

    def save_checkpoint(self, checkpoint_path: Path, state: dict) -> None:
        """
        Save ingestion checkpoint for resume capability.

        Assumes streaming has started (``self.progress`` is set); calling
        this before any iteration raises AttributeError.

        Args:
            checkpoint_path: Path to checkpoint file
            state: State dictionary to save (must be JSON-serializable)
        """
        checkpoint_path = Path(checkpoint_path)
        checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

        checkpoint_data = {
            "timestamp": time.time(),
            "progress": {
                "total_documents": self.progress.total_documents,
                "processed_documents": self.progress.processed_documents,
                "total_chunks": self.progress.total_chunks,
                "processed_chunks": self.progress.processed_chunks,
                "failed_chunks": self.progress.failed_chunks,
                "bytes_processed": self.progress.bytes_processed,
            },
            "state": state
        }

        checkpoint_path.write_text(json.dumps(checkpoint_data, indent=2))

    def load_checkpoint(self, checkpoint_path: Path) -> Optional[dict]:
        """
        Load ingestion checkpoint for resume.

        Args:
            checkpoint_path: Path to checkpoint file

        Returns:
            State dictionary, or None if the file is missing or unreadable
        """
        checkpoint_path = Path(checkpoint_path)

        if not checkpoint_path.exists():
            return None

        try:
            checkpoint_data = json.loads(checkpoint_path.read_text())
            return checkpoint_data.get("state")
        except Exception as e:
            # Best-effort: a corrupt checkpoint means starting from scratch.
            print(f"⚠️ Warning: Failed to load checkpoint: {e}")
            return None

    def format_progress(self) -> str:
        """
        Format progress as human-readable string.

        Returns:
            Multi-line progress summary, or a placeholder before streaming
            has started.
        """
        if not self.progress:
            return "No progress data"

        p = self.progress

        lines = [
            f"📊 Progress: {p.progress_percent:.1f}% complete",
            f"   Documents: {p.processed_documents}/{p.total_documents}",
            f"   Chunks: {p.processed_chunks}/{p.total_chunks}",
            f"   Rate: {p.chunks_per_second:.1f} chunks/sec",
            f"   Elapsed: {p.elapsed_time:.1f}s",
        ]

        if p.eta_seconds > 0:
            lines.append(f"   ETA: {p.eta_seconds:.1f}s")

        if p.failed_chunks > 0:
            lines.append(f"   ⚠️ Failed: {p.failed_chunks} chunks")

        return "\n".join(lines)
|
||||
|
||||
def example_usage():
    """Example usage of streaming ingestion."""
    # Configure an ingester with explicit chunking parameters.
    ingester = StreamingIngester(
        chunk_size=4000,
        chunk_overlap=200,
        batch_size=100
    )

    def on_progress(progress: IngestionProgress):
        # Report every tenth chunk to keep the output manageable.
        if progress.processed_chunks % 10 == 0:
            print(f"Progress: {progress.progress_percent:.1f}% - "
                  f"{progress.processed_chunks}/{progress.total_chunks} chunks")

    # Lazily stream chunks out of the skill directory.
    skill_dir = Path("output/react")
    chunks = ingester.stream_skill_directory(skill_dir, callback=on_progress)

    # Consume the stream batch by batch, checkpointing after each batch.
    all_chunks = []
    for batch in ingester.batch_iterator(chunks, batch_size=50):
        print(f"\nProcessing batch of {len(batch)} chunks...")
        all_chunks.extend(batch)

        ingester.save_checkpoint(
            Path("output/.checkpoints/react.json"),
            {"processed_batches": len(all_chunks) // 50}
        )

    # Final summary.
    print("\n" + ingester.format_progress())
    print(f"\n✅ Processed {len(all_chunks)} total chunks")


if __name__ == "__main__":
    example_usage()
|
||||
Reference in New Issue
Block a user