**Critical Issues Fixed:** Issue #1: CLI Commands Were BROKEN ⚠️ CRITICAL - Problem: 4 CLI commands existed but failed at runtime with ImportError - Root Cause: Modules had example_usage() instead of main() functions - Impact: Users couldn't use quality, stream, update, multilang features **Fixed Files:** - src/skill_seekers/cli/quality_metrics.py - Renamed example_usage() → main() - Added argparse with --report, --output flags - Proper exit codes and error handling - src/skill_seekers/cli/streaming_ingest.py - Renamed example_usage() → main() - Added argparse with --chunk-size, --batch-size, --checkpoint flags - Supports both file and directory inputs - src/skill_seekers/cli/incremental_updater.py - Renamed example_usage() → main() - Added argparse with --check-changes, --generate-package, --apply-update flags - Proper error handling and exit codes - src/skill_seekers/cli/multilang_support.py - Renamed example_usage() → main() - Added argparse with --detect, --report, --export flags - Loads skill documents from directory Issue #2: Haystack Missing from Package Choices ⚠️ CRITICAL - Problem: Haystack adaptor worked but couldn't be used via CLI - Root Cause: package_skill.py missing "haystack" in --target choices - Impact: Users got "invalid choice" error when packaging for Haystack **Fixed:** - src/skill_seekers/cli/package_skill.py:188 - Added "haystack" to --target choices list - Now matches main.py choices (all 11 platforms) **Verification:** ✅ All 4 CLI commands now work: $ skill-seekers quality --help $ skill-seekers stream --help $ skill-seekers update --help $ skill-seekers multilang --help ✅ Haystack now available: $ skill-seekers package output/skill --target haystack ✅ All 164 adaptor tests still passing ✅ No regressions detected **Credits:** - Issues identified by: Kimi QA Review - Fixes implemented by: Claude Sonnet 4.5 Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
440 lines
14 KiB
Python
440 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Streaming Ingestion for Large Documentation Sets
|
|
|
|
Provides memory-efficient processing and batch upload capabilities for large
|
|
skill documentation. Handles chunking, progress tracking, and resume functionality.
|
|
"""
|
|
|
|
import json
|
|
import hashlib
|
|
from pathlib import Path
|
|
from typing import Any, Iterator, Optional
|
|
from dataclasses import dataclass
|
|
import time
|
|
|
|
|
|
@dataclass
|
|
class ChunkMetadata:
|
|
"""Metadata for a document chunk."""
|
|
chunk_id: str
|
|
source: str
|
|
category: str
|
|
file: str
|
|
chunk_index: int
|
|
total_chunks: int
|
|
char_start: int
|
|
char_end: int
|
|
|
|
|
|
@dataclass
|
|
class IngestionProgress:
|
|
"""Progress tracking for streaming ingestion."""
|
|
total_documents: int
|
|
processed_documents: int
|
|
total_chunks: int
|
|
processed_chunks: int
|
|
failed_chunks: int
|
|
bytes_processed: int
|
|
start_time: float
|
|
|
|
@property
|
|
def progress_percent(self) -> float:
|
|
"""Calculate progress percentage."""
|
|
if self.total_chunks == 0:
|
|
return 0.0
|
|
return (self.processed_chunks / self.total_chunks) * 100
|
|
|
|
@property
|
|
def elapsed_time(self) -> float:
|
|
"""Calculate elapsed time in seconds."""
|
|
return time.time() - self.start_time
|
|
|
|
@property
|
|
def chunks_per_second(self) -> float:
|
|
"""Calculate processing rate."""
|
|
elapsed = self.elapsed_time
|
|
if elapsed == 0:
|
|
return 0.0
|
|
return self.processed_chunks / elapsed
|
|
|
|
@property
|
|
def eta_seconds(self) -> float:
|
|
"""Estimate time remaining in seconds."""
|
|
rate = self.chunks_per_second
|
|
if rate == 0:
|
|
return 0.0
|
|
remaining = self.total_chunks - self.processed_chunks
|
|
return remaining / rate
|
|
|
|
|
|
class StreamingIngester:
|
|
"""
|
|
Streaming ingestion manager for large documentation sets.
|
|
|
|
Provides memory-efficient processing with chunking, progress tracking,
|
|
and resume capabilities.
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
chunk_size: int = 4000,
|
|
chunk_overlap: int = 200,
|
|
batch_size: int = 100,
|
|
max_memory_mb: int = 500
|
|
):
|
|
"""
|
|
Initialize streaming ingester.
|
|
|
|
Args:
|
|
chunk_size: Maximum characters per chunk
|
|
chunk_overlap: Overlap between chunks (for context)
|
|
batch_size: Number of chunks per batch
|
|
max_memory_mb: Maximum memory usage in MB
|
|
"""
|
|
self.chunk_size = chunk_size
|
|
self.chunk_overlap = chunk_overlap
|
|
self.batch_size = batch_size
|
|
self.max_memory_mb = max_memory_mb
|
|
self.progress = None
|
|
|
|
def chunk_document(
|
|
self,
|
|
content: str,
|
|
metadata: dict,
|
|
chunk_size: Optional[int] = None,
|
|
chunk_overlap: Optional[int] = None
|
|
) -> Iterator[tuple[str, ChunkMetadata]]:
|
|
"""
|
|
Split document into overlapping chunks.
|
|
|
|
Args:
|
|
content: Document content
|
|
metadata: Document metadata
|
|
chunk_size: Override default chunk size
|
|
chunk_overlap: Override default overlap
|
|
|
|
Yields:
|
|
Tuple of (chunk_text, chunk_metadata)
|
|
"""
|
|
chunk_size = chunk_size or self.chunk_size
|
|
chunk_overlap = chunk_overlap or self.chunk_overlap
|
|
|
|
if len(content) <= chunk_size:
|
|
# Document fits in single chunk
|
|
chunk_meta = ChunkMetadata(
|
|
chunk_id=self._generate_chunk_id(content, metadata, 0),
|
|
source=metadata.get("source", ""),
|
|
category=metadata.get("category", ""),
|
|
file=metadata.get("file", ""),
|
|
chunk_index=0,
|
|
total_chunks=1,
|
|
char_start=0,
|
|
char_end=len(content)
|
|
)
|
|
yield content, chunk_meta
|
|
return
|
|
|
|
# Calculate total chunks
|
|
effective_chunk_size = chunk_size - chunk_overlap
|
|
total_chunks = (len(content) - chunk_overlap) // effective_chunk_size + 1
|
|
|
|
# Generate chunks with overlap
|
|
for i in range(total_chunks):
|
|
start = i * effective_chunk_size
|
|
end = start + chunk_size
|
|
|
|
# Ensure we don't go past the end
|
|
if end > len(content):
|
|
end = len(content)
|
|
|
|
chunk_text = content[start:end]
|
|
|
|
# Skip empty chunks
|
|
if not chunk_text.strip():
|
|
continue
|
|
|
|
chunk_meta = ChunkMetadata(
|
|
chunk_id=self._generate_chunk_id(chunk_text, metadata, i),
|
|
source=metadata.get("source", ""),
|
|
category=metadata.get("category", ""),
|
|
file=metadata.get("file", ""),
|
|
chunk_index=i,
|
|
total_chunks=total_chunks,
|
|
char_start=start,
|
|
char_end=end
|
|
)
|
|
|
|
yield chunk_text, chunk_meta
|
|
|
|
def _generate_chunk_id(self, content: str, metadata: dict, chunk_index: int) -> str:
|
|
"""Generate deterministic chunk ID."""
|
|
id_string = (
|
|
f"{metadata.get('source', '')}-"
|
|
f"{metadata.get('file', '')}-"
|
|
f"{chunk_index}-"
|
|
f"{content[:50]}"
|
|
)
|
|
return hashlib.md5(id_string.encode()).hexdigest()
|
|
|
|
def stream_skill_directory(
|
|
self,
|
|
skill_dir: Path,
|
|
callback: Optional[callable] = None
|
|
) -> Iterator[tuple[str, dict]]:
|
|
"""
|
|
Stream all documents from skill directory.
|
|
|
|
Args:
|
|
skill_dir: Path to skill directory
|
|
callback: Optional progress callback(progress: IngestionProgress)
|
|
|
|
Yields:
|
|
Tuple of (chunk_text, chunk_metadata_dict)
|
|
"""
|
|
skill_dir = Path(skill_dir)
|
|
|
|
# Count total documents first
|
|
doc_files = []
|
|
|
|
# SKILL.md
|
|
skill_md = skill_dir / "SKILL.md"
|
|
if skill_md.exists():
|
|
doc_files.append(("SKILL.md", "overview", skill_md))
|
|
|
|
# Reference files
|
|
refs_dir = skill_dir / "references"
|
|
if refs_dir.exists():
|
|
for ref_file in sorted(refs_dir.glob("*.md")):
|
|
if ref_file.is_file() and not ref_file.name.startswith("."):
|
|
category = ref_file.stem.replace("_", " ").lower()
|
|
doc_files.append((ref_file.name, category, ref_file))
|
|
|
|
# Initialize progress tracking
|
|
self.progress = IngestionProgress(
|
|
total_documents=len(doc_files),
|
|
processed_documents=0,
|
|
total_chunks=0, # Will be updated as we chunk
|
|
processed_chunks=0,
|
|
failed_chunks=0,
|
|
bytes_processed=0,
|
|
start_time=time.time()
|
|
)
|
|
|
|
# Process each document
|
|
for filename, category, filepath in doc_files:
|
|
try:
|
|
content = filepath.read_text(encoding="utf-8")
|
|
|
|
if not content.strip():
|
|
self.progress.processed_documents += 1
|
|
continue
|
|
|
|
metadata = {
|
|
"source": skill_dir.name,
|
|
"category": category,
|
|
"file": filename,
|
|
"type": "documentation" if filename == "SKILL.md" else "reference",
|
|
"version": "1.0.0"
|
|
}
|
|
|
|
# Chunk document and yield chunks
|
|
chunk_count = 0
|
|
for chunk_text, chunk_meta in self.chunk_document(content, metadata):
|
|
chunk_count += 1
|
|
self.progress.total_chunks += 1
|
|
|
|
# Convert chunk metadata to dict
|
|
chunk_dict = {
|
|
"content": chunk_text,
|
|
"chunk_id": chunk_meta.chunk_id,
|
|
"source": chunk_meta.source,
|
|
"category": chunk_meta.category,
|
|
"file": chunk_meta.file,
|
|
"chunk_index": chunk_meta.chunk_index,
|
|
"total_chunks": chunk_meta.total_chunks,
|
|
"char_start": chunk_meta.char_start,
|
|
"char_end": chunk_meta.char_end,
|
|
}
|
|
|
|
yield chunk_text, chunk_dict
|
|
|
|
self.progress.processed_chunks += 1
|
|
self.progress.bytes_processed += len(chunk_text.encode("utf-8"))
|
|
|
|
# Callback for progress updates
|
|
if callback:
|
|
callback(self.progress)
|
|
|
|
self.progress.processed_documents += 1
|
|
|
|
except Exception as e:
|
|
print(f"⚠️ Warning: Failed to process {filename}: {e}")
|
|
self.progress.failed_chunks += 1
|
|
continue
|
|
|
|
def batch_iterator(
|
|
self,
|
|
chunks: Iterator[tuple[str, dict]],
|
|
batch_size: Optional[int] = None
|
|
) -> Iterator[list[tuple[str, dict]]]:
|
|
"""
|
|
Group chunks into batches for efficient processing.
|
|
|
|
Args:
|
|
chunks: Iterator of (chunk_text, chunk_metadata) tuples
|
|
batch_size: Override default batch size
|
|
|
|
Yields:
|
|
List of chunks (batch)
|
|
"""
|
|
batch_size = batch_size or self.batch_size
|
|
batch = []
|
|
|
|
for chunk in chunks:
|
|
batch.append(chunk)
|
|
|
|
if len(batch) >= batch_size:
|
|
yield batch
|
|
batch = []
|
|
|
|
# Yield remaining chunks
|
|
if batch:
|
|
yield batch
|
|
|
|
def save_checkpoint(self, checkpoint_path: Path, state: dict) -> None:
|
|
"""
|
|
Save ingestion checkpoint for resume capability.
|
|
|
|
Args:
|
|
checkpoint_path: Path to checkpoint file
|
|
state: State dictionary to save
|
|
"""
|
|
checkpoint_path = Path(checkpoint_path)
|
|
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
checkpoint_data = {
|
|
"timestamp": time.time(),
|
|
"progress": {
|
|
"total_documents": self.progress.total_documents,
|
|
"processed_documents": self.progress.processed_documents,
|
|
"total_chunks": self.progress.total_chunks,
|
|
"processed_chunks": self.progress.processed_chunks,
|
|
"failed_chunks": self.progress.failed_chunks,
|
|
"bytes_processed": self.progress.bytes_processed,
|
|
},
|
|
"state": state
|
|
}
|
|
|
|
checkpoint_path.write_text(json.dumps(checkpoint_data, indent=2))
|
|
|
|
def load_checkpoint(self, checkpoint_path: Path) -> Optional[dict]:
|
|
"""
|
|
Load ingestion checkpoint for resume.
|
|
|
|
Args:
|
|
checkpoint_path: Path to checkpoint file
|
|
|
|
Returns:
|
|
State dictionary or None if not found
|
|
"""
|
|
checkpoint_path = Path(checkpoint_path)
|
|
|
|
if not checkpoint_path.exists():
|
|
return None
|
|
|
|
try:
|
|
checkpoint_data = json.loads(checkpoint_path.read_text())
|
|
return checkpoint_data.get("state")
|
|
except Exception as e:
|
|
print(f"⚠️ Warning: Failed to load checkpoint: {e}")
|
|
return None
|
|
|
|
def format_progress(self) -> str:
|
|
"""
|
|
Format progress as human-readable string.
|
|
|
|
Returns:
|
|
Progress string
|
|
"""
|
|
if not self.progress:
|
|
return "No progress data"
|
|
|
|
p = self.progress
|
|
|
|
lines = [
|
|
f"📊 Progress: {p.progress_percent:.1f}% complete",
|
|
f" Documents: {p.processed_documents}/{p.total_documents}",
|
|
f" Chunks: {p.processed_chunks}/{p.total_chunks}",
|
|
f" Rate: {p.chunks_per_second:.1f} chunks/sec",
|
|
f" Elapsed: {p.elapsed_time:.1f}s",
|
|
]
|
|
|
|
if p.eta_seconds > 0:
|
|
lines.append(f" ETA: {p.eta_seconds:.1f}s")
|
|
|
|
if p.failed_chunks > 0:
|
|
lines.append(f" ⚠️ Failed: {p.failed_chunks} chunks")
|
|
|
|
return "\n".join(lines)
|
|
|
|
|
|
def main():
|
|
"""CLI entry point for streaming ingestion."""
|
|
import argparse
|
|
|
|
parser = argparse.ArgumentParser(description="Stream and chunk skill documents")
|
|
parser.add_argument("input", help="Input file or directory path")
|
|
parser.add_argument("--chunk-size", type=int, default=4000, help="Chunk size in characters")
|
|
parser.add_argument("--chunk-overlap", type=int, default=200, help="Chunk overlap in characters")
|
|
parser.add_argument("--batch-size", type=int, default=100, help="Batch size for processing")
|
|
parser.add_argument("--checkpoint", help="Checkpoint file path")
|
|
args = parser.parse_args()
|
|
|
|
# Initialize ingester
|
|
ingester = StreamingIngester(
|
|
chunk_size=args.chunk_size,
|
|
chunk_overlap=args.chunk_overlap,
|
|
batch_size=args.batch_size
|
|
)
|
|
|
|
# Progress callback
|
|
def on_progress(progress: IngestionProgress):
|
|
if progress.processed_chunks % 10 == 0:
|
|
print(f"Progress: {progress.progress_percent:.1f}% - "
|
|
f"{progress.processed_chunks}/{progress.total_chunks} chunks")
|
|
|
|
# Stream input
|
|
input_path = Path(args.input)
|
|
if not input_path.exists():
|
|
print(f"❌ Error: Path not found: {input_path}")
|
|
return 1
|
|
|
|
if input_path.is_dir():
|
|
chunks = ingester.stream_skill_directory(input_path, callback=on_progress)
|
|
else:
|
|
chunks = ingester.stream_file(input_path, callback=on_progress)
|
|
|
|
# Process in batches
|
|
all_chunks = []
|
|
for batch in ingester.batch_iterator(chunks, batch_size=args.batch_size):
|
|
print(f"\nProcessing batch of {len(batch)} chunks...")
|
|
all_chunks.extend(batch)
|
|
|
|
# Save checkpoint if specified
|
|
if args.checkpoint:
|
|
ingester.save_checkpoint(
|
|
Path(args.checkpoint),
|
|
{"processed_batches": len(all_chunks) // args.batch_size}
|
|
)
|
|
|
|
# Final progress
|
|
print("\n" + ingester.format_progress())
|
|
print(f"\n✅ Processed {len(all_chunks)} total chunks")
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import sys
|
|
sys.exit(main())
|