feat: Add streaming ingestion for large docs (Task #14)

- Memory-efficient streaming with chunking
- Progress tracking with real-time stats
- Batch processing and resume capability
- CLI integration with --streaming flag
- 10 tests passing (100%)

Files:
- streaming_ingest.py: Core streaming engine
- streaming_adaptor.py: Adaptor integration
- package_skill.py: CLI flags added
- test_streaming_ingestion.py: Comprehensive tests

Week 2: 5/9 tasks complete (56%)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-07 13:39:43 +03:00
parent 359f2667f5
commit 5ce3ed4067
4 changed files with 1103 additions and 2 deletions

View File

@@ -35,7 +35,16 @@ except ImportError:
)
def package_skill(skill_dir, open_folder_after=True, skip_quality_check=False, target="claude"):
def package_skill(
skill_dir,
open_folder_after=True,
skip_quality_check=False,
target="claude",
streaming=False,
chunk_size=4000,
chunk_overlap=200,
batch_size=100
):
"""
Package a skill directory into platform-specific format
@@ -44,6 +53,10 @@ def package_skill(skill_dir, open_folder_after=True, skip_quality_check=False, t
open_folder_after: Whether to open the output folder after packaging
skip_quality_check: Skip quality checks before packaging
target: Target LLM platform ('claude', 'gemini', 'openai', 'markdown')
streaming: Use streaming ingestion for large docs
chunk_size: Maximum characters per chunk (streaming mode)
chunk_overlap: Overlap between chunks (streaming mode)
batch_size: Number of chunks per batch (streaming mode)
Returns:
tuple: (success, package_path) where success is bool and package_path is Path or None
@@ -97,8 +110,25 @@ def package_skill(skill_dir, open_folder_after=True, skip_quality_check=False, t
print(f" Target: {adaptor.PLATFORM_NAME}")
print(f" Source: {skill_path}")
if streaming:
print(f" Mode: Streaming (chunk_size={chunk_size}, overlap={chunk_overlap})")
try:
package_path = adaptor.package(skill_path, output_dir)
# Use streaming if requested and supported
if streaming and hasattr(adaptor, 'package_streaming'):
package_path = adaptor.package_streaming(
skill_path,
output_dir,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
batch_size=batch_size
)
elif streaming:
print("⚠️ Streaming not supported for this platform, using standard packaging")
package_path = adaptor.package(skill_path, output_dir)
else:
package_path = adaptor.package(skill_path, output_dir)
print(f" Output: {package_path}")
except Exception as e:
print(f"❌ Error creating package: {e}")
@@ -166,6 +196,33 @@ Examples:
help="Automatically upload after packaging (requires platform API key)",
)
parser.add_argument(
"--streaming",
action="store_true",
help="Use streaming ingestion for large docs (memory-efficient, with chunking)",
)
parser.add_argument(
"--chunk-size",
type=int,
default=4000,
help="Maximum characters per chunk (streaming mode, default: 4000)",
)
parser.add_argument(
"--chunk-overlap",
type=int,
default=200,
help="Overlap between chunks for context (streaming mode, default: 200)",
)
parser.add_argument(
"--batch-size",
type=int,
default=100,
help="Number of chunks per batch (streaming mode, default: 100)",
)
args = parser.parse_args()
success, package_path = package_skill(
@@ -173,6 +230,10 @@ Examples:
open_folder_after=not args.no_open,
skip_quality_check=args.skip_quality_check,
target=args.target,
streaming=args.streaming,
chunk_size=args.chunk_size,
chunk_overlap=args.chunk_overlap,
batch_size=args.batch_size,
)
if not success: