refactor: rename all chunk flags to include explicit units
Replace ambiguous --chunk-size / --chunk-overlap names that meant different things in different contexts (tokens vs characters) with fully explicit names:

- --chunk-size (RAG tokens) → --chunk-tokens
- --chunk-overlap (RAG tokens) → --chunk-overlap-tokens
- --chunk (enable RAG chunking) → --chunk-for-rag
- --streaming-chunk-size (chars) → --streaming-chunk-chars
- --streaming-overlap (chars) → --streaming-overlap-chars
- --chunk-size (PDF pages) → --pdf-pages-per-chunk (poc file)

Also aligns stream_parser.py help with streaming_ingest.py standalone parser.

All 2167 tests pass.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -101,8 +101,8 @@ RAG_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
"help": "Enable semantic chunking for RAG pipelines",
|
||||
},
|
||||
},
|
||||
"chunk_size": {
|
||||
"flags": ("--chunk-size",),
|
||||
"chunk_tokens": {
|
||||
"flags": ("--chunk-tokens",),
|
||||
"kwargs": {
|
||||
"type": int,
|
||||
"default": 512,
|
||||
@@ -110,8 +110,8 @@ RAG_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
"help": "Chunk size in tokens for RAG (default: 512)",
|
||||
},
|
||||
},
|
||||
"chunk_overlap": {
|
||||
"flags": ("--chunk-overlap",),
|
||||
"chunk_overlap_tokens": {
|
||||
"flags": ("--chunk-overlap-tokens",),
|
||||
"kwargs": {
|
||||
"type": int,
|
||||
"default": 50,
|
||||
@@ -183,7 +183,7 @@ def add_rag_arguments(parser: argparse.ArgumentParser) -> None:
|
||||
Example:
|
||||
>>> parser = argparse.ArgumentParser()
|
||||
>>> add_rag_arguments(parser)
|
||||
>>> # Now parser has --chunk-for-rag, --chunk-size, --chunk-overlap
|
||||
>>> # Now parser has --chunk-for-rag, --chunk-tokens, --chunk-overlap-tokens
|
||||
"""
|
||||
for arg_name, arg_def in RAG_ARGUMENTS.items():
|
||||
flags = arg_def["flags"]
|
||||
@@ -195,7 +195,7 @@ def get_rag_argument_names() -> set:
|
||||
"""Get the set of RAG argument destination names.
|
||||
|
||||
Returns:
|
||||
Set of argument dest names (e.g., {'chunk_for_rag', 'chunk_size', 'chunk_overlap'})
|
||||
Set of argument dest names (e.g., {'chunk_for_rag', 'chunk_tokens', 'chunk_overlap_tokens'})
|
||||
"""
|
||||
return set(RAG_ARGUMENTS.keys())
|
||||
|
||||
|
||||
@@ -70,8 +70,8 @@ PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
"help": "Use streaming ingestion for large docs (memory-efficient)",
|
||||
},
|
||||
},
|
||||
"streaming_chunk_size": {
|
||||
"flags": ("--streaming-chunk-size",),
|
||||
"streaming_chunk_chars": {
|
||||
"flags": ("--streaming-chunk-chars",),
|
||||
"kwargs": {
|
||||
"type": int,
|
||||
"default": 4000,
|
||||
@@ -79,8 +79,8 @@ PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
"metavar": "N",
|
||||
},
|
||||
},
|
||||
"streaming_overlap": {
|
||||
"flags": ("--streaming-overlap",),
|
||||
"streaming_overlap_chars": {
|
||||
"flags": ("--streaming-overlap-chars",),
|
||||
"kwargs": {
|
||||
"type": int,
|
||||
"default": 200,
|
||||
@@ -98,8 +98,8 @@ PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
},
|
||||
},
|
||||
# RAG chunking options
|
||||
"chunk": {
|
||||
"flags": ("--chunk",),
|
||||
"chunk_for_rag": {
|
||||
"flags": ("--chunk-for-rag",),
|
||||
"kwargs": {
|
||||
"action": "store_true",
|
||||
"help": "Enable intelligent chunking for RAG platforms (auto-enabled for RAG adaptors)",
|
||||
|
||||
@@ -106,8 +106,8 @@ class CreateCommand:
|
||||
# Check against common defaults
|
||||
defaults = {
|
||||
"max_issues": 100,
|
||||
"chunk_size": 512,
|
||||
"chunk_overlap": 50,
|
||||
"chunk_tokens": 512,
|
||||
"chunk_overlap_tokens": 50,
|
||||
"output": None,
|
||||
}
|
||||
|
||||
@@ -158,10 +158,10 @@ class CreateCommand:
|
||||
# RAG arguments (web scraper only)
|
||||
if getattr(self.args, "chunk_for_rag", False):
|
||||
argv.append("--chunk-for-rag")
|
||||
if getattr(self.args, "chunk_size", None) and self.args.chunk_size != 512:
|
||||
argv.extend(["--chunk-size", str(self.args.chunk_size)])
|
||||
if getattr(self.args, "chunk_overlap", None) and self.args.chunk_overlap != 50:
|
||||
argv.extend(["--chunk-overlap", str(self.args.chunk_overlap)])
|
||||
if getattr(self.args, "chunk_tokens", None) and self.args.chunk_tokens != 512:
|
||||
argv.extend(["--chunk-tokens", str(self.args.chunk_tokens)])
|
||||
if getattr(self.args, "chunk_overlap_tokens", None) and self.args.chunk_overlap_tokens != 50:
|
||||
argv.extend(["--chunk-overlap-tokens", str(self.args.chunk_overlap_tokens)])
|
||||
|
||||
# Advanced web-specific arguments
|
||||
if getattr(self.args, "no_preserve_code_blocks", False):
|
||||
|
||||
@@ -2227,8 +2227,8 @@ def execute_scraping_and_building(
|
||||
from skill_seekers.cli.rag_chunker import RAGChunker
|
||||
|
||||
chunker = RAGChunker(
|
||||
chunk_size=args.chunk_size,
|
||||
chunk_overlap=args.chunk_overlap,
|
||||
chunk_size=args.chunk_tokens,
|
||||
chunk_overlap=args.chunk_overlap_tokens,
|
||||
preserve_code_blocks=not args.no_preserve_code_blocks,
|
||||
preserve_paragraphs=not args.no_preserve_paragraphs,
|
||||
)
|
||||
|
||||
@@ -221,10 +221,10 @@ Examples:
|
||||
skip_quality_check=args.skip_quality_check,
|
||||
target=args.target,
|
||||
streaming=args.streaming,
|
||||
chunk_size=args.streaming_chunk_size,
|
||||
chunk_overlap=args.streaming_overlap,
|
||||
chunk_size=args.streaming_chunk_chars,
|
||||
chunk_overlap=args.streaming_overlap_chars,
|
||||
batch_size=args.batch_size,
|
||||
enable_chunking=args.chunk,
|
||||
enable_chunking=args.chunk_for_rag,
|
||||
chunk_max_tokens=args.chunk_tokens,
|
||||
preserve_code_blocks=not args.no_preserve_code,
|
||||
)
|
||||
|
||||
@@ -21,5 +21,5 @@ class StreamParser(SubcommandParser):
|
||||
def add_arguments(self, parser):
|
||||
"""Add stream-specific arguments."""
|
||||
parser.add_argument("input_file", help="Large file to stream")
|
||||
parser.add_argument("--chunk-size", type=int, default=1024, help="Chunk size in KB")
|
||||
parser.add_argument("--streaming-chunk-chars", type=int, default=4000, help="Maximum characters per chunk (default: 4000)")
|
||||
parser.add_argument("--output", help="Output directory")
|
||||
|
||||
@@ -44,7 +44,7 @@ Usage:
|
||||
|
||||
Example:
|
||||
python3 pdf_extractor_poc.py docs/manual.pdf -o output.json -v \
|
||||
--chunk-size 15 --min-quality 6.0 --extract-images \
|
||||
--pdf-pages-per-chunk 15 --min-quality 6.0 --extract-images \
|
||||
--extract-tables --parallel
|
||||
"""
|
||||
|
||||
@@ -1079,7 +1079,7 @@ Examples:
|
||||
parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
|
||||
parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
|
||||
parser.add_argument(
|
||||
"--chunk-size", type=int, default=10, help="Pages per chunk (0 = no chunking, default: 10)"
|
||||
"--pdf-pages-per-chunk", type=int, default=10, help="Pages per chunk (0 = no chunking, default: 10)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-merge", action="store_true", help="Disable merging code blocks across pages"
|
||||
@@ -1138,7 +1138,7 @@ Examples:
|
||||
extractor = PDFExtractor(
|
||||
args.pdf_file,
|
||||
verbose=args.verbose,
|
||||
chunk_size=args.chunk_size,
|
||||
chunk_size=args.pdf_pages_per_chunk,
|
||||
min_quality=args.min_quality,
|
||||
extract_images=args.extract_images,
|
||||
image_dir=args.image_dir,
|
||||
|
||||
@@ -383,8 +383,8 @@ def main():
|
||||
)
|
||||
parser.add_argument("skill_dir", type=Path, help="Path to skill directory")
|
||||
parser.add_argument("--output", "-o", type=Path, help="Output JSON file")
|
||||
parser.add_argument("--chunk-size", type=int, default=512, help="Target chunk size in tokens")
|
||||
parser.add_argument("--chunk-overlap", type=int, default=50, help="Overlap size in tokens")
|
||||
parser.add_argument("--chunk-tokens", type=int, default=512, help="Target chunk size in tokens")
|
||||
parser.add_argument("--chunk-overlap-tokens", type=int, default=50, help="Overlap size in tokens")
|
||||
parser.add_argument("--no-code-blocks", action="store_true", help="Don't preserve code blocks")
|
||||
parser.add_argument("--no-paragraphs", action="store_true", help="Don't preserve paragraphs")
|
||||
|
||||
@@ -392,8 +392,8 @@ def main():
|
||||
|
||||
# Create chunker
|
||||
chunker = RAGChunker(
|
||||
chunk_size=args.chunk_size,
|
||||
chunk_overlap=args.chunk_overlap,
|
||||
chunk_size=args.chunk_tokens,
|
||||
chunk_overlap=args.chunk_overlap_tokens,
|
||||
preserve_code_blocks=not args.no_code_blocks,
|
||||
preserve_paragraphs=not args.no_paragraphs,
|
||||
)
|
||||
|
||||
@@ -380,9 +380,9 @@ def main():
|
||||
|
||||
parser = argparse.ArgumentParser(description="Stream and chunk skill documents")
|
||||
parser.add_argument("input", help="Input file or directory path")
|
||||
parser.add_argument("--chunk-size", type=int, default=4000, help="Chunk size in characters")
|
||||
parser.add_argument("--streaming-chunk-chars", type=int, default=4000, help="Chunk size in characters")
|
||||
parser.add_argument(
|
||||
"--chunk-overlap", type=int, default=200, help="Chunk overlap in characters"
|
||||
"--streaming-overlap-chars", type=int, default=200, help="Chunk overlap in characters"
|
||||
)
|
||||
parser.add_argument("--batch-size", type=int, default=100, help="Batch size for processing")
|
||||
parser.add_argument("--checkpoint", help="Checkpoint file path")
|
||||
@@ -390,7 +390,7 @@ def main():
|
||||
|
||||
# Initialize ingester
|
||||
ingester = StreamingIngester(
|
||||
chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap, batch_size=args.batch_size
|
||||
chunk_size=args.streaming_chunk_chars, chunk_overlap=args.streaming_overlap_chars, batch_size=args.batch_size
|
||||
)
|
||||
|
||||
# Progress callback
|
||||
|
||||
@@ -291,7 +291,7 @@ class TestChunkingCLIIntegration:
|
||||
"""Test chunking via CLI arguments."""
|
||||
|
||||
def test_chunk_flag(self, tmp_path):
|
||||
"""Test --chunk flag enables chunking."""
|
||||
"""Test --chunk-for-rag flag enables chunking."""
|
||||
from skill_seekers.cli.package_skill import package_skill
|
||||
|
||||
skill_dir = create_test_skill(tmp_path, large_doc=True)
|
||||
@@ -301,7 +301,7 @@ class TestChunkingCLIIntegration:
|
||||
open_folder_after=False,
|
||||
skip_quality_check=True,
|
||||
target="langchain",
|
||||
enable_chunking=True, # --chunk flag
|
||||
enable_chunking=True, # --chunk-for-rag flag
|
||||
chunk_max_tokens=512,
|
||||
preserve_code_blocks=True,
|
||||
)
|
||||
|
||||
@@ -32,8 +32,8 @@ class TestParserSync:
|
||||
["skill-seekers", "scrape", "--help"], capture_output=True, text=True
|
||||
)
|
||||
assert "--chunk-for-rag" in result.stdout, "Help should show --chunk-for-rag flag"
|
||||
assert "--chunk-size" in result.stdout, "Help should show --chunk-size flag"
|
||||
assert "--chunk-overlap" in result.stdout, "Help should show --chunk-overlap flag"
|
||||
assert "--chunk-tokens" in result.stdout, "Help should show --chunk-tokens flag"
|
||||
assert "--chunk-overlap-tokens" in result.stdout, "Help should show --chunk-overlap-tokens flag"
|
||||
|
||||
def test_scrape_verbose_flag_works(self):
|
||||
"""Test that --verbose flag (previously missing) now works."""
|
||||
|
||||
@@ -40,8 +40,8 @@ class TestUniversalArguments:
|
||||
"verbose",
|
||||
"quiet",
|
||||
"chunk_for_rag",
|
||||
"chunk_size",
|
||||
"chunk_overlap", # Phase 2: RAG args from common.py
|
||||
"chunk_tokens",
|
||||
"chunk_overlap_tokens", # Phase 2: RAG args from common.py
|
||||
"preset",
|
||||
"config",
|
||||
# Phase 2: Workflow arguments (universal workflow support)
|
||||
|
||||
Reference in New Issue
Block a user