refactor: rename all chunk flags to include explicit units

Replace ambiguous --chunk-size / --chunk-overlap names that meant different
things in different contexts (tokens vs characters vs PDF pages) with fully explicit names:

- --chunk-size (RAG tokens)     → --chunk-tokens
- --chunk-overlap (RAG tokens)  → --chunk-overlap-tokens
- --chunk (enable RAG chunking) → --chunk-for-rag
- --streaming-chunk-size (chars) → --streaming-chunk-chars
- --streaming-overlap (chars)    → --streaming-overlap-chars
- --chunk-size (PDF pages)       → --pdf-pages-per-chunk (poc file)

Also replaces stream_parser.py's KB-denominated --chunk-size (default 1024) with
--streaming-chunk-chars (default 4000) to match the streaming_ingest.py standalone parser.
All 2167 tests pass.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-24 22:07:56 +03:00
parent b636a0a292
commit 7a2ffb286c
12 changed files with 40 additions and 40 deletions

View File

@@ -101,8 +101,8 @@ RAG_ARGUMENTS: dict[str, dict[str, Any]] = {
"help": "Enable semantic chunking for RAG pipelines",
},
},
"chunk_size": {
"flags": ("--chunk-size",),
"chunk_tokens": {
"flags": ("--chunk-tokens",),
"kwargs": {
"type": int,
"default": 512,
@@ -110,8 +110,8 @@ RAG_ARGUMENTS: dict[str, dict[str, Any]] = {
"help": "Chunk size in tokens for RAG (default: 512)",
},
},
"chunk_overlap": {
"flags": ("--chunk-overlap",),
"chunk_overlap_tokens": {
"flags": ("--chunk-overlap-tokens",),
"kwargs": {
"type": int,
"default": 50,
@@ -183,7 +183,7 @@ def add_rag_arguments(parser: argparse.ArgumentParser) -> None:
Example:
>>> parser = argparse.ArgumentParser()
>>> add_rag_arguments(parser)
>>> # Now parser has --chunk-for-rag, --chunk-size, --chunk-overlap
>>> # Now parser has --chunk-for-rag, --chunk-tokens, --chunk-overlap-tokens
"""
for arg_name, arg_def in RAG_ARGUMENTS.items():
flags = arg_def["flags"]
@@ -195,7 +195,7 @@ def get_rag_argument_names() -> set:
"""Get the set of RAG argument destination names.
Returns:
Set of argument dest names (e.g., {'chunk_for_rag', 'chunk_size', 'chunk_overlap'})
Set of argument dest names (e.g., {'chunk_for_rag', 'chunk_tokens', 'chunk_overlap_tokens'})
"""
return set(RAG_ARGUMENTS.keys())

View File

@@ -70,8 +70,8 @@ PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
"help": "Use streaming ingestion for large docs (memory-efficient)",
},
},
"streaming_chunk_size": {
"flags": ("--streaming-chunk-size",),
"streaming_chunk_chars": {
"flags": ("--streaming-chunk-chars",),
"kwargs": {
"type": int,
"default": 4000,
@@ -79,8 +79,8 @@ PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
"metavar": "N",
},
},
"streaming_overlap": {
"flags": ("--streaming-overlap",),
"streaming_overlap_chars": {
"flags": ("--streaming-overlap-chars",),
"kwargs": {
"type": int,
"default": 200,
@@ -98,8 +98,8 @@ PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
},
},
# RAG chunking options
"chunk": {
"flags": ("--chunk",),
"chunk_for_rag": {
"flags": ("--chunk-for-rag",),
"kwargs": {
"action": "store_true",
"help": "Enable intelligent chunking for RAG platforms (auto-enabled for RAG adaptors)",

View File

@@ -106,8 +106,8 @@ class CreateCommand:
# Check against common defaults
defaults = {
"max_issues": 100,
"chunk_size": 512,
"chunk_overlap": 50,
"chunk_tokens": 512,
"chunk_overlap_tokens": 50,
"output": None,
}
@@ -158,10 +158,10 @@ class CreateCommand:
# RAG arguments (web scraper only)
if getattr(self.args, "chunk_for_rag", False):
argv.append("--chunk-for-rag")
if getattr(self.args, "chunk_size", None) and self.args.chunk_size != 512:
argv.extend(["--chunk-size", str(self.args.chunk_size)])
if getattr(self.args, "chunk_overlap", None) and self.args.chunk_overlap != 50:
argv.extend(["--chunk-overlap", str(self.args.chunk_overlap)])
if getattr(self.args, "chunk_tokens", None) and self.args.chunk_tokens != 512:
argv.extend(["--chunk-tokens", str(self.args.chunk_tokens)])
if getattr(self.args, "chunk_overlap_tokens", None) and self.args.chunk_overlap_tokens != 50:
argv.extend(["--chunk-overlap-tokens", str(self.args.chunk_overlap_tokens)])
# Advanced web-specific arguments
if getattr(self.args, "no_preserve_code_blocks", False):

View File

@@ -2227,8 +2227,8 @@ def execute_scraping_and_building(
from skill_seekers.cli.rag_chunker import RAGChunker
chunker = RAGChunker(
chunk_size=args.chunk_size,
chunk_overlap=args.chunk_overlap,
chunk_size=args.chunk_tokens,
chunk_overlap=args.chunk_overlap_tokens,
preserve_code_blocks=not args.no_preserve_code_blocks,
preserve_paragraphs=not args.no_preserve_paragraphs,
)

View File

@@ -221,10 +221,10 @@ Examples:
skip_quality_check=args.skip_quality_check,
target=args.target,
streaming=args.streaming,
chunk_size=args.streaming_chunk_size,
chunk_overlap=args.streaming_overlap,
chunk_size=args.streaming_chunk_chars,
chunk_overlap=args.streaming_overlap_chars,
batch_size=args.batch_size,
enable_chunking=args.chunk,
enable_chunking=args.chunk_for_rag,
chunk_max_tokens=args.chunk_tokens,
preserve_code_blocks=not args.no_preserve_code,
)

View File

@@ -21,5 +21,5 @@ class StreamParser(SubcommandParser):
def add_arguments(self, parser):
"""Add stream-specific arguments."""
parser.add_argument("input_file", help="Large file to stream")
parser.add_argument("--chunk-size", type=int, default=1024, help="Chunk size in KB")
parser.add_argument("--streaming-chunk-chars", type=int, default=4000, help="Maximum characters per chunk (default: 4000)")
parser.add_argument("--output", help="Output directory")

View File

@@ -44,7 +44,7 @@ Usage:
Example:
python3 pdf_extractor_poc.py docs/manual.pdf -o output.json -v \
--chunk-size 15 --min-quality 6.0 --extract-images \
--pdf-pages-per-chunk 15 --min-quality 6.0 --extract-images \
--extract-tables --parallel
"""
@@ -1079,7 +1079,7 @@ Examples:
parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output")
parser.add_argument(
"--chunk-size", type=int, default=10, help="Pages per chunk (0 = no chunking, default: 10)"
"--pdf-pages-per-chunk", type=int, default=10, help="Pages per chunk (0 = no chunking, default: 10)"
)
parser.add_argument(
"--no-merge", action="store_true", help="Disable merging code blocks across pages"
@@ -1138,7 +1138,7 @@ Examples:
extractor = PDFExtractor(
args.pdf_file,
verbose=args.verbose,
chunk_size=args.chunk_size,
chunk_size=args.pdf_pages_per_chunk,
min_quality=args.min_quality,
extract_images=args.extract_images,
image_dir=args.image_dir,

View File

@@ -383,8 +383,8 @@ def main():
)
parser.add_argument("skill_dir", type=Path, help="Path to skill directory")
parser.add_argument("--output", "-o", type=Path, help="Output JSON file")
parser.add_argument("--chunk-size", type=int, default=512, help="Target chunk size in tokens")
parser.add_argument("--chunk-overlap", type=int, default=50, help="Overlap size in tokens")
parser.add_argument("--chunk-tokens", type=int, default=512, help="Target chunk size in tokens")
parser.add_argument("--chunk-overlap-tokens", type=int, default=50, help="Overlap size in tokens")
parser.add_argument("--no-code-blocks", action="store_true", help="Don't preserve code blocks")
parser.add_argument("--no-paragraphs", action="store_true", help="Don't preserve paragraphs")
@@ -392,8 +392,8 @@ def main():
# Create chunker
chunker = RAGChunker(
chunk_size=args.chunk_size,
chunk_overlap=args.chunk_overlap,
chunk_size=args.chunk_tokens,
chunk_overlap=args.chunk_overlap_tokens,
preserve_code_blocks=not args.no_code_blocks,
preserve_paragraphs=not args.no_paragraphs,
)

View File

@@ -380,9 +380,9 @@ def main():
parser = argparse.ArgumentParser(description="Stream and chunk skill documents")
parser.add_argument("input", help="Input file or directory path")
parser.add_argument("--chunk-size", type=int, default=4000, help="Chunk size in characters")
parser.add_argument("--streaming-chunk-chars", type=int, default=4000, help="Chunk size in characters")
parser.add_argument(
"--chunk-overlap", type=int, default=200, help="Chunk overlap in characters"
"--streaming-overlap-chars", type=int, default=200, help="Chunk overlap in characters"
)
parser.add_argument("--batch-size", type=int, default=100, help="Batch size for processing")
parser.add_argument("--checkpoint", help="Checkpoint file path")
@@ -390,7 +390,7 @@ def main():
# Initialize ingester
ingester = StreamingIngester(
chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap, batch_size=args.batch_size
chunk_size=args.streaming_chunk_chars, chunk_overlap=args.streaming_overlap_chars, batch_size=args.batch_size
)
# Progress callback

View File

@@ -291,7 +291,7 @@ class TestChunkingCLIIntegration:
"""Test chunking via CLI arguments."""
def test_chunk_flag(self, tmp_path):
"""Test --chunk flag enables chunking."""
"""Test --chunk-for-rag flag enables chunking."""
from skill_seekers.cli.package_skill import package_skill
skill_dir = create_test_skill(tmp_path, large_doc=True)
@@ -301,7 +301,7 @@ class TestChunkingCLIIntegration:
open_folder_after=False,
skip_quality_check=True,
target="langchain",
enable_chunking=True, # --chunk flag
enable_chunking=True, # --chunk-for-rag flag
chunk_max_tokens=512,
preserve_code_blocks=True,
)

View File

@@ -32,8 +32,8 @@ class TestParserSync:
["skill-seekers", "scrape", "--help"], capture_output=True, text=True
)
assert "--chunk-for-rag" in result.stdout, "Help should show --chunk-for-rag flag"
assert "--chunk-size" in result.stdout, "Help should show --chunk-size flag"
assert "--chunk-overlap" in result.stdout, "Help should show --chunk-overlap flag"
assert "--chunk-tokens" in result.stdout, "Help should show --chunk-tokens flag"
assert "--chunk-overlap-tokens" in result.stdout, "Help should show --chunk-overlap-tokens flag"
def test_scrape_verbose_flag_works(self):
"""Test that --verbose flag (previously missing) now works."""

View File

@@ -40,8 +40,8 @@ class TestUniversalArguments:
"verbose",
"quiet",
"chunk_for_rag",
"chunk_size",
"chunk_overlap", # Phase 2: RAG args from common.py
"chunk_tokens",
"chunk_overlap_tokens", # Phase 2: RAG args from common.py
"preset",
"config",
# Phase 2: Workflow arguments (universal workflow support)