refactor: rename all chunk flags to include explicit units
Replace ambiguous --chunk-size / --chunk-overlap names, which meant different things in different contexts (tokens vs. characters), with fully explicit names:

- --chunk-size (RAG tokens) → --chunk-tokens
- --chunk-overlap (RAG tokens) → --chunk-overlap-tokens
- --chunk (enable RAG chunking) → --chunk-for-rag
- --streaming-chunk-size (chars) → --streaming-chunk-chars
- --streaming-overlap (chars) → --streaming-overlap-chars
- --chunk-size (PDF pages) → --pdf-pages-per-chunk (poc file)

Also aligns stream_parser.py help with streaming_ingest.py standalone parser.

All 2167 tests pass.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -106,8 +106,8 @@ class CreateCommand:
|
||||
# Check against common defaults
|
||||
defaults = {
|
||||
"max_issues": 100,
|
||||
"chunk_size": 512,
|
||||
"chunk_overlap": 50,
|
||||
"chunk_tokens": 512,
|
||||
"chunk_overlap_tokens": 50,
|
||||
"output": None,
|
||||
}
|
||||
|
||||
@@ -158,10 +158,10 @@ class CreateCommand:
|
||||
# RAG arguments (web scraper only)
|
||||
if getattr(self.args, "chunk_for_rag", False):
|
||||
argv.append("--chunk-for-rag")
|
||||
if getattr(self.args, "chunk_size", None) and self.args.chunk_size != 512:
|
||||
argv.extend(["--chunk-size", str(self.args.chunk_size)])
|
||||
if getattr(self.args, "chunk_overlap", None) and self.args.chunk_overlap != 50:
|
||||
argv.extend(["--chunk-overlap", str(self.args.chunk_overlap)])
|
||||
if getattr(self.args, "chunk_tokens", None) and self.args.chunk_tokens != 512:
|
||||
argv.extend(["--chunk-tokens", str(self.args.chunk_tokens)])
|
||||
if getattr(self.args, "chunk_overlap_tokens", None) and self.args.chunk_overlap_tokens != 50:
|
||||
argv.extend(["--chunk-overlap-tokens", str(self.args.chunk_overlap_tokens)])
|
||||
|
||||
# Advanced web-specific arguments
|
||||
if getattr(self.args, "no_preserve_code_blocks", False):
|
||||
|
||||
Reference in New Issue
Block a user