diff --git a/src/skill_seekers/cli/arguments/common.py b/src/skill_seekers/cli/arguments/common.py index 7fbdb36..309f993 100644 --- a/src/skill_seekers/cli/arguments/common.py +++ b/src/skill_seekers/cli/arguments/common.py @@ -101,8 +101,8 @@ RAG_ARGUMENTS: dict[str, dict[str, Any]] = { "help": "Enable semantic chunking for RAG pipelines", }, }, - "chunk_size": { - "flags": ("--chunk-size",), + "chunk_tokens": { + "flags": ("--chunk-tokens",), "kwargs": { "type": int, "default": 512, @@ -110,8 +110,8 @@ RAG_ARGUMENTS: dict[str, dict[str, Any]] = { "help": "Chunk size in tokens for RAG (default: 512)", }, }, - "chunk_overlap": { - "flags": ("--chunk-overlap",), + "chunk_overlap_tokens": { + "flags": ("--chunk-overlap-tokens",), "kwargs": { "type": int, "default": 50, @@ -183,7 +183,7 @@ def add_rag_arguments(parser: argparse.ArgumentParser) -> None: Example: >>> parser = argparse.ArgumentParser() >>> add_rag_arguments(parser) - >>> # Now parser has --chunk-for-rag, --chunk-size, --chunk-overlap + >>> # Now parser has --chunk-for-rag, --chunk-tokens, --chunk-overlap-tokens """ for arg_name, arg_def in RAG_ARGUMENTS.items(): flags = arg_def["flags"] @@ -195,7 +195,7 @@ def get_rag_argument_names() -> set: """Get the set of RAG argument destination names. Returns: - Set of argument dest names (e.g., {'chunk_for_rag', 'chunk_size', 'chunk_overlap'}) + Set of argument dest names (e.g., {'chunk_for_rag', 'chunk_tokens', 'chunk_overlap_tokens'}) """ return set(RAG_ARGUMENTS.keys()) diff --git a/src/skill_seekers/cli/arguments/package.py b/src/skill_seekers/cli/arguments/package.py index d51b3be..6b1387e 100644 --- a/src/skill_seekers/cli/arguments/package.py +++ b/src/skill_seekers/cli/arguments/package.py @@ -70,8 +70,8 @@ PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = { "help": "Use streaming ingestion for large docs (memory-efficient)", }, }, - "streaming_chunk_size": { - "flags": ("--streaming-chunk-size",), + "streaming_chunk_chars": { + "flags": ("--streaming-chunk-chars",), "kwargs": { "type": int, "default": 4000, @@ -79,8 +79,8 @@ PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = { "metavar": "N", }, }, - "streaming_overlap": { - "flags": ("--streaming-overlap",), + "streaming_overlap_chars": { + "flags": ("--streaming-overlap-chars",), "kwargs": { "type": int, "default": 200, @@ -98,8 +98,8 @@ PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = { }, }, # RAG chunking options - "chunk": { - "flags": ("--chunk",), + "chunk_for_rag": { + "flags": ("--chunk-for-rag",), "kwargs": { "action": "store_true", "help": "Enable intelligent chunking for RAG platforms (auto-enabled for RAG adaptors)", diff --git a/src/skill_seekers/cli/create_command.py b/src/skill_seekers/cli/create_command.py index 0dd69f4..8ef8475 100644 --- a/src/skill_seekers/cli/create_command.py +++ b/src/skill_seekers/cli/create_command.py @@ -106,8 +106,8 @@ class CreateCommand: # Check against common defaults defaults = { "max_issues": 100, - "chunk_size": 512, - "chunk_overlap": 50, + "chunk_tokens": 512, + "chunk_overlap_tokens": 50, "output": None, } @@ -158,10 +158,10 @@ class CreateCommand: # RAG arguments (web scraper only) if getattr(self.args, "chunk_for_rag", False): argv.append("--chunk-for-rag") - if getattr(self.args, "chunk_size", None) and self.args.chunk_size != 512: - argv.extend(["--chunk-size", str(self.args.chunk_size)]) - if getattr(self.args, "chunk_overlap", None) and self.args.chunk_overlap != 50: - argv.extend(["--chunk-overlap", str(self.args.chunk_overlap)]) + if getattr(self.args, "chunk_tokens", None) and self.args.chunk_tokens != 512: + argv.extend(["--chunk-tokens", str(self.args.chunk_tokens)]) + if getattr(self.args, "chunk_overlap_tokens", None) and self.args.chunk_overlap_tokens != 50: + argv.extend(["--chunk-overlap-tokens", str(self.args.chunk_overlap_tokens)]) # Advanced web-specific arguments if getattr(self.args, "no_preserve_code_blocks", False): diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py index 2595c30..72c06df 100755 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -2227,8 +2227,8 @@ def execute_scraping_and_building( from skill_seekers.cli.rag_chunker import RAGChunker chunker = RAGChunker( - chunk_size=args.chunk_size, - chunk_overlap=args.chunk_overlap, + chunk_size=args.chunk_tokens, + chunk_overlap=args.chunk_overlap_tokens, preserve_code_blocks=not args.no_preserve_code_blocks, preserve_paragraphs=not args.no_preserve_paragraphs, ) diff --git a/src/skill_seekers/cli/package_skill.py b/src/skill_seekers/cli/package_skill.py index 00ee167..c8ebe4a 100644 --- a/src/skill_seekers/cli/package_skill.py +++ b/src/skill_seekers/cli/package_skill.py @@ -221,10 +221,10 @@ Examples: skip_quality_check=args.skip_quality_check, target=args.target, streaming=args.streaming, - chunk_size=args.streaming_chunk_size, - chunk_overlap=args.streaming_overlap, + chunk_size=args.streaming_chunk_chars, + chunk_overlap=args.streaming_overlap_chars, batch_size=args.batch_size, - enable_chunking=args.chunk, + enable_chunking=args.chunk_for_rag, chunk_max_tokens=args.chunk_tokens, preserve_code_blocks=not args.no_preserve_code, ) diff --git a/src/skill_seekers/cli/parsers/stream_parser.py b/src/skill_seekers/cli/parsers/stream_parser.py index 6ee513a..52823e6 100644 --- a/src/skill_seekers/cli/parsers/stream_parser.py +++ b/src/skill_seekers/cli/parsers/stream_parser.py @@ -21,5 +21,5 @@ class StreamParser(SubcommandParser): def add_arguments(self, parser): """Add stream-specific arguments.""" parser.add_argument("input_file", help="Large file to stream") - parser.add_argument("--chunk-size", type=int, default=1024, help="Chunk size in KB") + parser.add_argument("--streaming-chunk-chars", type=int, default=4000, help="Maximum characters per chunk (default: 4000)") parser.add_argument("--output", help="Output directory") diff --git a/src/skill_seekers/cli/pdf_extractor_poc.py b/src/skill_seekers/cli/pdf_extractor_poc.py index 58c62a1..b39a82e 100755 --- a/src/skill_seekers/cli/pdf_extractor_poc.py +++ b/src/skill_seekers/cli/pdf_extractor_poc.py @@ -44,7 +44,7 @@ Usage: Example: python3 pdf_extractor_poc.py docs/manual.pdf -o output.json -v \ - --chunk-size 15 --min-quality 6.0 --extract-images \ + --pdf-pages-per-chunk 15 --min-quality 6.0 --extract-images \ --extract-tables --parallel """ @@ -1079,7 +1079,7 @@ Examples: parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output") parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON output") parser.add_argument( - "--chunk-size", type=int, default=10, help="Pages per chunk (0 = no chunking, default: 10)" + "--pdf-pages-per-chunk", type=int, default=10, help="Pages per chunk (0 = no chunking, default: 10)" ) parser.add_argument( "--no-merge", action="store_true", help="Disable merging code blocks across pages" @@ -1138,7 +1138,7 @@ Examples: extractor = PDFExtractor( args.pdf_file, verbose=args.verbose, - chunk_size=args.chunk_size, + chunk_size=args.pdf_pages_per_chunk, min_quality=args.min_quality, extract_images=args.extract_images, image_dir=args.image_dir, diff --git a/src/skill_seekers/cli/rag_chunker.py b/src/skill_seekers/cli/rag_chunker.py index 23f8340..dcbf782 100644 --- a/src/skill_seekers/cli/rag_chunker.py +++ b/src/skill_seekers/cli/rag_chunker.py @@ -383,8 +383,8 @@ def main(): ) parser.add_argument("skill_dir", type=Path, help="Path to skill directory") parser.add_argument("--output", "-o", type=Path, help="Output JSON file") - parser.add_argument("--chunk-size", type=int, default=512, help="Target chunk size in tokens") - parser.add_argument("--chunk-overlap", type=int, default=50, help="Overlap size in tokens") + parser.add_argument("--chunk-tokens", type=int, default=512, help="Target chunk size in tokens") + parser.add_argument("--chunk-overlap-tokens", type=int, default=50, help="Overlap size in tokens") parser.add_argument("--no-code-blocks", action="store_true", help="Don't preserve code blocks") parser.add_argument("--no-paragraphs", action="store_true", help="Don't preserve paragraphs") @@ -392,8 +392,8 @@ def main(): # Create chunker chunker = RAGChunker( - chunk_size=args.chunk_size, - chunk_overlap=args.chunk_overlap, + chunk_size=args.chunk_tokens, + chunk_overlap=args.chunk_overlap_tokens, preserve_code_blocks=not args.no_code_blocks, preserve_paragraphs=not args.no_paragraphs, ) diff --git a/src/skill_seekers/cli/streaming_ingest.py b/src/skill_seekers/cli/streaming_ingest.py index 9720d35..ab81800 100644 --- a/src/skill_seekers/cli/streaming_ingest.py +++ b/src/skill_seekers/cli/streaming_ingest.py @@ -380,9 +380,9 @@ def main(): parser = argparse.ArgumentParser(description="Stream and chunk skill documents") parser.add_argument("input", help="Input file or directory path") - parser.add_argument("--chunk-size", type=int, default=4000, help="Chunk size in characters") + parser.add_argument("--streaming-chunk-chars", type=int, default=4000, help="Chunk size in characters") parser.add_argument( - "--chunk-overlap", type=int, default=200, help="Chunk overlap in characters" + "--streaming-overlap-chars", type=int, default=200, help="Chunk overlap in characters" ) parser.add_argument("--batch-size", type=int, default=100, help="Batch size for processing") parser.add_argument("--checkpoint", help="Checkpoint file path") @@ -390,7 +390,7 @@ def main(): # Initialize ingester ingester = StreamingIngester( - chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap, batch_size=args.batch_size + chunk_size=args.streaming_chunk_chars, chunk_overlap=args.streaming_overlap_chars, batch_size=args.batch_size ) # Progress callback diff --git a/tests/test_chunking_integration.py b/tests/test_chunking_integration.py index 9a19128..42a1c1b 100644 --- a/tests/test_chunking_integration.py +++ b/tests/test_chunking_integration.py @@ -291,7 +291,7 @@ class TestChunkingCLIIntegration: """Test chunking via CLI arguments.""" def test_chunk_flag(self, tmp_path): - """Test --chunk flag enables chunking.""" + """Test --chunk-for-rag flag enables chunking.""" from skill_seekers.cli.package_skill import package_skill skill_dir = create_test_skill(tmp_path, large_doc=True) @@ -301,7 +301,7 @@ class TestChunkingCLIIntegration: open_folder_after=False, skip_quality_check=True, target="langchain", - enable_chunking=True, # --chunk flag + enable_chunking=True, # --chunk-for-rag flag chunk_max_tokens=512, preserve_code_blocks=True, ) diff --git a/tests/test_cli_refactor_e2e.py b/tests/test_cli_refactor_e2e.py index e2d937b..082a8f0 100644 --- a/tests/test_cli_refactor_e2e.py +++ b/tests/test_cli_refactor_e2e.py @@ -32,8 +32,8 @@ class TestParserSync: ["skill-seekers", "scrape", "--help"], capture_output=True, text=True ) assert "--chunk-for-rag" in result.stdout, "Help should show --chunk-for-rag flag" - assert "--chunk-size" in result.stdout, "Help should show --chunk-size flag" - assert "--chunk-overlap" in result.stdout, "Help should show --chunk-overlap flag" + assert "--chunk-tokens" in result.stdout, "Help should show --chunk-tokens flag" + assert "--chunk-overlap-tokens" in result.stdout, "Help should show --chunk-overlap-tokens flag" def test_scrape_verbose_flag_works(self): """Test that --verbose flag (previously missing) now works.""" diff --git a/tests/test_create_arguments.py b/tests/test_create_arguments.py index 348d9d1..b297721 100644 --- a/tests/test_create_arguments.py +++ b/tests/test_create_arguments.py @@ -40,8 +40,8 @@ class TestUniversalArguments: "verbose", "quiet", "chunk_for_rag", - "chunk_size", - "chunk_overlap", # Phase 2: RAG args from common.py + "chunk_tokens", + "chunk_overlap_tokens", # Phase 2: RAG args from common.py "preset", "config", # Phase 2: Workflow arguments (universal workflow support)