diff --git a/src/skill_seekers/cli/arguments/common.py b/src/skill_seekers/cli/arguments/common.py index b1ef0af..177f7a8 100644 --- a/src/skill_seekers/cli/arguments/common.py +++ b/src/skill_seekers/cli/arguments/common.py @@ -68,6 +68,37 @@ COMMON_ARGUMENTS: Dict[str, Dict[str, Any]] = { } +# RAG (Retrieval-Augmented Generation) arguments +# These are shared across commands that support RAG chunking +RAG_ARGUMENTS: Dict[str, Dict[str, Any]] = { + "chunk_for_rag": { + "flags": ("--chunk-for-rag",), + "kwargs": { + "action": "store_true", + "help": "Enable semantic chunking for RAG pipelines", + }, + }, + "chunk_size": { + "flags": ("--chunk-size",), + "kwargs": { + "type": int, + "default": 512, + "metavar": "TOKENS", + "help": "Chunk size in tokens for RAG (default: 512)", + }, + }, + "chunk_overlap": { + "flags": ("--chunk-overlap",), + "kwargs": { + "type": int, + "default": 50, + "metavar": "TOKENS", + "help": "Overlap between chunks in tokens (default: 50)", + }, + }, +} + + def add_common_arguments(parser: argparse.ArgumentParser) -> None: """Add common arguments to a parser. @@ -89,13 +120,41 @@ def add_common_arguments(parser: argparse.ArgumentParser) -> None: def get_common_argument_names() -> set: """Get the set of common argument destination names. - + Returns: Set of argument dest names (e.g., {'config', 'name', 'description', ...}) """ return set(COMMON_ARGUMENTS.keys()) +def add_rag_arguments(parser: argparse.ArgumentParser) -> None: + """Add RAG (Retrieval-Augmented Generation) arguments to a parser. + + These arguments enable semantic chunking for RAG pipelines. + + Args: + parser: The ArgumentParser to add arguments to + + Example: + >>> parser = argparse.ArgumentParser() + >>> add_rag_arguments(parser) + >>> # Now parser has --chunk-for-rag, --chunk-size, --chunk-overlap + """ + for arg_name, arg_def in RAG_ARGUMENTS.items(): + flags = arg_def["flags"] + kwargs = arg_def["kwargs"] + parser.add_argument(*flags, **kwargs) + + +def get_rag_argument_names() -> set: + """Get the set of RAG argument destination names. + + Returns: + Set of argument dest names (e.g., {'chunk_for_rag', 'chunk_size', 'chunk_overlap'}) + """ + return set(RAG_ARGUMENTS.keys()) + + def get_argument_help(arg_name: str) -> str: """Get the help text for a common argument. diff --git a/src/skill_seekers/cli/arguments/create.py b/src/skill_seekers/cli/arguments/create.py index a2c4762..4408729 100644 --- a/src/skill_seekers/cli/arguments/create.py +++ b/src/skill_seekers/cli/arguments/create.py @@ -13,6 +13,7 @@ import argparse from typing import Dict, Any, Set, List from skill_seekers.cli.constants import DEFAULT_RATE_LIMIT +from .common import RAG_ARGUMENTS # ============================================================================= @@ -91,32 +92,8 @@ UNIVERSAL_ARGUMENTS: Dict[str, Dict[str, Any]] = { "help": "Minimize output (WARNING level only)", }, }, - # RAG features (NEW - universal for all sources!) - "chunk_for_rag": { - "flags": ("--chunk-for-rag",), - "kwargs": { - "action": "store_true", - "help": "Enable semantic chunking for RAG pipelines (all sources)", - }, - }, - "chunk_size": { - "flags": ("--chunk-size",), - "kwargs": { - "type": int, - "default": 512, - "metavar": "TOKENS", - "help": "Chunk size in tokens for RAG (default: 512)", - }, - }, - "chunk_overlap": { - "flags": ("--chunk-overlap",), - "kwargs": { - "type": int, - "default": 50, - "metavar": "TOKENS", - "help": "Overlap between chunks in tokens (default: 50)", - }, - }, + # RAG features (imported from common.py - see RAG_ARGUMENTS) + # Note: RAG arguments are merged into UNIVERSAL_ARGUMENTS at runtime # Preset system "preset": { "flags": ("--preset",), @@ -138,6 +115,9 @@ UNIVERSAL_ARGUMENTS: Dict[str, Dict[str, Any]] = { }, } +# Merge RAG arguments from common.py into universal arguments +UNIVERSAL_ARGUMENTS.update(RAG_ARGUMENTS) + # ============================================================================= # TIER 2: SOURCE-SPECIFIC ARGUMENTS diff --git a/src/skill_seekers/cli/arguments/scrape.py b/src/skill_seekers/cli/arguments/scrape.py index a973af3..963d925 100644 --- a/src/skill_seekers/cli/arguments/scrape.py +++ b/src/skill_seekers/cli/arguments/scrape.py @@ -11,6 +11,7 @@ import argparse from typing import Dict, Any from skill_seekers.cli.constants import DEFAULT_RATE_LIMIT +from .common import RAG_ARGUMENTS # Scrape-specific argument definitions as data structure @@ -177,32 +178,8 @@ SCRAPE_ARGUMENTS: Dict[str, Dict[str, Any]] = { "help": "Minimize output (WARNING level logging only)", }, }, - # RAG chunking options (v2.10.0) - "chunk_for_rag": { - "flags": ("--chunk-for-rag",), - "kwargs": { - "action": "store_true", - "help": "Enable semantic chunking for RAG pipelines (generates rag_chunks.json)", - }, - }, - "chunk_size": { - "flags": ("--chunk-size",), - "kwargs": { - "type": int, - "default": 512, - "metavar": "TOKENS", - "help": "Target chunk size in tokens for RAG (default: 512)", - }, - }, - "chunk_overlap": { - "flags": ("--chunk-overlap",), - "kwargs": { - "type": int, - "default": 50, - "metavar": "TOKENS", - "help": "Overlap size between chunks in tokens (default: 50)", - }, - }, + # RAG chunking options (imported from common.py - see RAG_ARGUMENTS) + # Note: RAG arguments will be merged at runtime "no_preserve_code_blocks": { "flags": ("--no-preserve-code-blocks",), "kwargs": { @@ -219,6 +196,9 @@ SCRAPE_ARGUMENTS: Dict[str, Dict[str, Any]] = { }, } +# Merge RAG arguments from common.py +SCRAPE_ARGUMENTS.update(RAG_ARGUMENTS) + def add_scrape_arguments(parser: argparse.ArgumentParser) -> None: """Add all scrape command arguments to a parser. diff --git a/tests/test_create_arguments.py b/tests/test_create_arguments.py index b874279..affbc31 100644 --- a/tests/test_create_arguments.py +++ b/tests/test_create_arguments.py @@ -25,16 +25,16 @@ class TestUniversalArguments: """Test universal argument definitions.""" def test_universal_count(self): - """Should have exactly 15 universal arguments.""" - assert len(UNIVERSAL_ARGUMENTS) == 15 + """Should have exactly 13 universal arguments (after Phase 1 consolidation).""" + assert len(UNIVERSAL_ARGUMENTS) == 13 def test_universal_argument_names(self): """Universal arguments should have expected names.""" expected_names = { 'name', 'description', 'output', - 'enhance', 'enhance_local', 'enhance_level', 'api_key', + 'enhance_level', 'api_key', # Phase 1: consolidated from enhance + enhance_local 'dry_run', 'verbose', 'quiet', - 'chunk_for_rag', 'chunk_size', 'chunk_overlap', + 'chunk_for_rag', 'chunk_size', 'chunk_overlap', # Phase 2: RAG args from common.py 'preset', 'config' } assert set(UNIVERSAL_ARGUMENTS.keys()) == expected_names @@ -114,9 +114,9 @@ class TestArgumentHelpers: """Should return set of universal argument names.""" names = get_universal_argument_names() assert isinstance(names, set) - assert len(names) == 15 + assert len(names) == 13 assert 'name' in names - assert 'enhance' in names + assert 'enhance_level' in names # Phase 1: consolidated flag def test_get_source_specific_web(self): """Should return web-specific arguments.""" @@ -158,7 +158,7 @@ class TestCompatibleArguments: # Should include universal arguments assert 'name' in compatible - assert 'enhance' in compatible + assert 'enhance_level' in compatible # Phase 1: consolidated flag # Should include web-specific arguments assert 'max_pages' in compatible @@ -232,7 +232,7 @@ class TestAddCreateArguments: # Should have universal arguments assert 'name' in args - assert 'enhance' in args + assert 'enhance_level' in args assert 'chunk_for_rag' in args # Should not have source-specific arguments (they're not added in default mode) @@ -351,7 +351,7 @@ class TestArgumentQuality: } boolean_args = [ - 'enhance', 'enhance_local', 'dry_run', 'verbose', 'quiet', + 'dry_run', 'verbose', 'quiet', 'chunk_for_rag', 'skip_scrape', 'resume', 'fresh', 'async_mode', 'no_issues', 'no_changelog', 'no_releases', 'scrape_only', 'skip_patterns', 'skip_test_examples', 'ocr', 'no_rate_limit'