feat: add workflow support to unified_scraper (fixes gap #1)

unified_scraper.py was the only scraper missing --enhance-workflow,
--enhance-stage, --var, and --workflow-dry-run support. All other
scrapers (doc_scraper, github_scraper, pdf_scraper, codebase_scraper)
already called run_workflows() after building the skill.

Changes:
- arguments/unified.py: add 4 workflow args to UNIFIED_ARGUMENTS so
  the unified CLI subparser picks them up automatically
- unified_scraper.py main(): register the same 4 workflow args in the
  standalone parser
- unified_scraper.py run(): accept optional `args` parameter and call
  run_workflows() after build_skill(), passing unified context
  (name + description) consistent with doc_scraper pattern

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-21 23:36:58 +03:00
parent 741daf1c68
commit 4b70c5a860
2 changed files with 78 additions and 3 deletions

View File

@@ -40,6 +40,38 @@ UNIFIED_ARGUMENTS: dict[str, dict[str, Any]] = {
"help": "Dry run mode",
},
},
# Enhancement Workflow arguments (mirrors scrape/github/pdf/codebase scrapers)
"enhance_workflow": {
"flags": ("--enhance-workflow",),
"kwargs": {
"action": "append",
"help": "Apply enhancement workflow (file path or preset: security-focus, minimal, api-documentation, architecture-comprehensive). Can use multiple times to chain workflows.",
"metavar": "WORKFLOW",
},
},
"enhance_stage": {
"flags": ("--enhance-stage",),
"kwargs": {
"action": "append",
"help": "Add inline enhancement stage (format: 'name:prompt'). Can be used multiple times.",
"metavar": "STAGE",
},
},
"var": {
"flags": ("--var",),
"kwargs": {
"action": "append",
"help": "Override workflow variable (format: 'key=value'). Can be used multiple times.",
"metavar": "VAR",
},
},
"workflow_dry_run": {
"flags": ("--workflow-dry-run",),
"kwargs": {
"action": "store_true",
"help": "Preview workflow stages without executing (requires --enhance-workflow)",
},
},
}

View File

@@ -943,9 +943,14 @@ class UnifiedScraper:
logger.info(f"✅ Unified skill built: {self.output_dir}/")
def run(self):
def run(self, args=None):
"""
Execute complete unified scraping workflow.
Args:
args: Optional parsed CLI arguments for workflow integration.
When provided, enhancement workflows (--enhance-workflow,
--enhance-stage) are executed after the skill is built.
"""
logger.info("\n" + "🚀 " * 20)
logger.info(f"Unified Scraper: {self.config['name']}")
@@ -966,6 +971,16 @@ class UnifiedScraper:
# Phase 4: Build skill
self.build_skill(merged_data)
# Phase 5: Enhancement Workflow Integration
if args is not None:
from skill_seekers.cli.workflow_runner import run_workflows
unified_context = {
"name": self.config.get("name", ""),
"description": self.config.get("description", ""),
}
run_workflows(args, context=unified_context)
logger.info("\n" + "✅ " * 20)
logger.info("Unified scraping complete!")
logger.info("✅ " * 20 + "\n")
@@ -1024,6 +1039,34 @@ Examples:
action="store_true",
help="Preview what will be scraped without actually scraping",
)
# Enhancement Workflow arguments (mirrors scrape/github/pdf/codebase scrapers)
parser.add_argument(
"--enhance-workflow",
action="append",
dest="enhance_workflow",
help="Apply enhancement workflow (file path or preset). Can use multiple times to chain workflows.",
metavar="WORKFLOW",
)
parser.add_argument(
"--enhance-stage",
action="append",
dest="enhance_stage",
help="Add inline enhancement stage (format: 'name:prompt'). Can be used multiple times.",
metavar="STAGE",
)
parser.add_argument(
"--var",
action="append",
dest="var",
help="Override workflow variable (format: 'key=value'). Can be used multiple times.",
metavar="VAR",
)
parser.add_argument(
"--workflow-dry-run",
action="store_true",
dest="workflow_dry_run",
help="Preview workflow stages without executing (requires --enhance-workflow)",
)
args = parser.parse_args()
@@ -1068,8 +1111,8 @@ Examples:
logger.info(f"Merge mode: {scraper.merge_mode}")
return
# Run scraper
scraper.run()
# Run scraper (pass args for workflow integration)
scraper.run(args=args)
if __name__ == "__main__":