This commit is contained in:
Pablo Estevez
2026-01-17 17:29:21 +00:00
parent c89f059712
commit 5ed767ff9a
144 changed files with 14142 additions and 16488 deletions

View File

@@ -12,31 +12,28 @@ Usage:
skill-seekers unified --config configs/react_unified.json --merge-mode claude-enhanced
"""
import os
import sys
import argparse
import json
import logging
import argparse
import subprocess
import os
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Dict, List, Any, Optional
from typing import Any
# Import validators and scrapers
try:
from skill_seekers.cli.config_validator import ConfigValidator, validate_config
from skill_seekers.cli.conflict_detector import ConflictDetector
from skill_seekers.cli.merge_sources import RuleBasedMerger, ClaudeEnhancedMerger
from skill_seekers.cli.merge_sources import ClaudeEnhancedMerger, RuleBasedMerger
from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
except ImportError as e:
print(f"Error importing modules: {e}")
print("Make sure you're running from the project root directory")
sys.exit(1)
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
@@ -52,7 +49,7 @@ class UnifiedScraper:
5. Build unified skill
"""
def __init__(self, config_path: str, merge_mode: Optional[str] = None):
def __init__(self, config_path: str, merge_mode: str | None = None):
"""
Initialize unified scraper.
@@ -68,21 +65,21 @@ class UnifiedScraper:
self.config = self.validator.config
# Determine merge mode
self.merge_mode = merge_mode or self.config.get('merge_mode', 'rule-based')
self.merge_mode = merge_mode or self.config.get("merge_mode", "rule-based")
logger.info(f"Merge mode: {self.merge_mode}")
# Storage for scraped data - use lists to support multiple sources of same type
self.scraped_data = {
'documentation': [], # List of doc sources
'github': [], # List of github sources
'pdf': [] # List of pdf sources
"documentation": [], # List of doc sources
"github": [], # List of github sources
"pdf": [], # List of pdf sources
}
# Track source index for unique naming (multi-source support)
self._source_counters = {'documentation': 0, 'github': 0, 'pdf': 0}
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0}
# Output paths - cleaner organization
self.name = self.config['name']
self.name = self.config["name"]
self.output_dir = f"output/{self.name}" # Final skill only
# Use hidden cache directory for intermediate files
@@ -107,17 +104,16 @@ class UnifiedScraper:
from datetime import datetime
# Create log filename with timestamp
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_file = f"{self.logs_dir}/unified_{timestamp}.log"
# Add file handler to root logger
file_handler = logging.FileHandler(log_file, encoding='utf-8')
file_handler = logging.FileHandler(log_file, encoding="utf-8")
file_handler.setLevel(logging.DEBUG)
# Create formatter
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
"%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)
file_handler.setFormatter(formatter)
@@ -141,18 +137,18 @@ class UnifiedScraper:
logger.warning("Config is not unified format, converting...")
self.config = self.validator.convert_legacy_to_unified()
sources = self.config.get('sources', [])
sources = self.config.get("sources", [])
for i, source in enumerate(sources):
source_type = source['type']
logger.info(f"\n[{i+1}/{len(sources)}] Scraping {source_type} source...")
source_type = source["type"]
logger.info(f"\n[{i + 1}/{len(sources)}] Scraping {source_type} source...")
try:
if source_type == 'documentation':
if source_type == "documentation":
self._scrape_documentation(source)
elif source_type == 'github':
elif source_type == "github":
self._scrape_github(source)
elif source_type == 'pdf':
elif source_type == "pdf":
self._scrape_pdf(source)
else:
logger.warning(f"Unknown source type: {source_type}")
@@ -162,40 +158,40 @@ class UnifiedScraper:
logger.info(f"\n✅ Scraped {len(self.scraped_data)} sources successfully")
def _scrape_documentation(self, source: Dict[str, Any]):
def _scrape_documentation(self, source: dict[str, Any]):
"""Scrape documentation website."""
# Create temporary config for doc scraper
doc_config = {
'name': f"{self.name}_docs",
'base_url': source['base_url'],
'selectors': source.get('selectors', {}),
'url_patterns': source.get('url_patterns', {}),
'categories': source.get('categories', {}),
'rate_limit': source.get('rate_limit', 0.5),
'max_pages': source.get('max_pages', 100)
"name": f"{self.name}_docs",
"base_url": source["base_url"],
"selectors": source.get("selectors", {}),
"url_patterns": source.get("url_patterns", {}),
"categories": source.get("categories", {}),
"rate_limit": source.get("rate_limit", 0.5),
"max_pages": source.get("max_pages", 100),
}
# Pass through llms.txt settings (so unified configs behave the same as doc_scraper configs)
if 'llms_txt_url' in source:
doc_config['llms_txt_url'] = source.get('llms_txt_url')
if "llms_txt_url" in source:
doc_config["llms_txt_url"] = source.get("llms_txt_url")
if 'skip_llms_txt' in source:
doc_config['skip_llms_txt'] = source.get('skip_llms_txt')
if "skip_llms_txt" in source:
doc_config["skip_llms_txt"] = source.get("skip_llms_txt")
# Optional: support overriding start URLs
if 'start_urls' in source:
doc_config['start_urls'] = source.get('start_urls')
if "start_urls" in source:
doc_config["start_urls"] = source.get("start_urls")
# Write temporary config
temp_config_path = os.path.join(self.data_dir, 'temp_docs_config.json')
with open(temp_config_path, 'w', encoding='utf-8') as f:
temp_config_path = os.path.join(self.data_dir, "temp_docs_config.json")
with open(temp_config_path, "w", encoding="utf-8") as f:
json.dump(doc_config, f, indent=2)
# Run doc_scraper as subprocess
logger.info(f"Scraping documentation from {source['base_url']}")
doc_scraper_path = Path(__file__).parent / "doc_scraper.py"
cmd = [sys.executable, str(doc_scraper_path), '--config', temp_config_path, '--fresh']
cmd = [sys.executable, str(doc_scraper_path), "--config", temp_config_path, "--fresh"]
result = subprocess.run(cmd, capture_output=True, text=True, stdin=subprocess.DEVNULL)
@@ -213,18 +209,20 @@ class UnifiedScraper:
docs_data_file = f"output/{doc_config['name']}_data/summary.json"
if os.path.exists(docs_data_file):
with open(docs_data_file, 'r', encoding='utf-8') as f:
with open(docs_data_file, encoding="utf-8") as f:
summary = json.load(f)
# Append to documentation list (multi-source support)
self.scraped_data['documentation'].append({
'source_id': doc_config['name'],
'base_url': source['base_url'],
'pages': summary.get('pages', []),
'total_pages': summary.get('total_pages', 0),
'data_file': docs_data_file,
'refs_dir': '' # Will be set after moving to cache
})
self.scraped_data["documentation"].append(
{
"source_id": doc_config["name"],
"base_url": source["base_url"],
"pages": summary.get("pages", []),
"total_pages": summary.get("total_pages", 0),
"data_file": docs_data_file,
"refs_dir": "", # Will be set after moving to cache
}
)
logger.info(f"✅ Documentation: {summary.get('total_pages', 0)} pages scraped")
else:
@@ -246,9 +244,9 @@ class UnifiedScraper:
logger.info(f"📦 Moved docs output to cache: {cache_docs_dir}")
# Update refs_dir in scraped_data with cache location
refs_dir_path = os.path.join(cache_docs_dir, 'references')
if self.scraped_data['documentation']:
self.scraped_data['documentation'][-1]['refs_dir'] = refs_dir_path
refs_dir_path = os.path.join(cache_docs_dir, "references")
if self.scraped_data["documentation"]:
self.scraped_data["documentation"][-1]["refs_dir"] = refs_dir_path
if os.path.exists(docs_data_dir):
cache_data_dir = os.path.join(self.data_dir, f"{doc_config['name']}_data")
@@ -257,7 +255,7 @@ class UnifiedScraper:
shutil.move(docs_data_dir, cache_data_dir)
logger.info(f"📦 Moved docs data to cache: {cache_data_dir}")
def _clone_github_repo(self, repo_name: str, idx: int = 0) -> Optional[str]:
def _clone_github_repo(self, repo_name: str, idx: int = 0) -> str | None:
"""
Clone GitHub repository to cache directory for C3.x analysis.
Reuses existing clone if already present.
@@ -274,9 +272,9 @@ class UnifiedScraper:
clone_path = os.path.join(self.repos_dir, repo_dir_name)
# Check if already cloned
if os.path.exists(clone_path) and os.path.isdir(os.path.join(clone_path, '.git')):
if os.path.exists(clone_path) and os.path.isdir(os.path.join(clone_path, ".git")):
logger.info(f"♻️ Found existing repository clone: {clone_path}")
logger.info(f" Reusing for C3.x analysis (skip re-cloning)")
logger.info(" Reusing for C3.x analysis (skip re-cloning)")
return clone_path
# repos_dir already created in __init__
@@ -285,18 +283,18 @@ class UnifiedScraper:
repo_url = f"https://github.com/{repo_name}.git"
logger.info(f"🔄 Cloning repository for C3.x analysis: {repo_url}")
logger.info(f"{clone_path}")
logger.info(f" 💾 Clone will be saved for future reuse")
logger.info(" 💾 Clone will be saved for future reuse")
try:
result = subprocess.run(
['git', 'clone', repo_url, clone_path],
["git", "clone", repo_url, clone_path],
capture_output=True,
text=True,
timeout=600 # 10 minute timeout for full clone
timeout=600, # 10 minute timeout for full clone
)
if result.returncode == 0:
logger.info(f"✅ Repository cloned successfully")
logger.info("✅ Repository cloned successfully")
logger.info(f" 📁 Saved to: {clone_path}")
return clone_path
else:
@@ -307,7 +305,7 @@ class UnifiedScraper:
return None
except subprocess.TimeoutExpired:
logger.error(f"❌ Git clone timed out after 10 minutes")
logger.error("❌ Git clone timed out after 10 minutes")
if os.path.exists(clone_path):
shutil.rmtree(clone_path)
return None
@@ -317,7 +315,7 @@ class UnifiedScraper:
shutil.rmtree(clone_path)
return None
def _scrape_github(self, source: Dict[str, Any]):
def _scrape_github(self, source: dict[str, Any]):
"""Scrape GitHub repository."""
try:
from skill_seekers.cli.github_scraper import GitHubScraper
@@ -326,16 +324,16 @@ class UnifiedScraper:
return
# Multi-source support: Get unique index for this GitHub source
idx = self._source_counters['github']
self._source_counters['github'] += 1
idx = self._source_counters["github"]
self._source_counters["github"] += 1
# Extract repo identifier for unique naming
repo = source['repo']
repo_id = repo.replace('/', '_')
repo = source["repo"]
repo_id = repo.replace("/", "_")
# Check if we need to clone for C3.x analysis
enable_codebase_analysis = source.get('enable_codebase_analysis', True)
local_repo_path = source.get('local_repo_path')
enable_codebase_analysis = source.get("enable_codebase_analysis", True)
local_repo_path = source.get("local_repo_path")
cloned_repo_path = None
# Auto-clone if C3.x analysis is enabled but no local path provided
@@ -351,24 +349,24 @@ class UnifiedScraper:
# Create config for GitHub scraper
github_config = {
'repo': repo,
'name': f"{self.name}_github_{idx}_{repo_id}",
'github_token': source.get('github_token'),
'include_issues': source.get('include_issues', True),
'max_issues': source.get('max_issues', 100),
'include_changelog': source.get('include_changelog', True),
'include_releases': source.get('include_releases', True),
'include_code': source.get('include_code', True),
'code_analysis_depth': source.get('code_analysis_depth', 'surface'),
'file_patterns': source.get('file_patterns', []),
'local_repo_path': local_repo_path # Use cloned path if available
"repo": repo,
"name": f"{self.name}_github_{idx}_{repo_id}",
"github_token": source.get("github_token"),
"include_issues": source.get("include_issues", True),
"max_issues": source.get("max_issues", 100),
"include_changelog": source.get("include_changelog", True),
"include_releases": source.get("include_releases", True),
"include_code": source.get("include_code", True),
"code_analysis_depth": source.get("code_analysis_depth", "surface"),
"file_patterns": source.get("file_patterns", []),
"local_repo_path": local_repo_path, # Use cloned path if available
}
# Pass directory exclusions if specified (optional)
if 'exclude_dirs' in source:
github_config['exclude_dirs'] = source['exclude_dirs']
if 'exclude_dirs_additional' in source:
github_config['exclude_dirs_additional'] = source['exclude_dirs_additional']
if "exclude_dirs" in source:
github_config["exclude_dirs"] = source["exclude_dirs"]
if "exclude_dirs_additional" in source:
github_config["exclude_dirs_additional"] = source["exclude_dirs_additional"]
# Scrape
logger.info(f"Scraping GitHub repository: {source['repo']}")
@@ -381,13 +379,14 @@ class UnifiedScraper:
try:
c3_data = self._run_c3_analysis(local_repo_path, source)
if c3_data:
github_data['c3_analysis'] = c3_data
github_data["c3_analysis"] = c3_data
logger.info("✅ C3.x analysis complete")
else:
logger.warning("⚠️ C3.x analysis returned no data")
except Exception as e:
logger.warning(f"⚠️ C3.x analysis failed: {e}")
import traceback
logger.debug(f"Traceback: {traceback.format_exc()}")
# Continue without C3.x data - graceful degradation
@@ -396,32 +395,29 @@ class UnifiedScraper:
logger.info(f"📁 Repository clone saved for future use: {cloned_repo_path}")
# Save data to unified location with unique filename
github_data_file = os.path.join(self.data_dir, f'github_data_{idx}_{repo_id}.json')
with open(github_data_file, 'w', encoding='utf-8') as f:
github_data_file = os.path.join(self.data_dir, f"github_data_{idx}_{repo_id}.json")
with open(github_data_file, "w", encoding="utf-8") as f:
json.dump(github_data, f, indent=2, ensure_ascii=False)
# ALSO save to the location GitHubToSkillConverter expects (with C3.x data!)
converter_data_file = f"output/{github_config['name']}_github_data.json"
with open(converter_data_file, 'w', encoding='utf-8') as f:
with open(converter_data_file, "w", encoding="utf-8") as f:
json.dump(github_data, f, indent=2, ensure_ascii=False)
# Append to list instead of overwriting (multi-source support)
self.scraped_data['github'].append({
'repo': repo,
'repo_id': repo_id,
'idx': idx,
'data': github_data,
'data_file': github_data_file
})
self.scraped_data["github"].append(
{"repo": repo, "repo_id": repo_id, "idx": idx, "data": github_data, "data_file": github_data_file}
)
# Build standalone SKILL.md for synthesis using GitHubToSkillConverter
try:
from skill_seekers.cli.github_scraper import GitHubToSkillConverter
# Use github_config which has the correct name field
# Converter will load from output/{name}_github_data.json which now has C3.x data
converter = GitHubToSkillConverter(config=github_config)
converter.build_skill()
logger.info(f"✅ GitHub: Standalone SKILL.md created")
logger.info("✅ GitHub: Standalone SKILL.md created")
except Exception as e:
logger.warning(f"⚠️ Failed to build standalone GitHub SKILL.md: {e}")
@@ -430,7 +426,7 @@ class UnifiedScraper:
github_data_file_path = f"output/{github_config['name']}_github_data.json"
if os.path.exists(github_output_dir):
cache_github_dir = os.path.join(self.sources_dir, github_config['name'])
cache_github_dir = os.path.join(self.sources_dir, github_config["name"])
if os.path.exists(cache_github_dir):
shutil.rmtree(cache_github_dir)
shutil.move(github_output_dir, cache_github_dir)
@@ -443,9 +439,9 @@ class UnifiedScraper:
shutil.move(github_data_file_path, cache_github_data)
logger.info(f"📦 Moved GitHub data to cache: {cache_github_data}")
logger.info(f"✅ GitHub: Repository scraped successfully")
logger.info("✅ GitHub: Repository scraped successfully")
def _scrape_pdf(self, source: Dict[str, Any]):
def _scrape_pdf(self, source: dict[str, Any]):
"""Scrape PDF document."""
try:
from skill_seekers.cli.pdf_scraper import PDFToSkillConverter
@@ -454,20 +450,20 @@ class UnifiedScraper:
return
# Multi-source support: Get unique index for this PDF source
idx = self._source_counters['pdf']
self._source_counters['pdf'] += 1
idx = self._source_counters["pdf"]
self._source_counters["pdf"] += 1
# Extract PDF identifier for unique naming (filename without extension)
pdf_path = source['path']
pdf_path = source["path"]
pdf_id = os.path.splitext(os.path.basename(pdf_path))[0]
# Create config for PDF scraper
pdf_config = {
'name': f"{self.name}_pdf_{idx}_{pdf_id}",
'pdf': source['path'],
'extract_tables': source.get('extract_tables', False),
'ocr': source.get('ocr', False),
'password': source.get('password')
"name": f"{self.name}_pdf_{idx}_{pdf_id}",
"pdf": source["path"],
"extract_tables": source.get("extract_tables", False),
"ocr": source.get("ocr", False),
"password": source.get("password"),
}
# Scrape
@@ -476,29 +472,25 @@ class UnifiedScraper:
pdf_data = converter.extract_all()
# Save data
pdf_data_file = os.path.join(self.data_dir, f'pdf_data_{idx}_{pdf_id}.json')
with open(pdf_data_file, 'w', encoding='utf-8') as f:
pdf_data_file = os.path.join(self.data_dir, f"pdf_data_{idx}_{pdf_id}.json")
with open(pdf_data_file, "w", encoding="utf-8") as f:
json.dump(pdf_data, f, indent=2, ensure_ascii=False)
# Append to list instead of overwriting
self.scraped_data['pdf'].append({
'pdf_path': pdf_path,
'pdf_id': pdf_id,
'idx': idx,
'data': pdf_data,
'data_file': pdf_data_file
})
self.scraped_data["pdf"].append(
{"pdf_path": pdf_path, "pdf_id": pdf_id, "idx": idx, "data": pdf_data, "data_file": pdf_data_file}
)
# Build standalone SKILL.md for synthesis
try:
converter.build_skill()
logger.info(f"✅ PDF: Standalone SKILL.md created")
logger.info("✅ PDF: Standalone SKILL.md created")
except Exception as e:
logger.warning(f"⚠️ Failed to build standalone PDF SKILL.md: {e}")
logger.info(f"✅ PDF: {len(pdf_data.get('pages', []))} pages extracted")
def _load_json(self, file_path: Path) -> Dict:
def _load_json(self, file_path: Path) -> dict:
"""
Load JSON file safely.
@@ -513,13 +505,13 @@ class UnifiedScraper:
return {}
try:
with open(file_path, 'r', encoding='utf-8') as f:
with open(file_path, encoding="utf-8") as f:
return json.load(f)
except (json.JSONDecodeError, IOError) as e:
except (OSError, json.JSONDecodeError) as e:
logger.warning(f"Failed to load JSON {file_path}: {e}")
return {}
def _load_guide_collection(self, tutorials_dir: Path) -> Dict:
def _load_guide_collection(self, tutorials_dir: Path) -> dict:
"""
Load how-to guide collection from tutorials directory.
@@ -531,22 +523,22 @@ class UnifiedScraper:
"""
if not tutorials_dir.exists():
logger.warning(f"Tutorials directory not found: {tutorials_dir}")
return {'guides': []}
return {"guides": []}
collection_file = tutorials_dir / 'guide_collection.json'
collection_file = tutorials_dir / "guide_collection.json"
if collection_file.exists():
return self._load_json(collection_file)
# Fallback: scan for individual guide JSON files
guides = []
for guide_file in tutorials_dir.glob('guide_*.json'):
for guide_file in tutorials_dir.glob("guide_*.json"):
guide_data = self._load_json(guide_file)
if guide_data:
guides.append(guide_data)
return {'guides': guides, 'total_count': len(guides)}
return {"guides": guides, "total_count": len(guides)}
def _load_api_reference(self, api_dir: Path) -> Dict[str, Any]:
def _load_api_reference(self, api_dir: Path) -> dict[str, Any]:
"""
Load API reference markdown files from api_reference directory.
@@ -561,16 +553,16 @@ class UnifiedScraper:
return {}
api_refs = {}
for md_file in api_dir.glob('*.md'):
for md_file in api_dir.glob("*.md"):
try:
module_name = md_file.stem
api_refs[module_name] = md_file.read_text(encoding='utf-8')
except IOError as e:
api_refs[module_name] = md_file.read_text(encoding="utf-8")
except OSError as e:
logger.warning(f"Failed to read API reference {md_file}: {e}")
return api_refs
def _run_c3_analysis(self, local_repo_path: str, source: Dict[str, Any]) -> Dict[str, Any]:
def _run_c3_analysis(self, local_repo_path: str, source: dict[str, Any]) -> dict[str, Any]:
"""
Run comprehensive C3.x codebase analysis.
@@ -592,7 +584,7 @@ class UnifiedScraper:
return {}
# Create temp output dir for C3.x analysis
temp_output = Path(self.data_dir) / 'c3_analysis_temp'
temp_output = Path(self.data_dir) / "c3_analysis_temp"
temp_output.mkdir(parents=True, exist_ok=True)
logger.info(f" Analyzing codebase: {local_repo_path}")
@@ -602,37 +594,37 @@ class UnifiedScraper:
results = analyze_codebase(
directory=Path(local_repo_path),
output_dir=temp_output,
depth='deep',
depth="deep",
languages=None, # Analyze all languages
file_patterns=source.get('file_patterns'),
build_api_reference=True, # C2.5: API Reference
extract_comments=False, # Not needed
file_patterns=source.get("file_patterns"),
build_api_reference=True, # C2.5: API Reference
extract_comments=False, # Not needed
build_dependency_graph=True, # C2.6: Dependency Graph
detect_patterns=True, # C3.1: Design patterns
extract_test_examples=True, # C3.2: Test examples
build_how_to_guides=True, # C3.3: How-to guides
detect_patterns=True, # C3.1: Design patterns
extract_test_examples=True, # C3.2: Test examples
build_how_to_guides=True, # C3.3: How-to guides
extract_config_patterns=True, # C3.4: Config patterns
enhance_with_ai=source.get('ai_mode', 'auto') != 'none',
ai_mode=source.get('ai_mode', 'auto')
enhance_with_ai=source.get("ai_mode", "auto") != "none",
ai_mode=source.get("ai_mode", "auto"),
)
# Load C3.x outputs into memory
c3_data = {
'patterns': self._load_json(temp_output / 'patterns' / 'detected_patterns.json'),
'test_examples': self._load_json(temp_output / 'test_examples' / 'test_examples.json'),
'how_to_guides': self._load_guide_collection(temp_output / 'tutorials'),
'config_patterns': self._load_json(temp_output / 'config_patterns' / 'config_patterns.json'),
'architecture': self._load_json(temp_output / 'architecture' / 'architectural_patterns.json'),
'api_reference': self._load_api_reference(temp_output / 'api_reference'), # C2.5
'dependency_graph': self._load_json(temp_output / 'dependencies' / 'dependency_graph.json') # C2.6
"patterns": self._load_json(temp_output / "patterns" / "detected_patterns.json"),
"test_examples": self._load_json(temp_output / "test_examples" / "test_examples.json"),
"how_to_guides": self._load_guide_collection(temp_output / "tutorials"),
"config_patterns": self._load_json(temp_output / "config_patterns" / "config_patterns.json"),
"architecture": self._load_json(temp_output / "architecture" / "architectural_patterns.json"),
"api_reference": self._load_api_reference(temp_output / "api_reference"), # C2.5
"dependency_graph": self._load_json(temp_output / "dependencies" / "dependency_graph.json"), # C2.6
}
# Log summary
total_patterns = sum(len(f.get('patterns', [])) for f in c3_data.get('patterns', []))
total_examples = c3_data.get('test_examples', {}).get('total_examples', 0)
total_guides = len(c3_data.get('how_to_guides', {}).get('guides', []))
total_configs = len(c3_data.get('config_patterns', {}).get('config_files', []))
arch_patterns = len(c3_data.get('architecture', {}).get('patterns', []))
total_patterns = sum(len(f.get("patterns", [])) for f in c3_data.get("patterns", []))
total_examples = c3_data.get("test_examples", {}).get("total_examples", 0)
total_guides = len(c3_data.get("how_to_guides", {}).get("guides", []))
total_configs = len(c3_data.get("config_patterns", {}).get("config_files", []))
arch_patterns = len(c3_data.get("architecture", {}).get("patterns", []))
logger.info(f" ✓ Design Patterns: {total_patterns}")
logger.info(f" ✓ Test Examples: {total_examples}")
@@ -645,6 +637,7 @@ class UnifiedScraper:
except Exception as e:
logger.error(f"C3.x analysis failed: {e}")
import traceback
traceback.print_exc()
return {}
@@ -656,7 +649,7 @@ class UnifiedScraper:
except Exception as e:
logger.warning(f"Failed to clean up temp directory: {e}")
def detect_conflicts(self) -> List:
def detect_conflicts(self) -> list:
"""
Detect conflicts between documentation and code.
@@ -674,18 +667,18 @@ class UnifiedScraper:
return []
# Get documentation and GitHub data
docs_data = self.scraped_data.get('documentation', {})
github_data = self.scraped_data.get('github', {})
docs_data = self.scraped_data.get("documentation", {})
github_data = self.scraped_data.get("github", {})
if not docs_data or not github_data:
logger.warning("Missing documentation or GitHub data for conflict detection")
return []
# Load data files
with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
with open(docs_data["data_file"], encoding="utf-8") as f:
docs_json = json.load(f)
with open(github_data['data_file'], 'r', encoding='utf-8') as f:
with open(github_data["data_file"], encoding="utf-8") as f:
github_json = json.load(f)
# Detect conflicts
@@ -693,26 +686,26 @@ class UnifiedScraper:
conflicts = detector.detect_all_conflicts()
# Save conflicts
conflicts_file = os.path.join(self.data_dir, 'conflicts.json')
conflicts_file = os.path.join(self.data_dir, "conflicts.json")
detector.save_conflicts(conflicts, conflicts_file)
# Print summary
summary = detector.generate_summary(conflicts)
logger.info(f"\n📊 Conflict Summary:")
logger.info("\n📊 Conflict Summary:")
logger.info(f" Total: {summary['total']}")
logger.info(f" By Type:")
for ctype, count in summary['by_type'].items():
logger.info(" By Type:")
for ctype, count in summary["by_type"].items():
if count > 0:
logger.info(f" - {ctype}: {count}")
logger.info(f" By Severity:")
for severity, count in summary['by_severity'].items():
logger.info(" By Severity:")
for severity, count in summary["by_severity"].items():
if count > 0:
emoji = '🔴' if severity == 'high' else '🟡' if severity == 'medium' else '🟢'
emoji = "🔴" if severity == "high" else "🟡" if severity == "medium" else "🟢"
logger.info(f" {emoji} {severity}: {count}")
return conflicts
def merge_sources(self, conflicts: List):
def merge_sources(self, conflicts: list):
"""
Merge data from multiple sources.
@@ -728,18 +721,18 @@ class UnifiedScraper:
return None
# Get data files
docs_data = self.scraped_data.get('documentation', {})
github_data = self.scraped_data.get('github', {})
docs_data = self.scraped_data.get("documentation", {})
github_data = self.scraped_data.get("github", {})
# Load data
with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
with open(docs_data["data_file"], encoding="utf-8") as f:
docs_json = json.load(f)
with open(github_data['data_file'], 'r', encoding='utf-8') as f:
with open(github_data["data_file"], encoding="utf-8") as f:
github_json = json.load(f)
# Choose merger
if self.merge_mode == 'claude-enhanced':
if self.merge_mode == "claude-enhanced":
merger = ClaudeEnhancedMerger(docs_json, github_json, conflicts)
else:
merger = RuleBasedMerger(docs_json, github_json, conflicts)
@@ -748,15 +741,15 @@ class UnifiedScraper:
merged_data = merger.merge_all()
# Save merged data
merged_file = os.path.join(self.data_dir, 'merged_data.json')
with open(merged_file, 'w', encoding='utf-8') as f:
merged_file = os.path.join(self.data_dir, "merged_data.json")
with open(merged_file, "w", encoding="utf-8") as f:
json.dump(merged_data, f, indent=2, ensure_ascii=False)
logger.info(f"✅ Merged data saved: {merged_file}")
return merged_data
def build_skill(self, merged_data: Optional[Dict] = None):
def build_skill(self, merged_data: dict | None = None):
"""
Build final unified skill.
@@ -769,20 +762,14 @@ class UnifiedScraper:
# Load conflicts if they exist
conflicts = []
conflicts_file = os.path.join(self.data_dir, 'conflicts.json')
conflicts_file = os.path.join(self.data_dir, "conflicts.json")
if os.path.exists(conflicts_file):
with open(conflicts_file, 'r', encoding='utf-8') as f:
with open(conflicts_file, encoding="utf-8") as f:
conflicts_data = json.load(f)
conflicts = conflicts_data.get('conflicts', [])
conflicts = conflicts_data.get("conflicts", [])
# Build skill
builder = UnifiedSkillBuilder(
self.config,
self.scraped_data,
merged_data,
conflicts,
cache_dir=self.cache_dir
)
builder = UnifiedSkillBuilder(self.config, self.scraped_data, merged_data, conflicts, cache_dir=self.cache_dir)
builder.build()
@@ -824,6 +811,7 @@ class UnifiedScraper:
except Exception as e:
logger.error(f"\n\n❌ Error during scraping: {e}")
import traceback
traceback.print_exc()
sys.exit(1)
@@ -831,7 +819,7 @@ class UnifiedScraper:
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description='Unified multi-source scraper',
description="Unified multi-source scraper",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
@@ -843,17 +831,18 @@ Examples:
# Backward compatible with legacy configs
skill-seekers unified --config configs/react.json
"""
""",
)
parser.add_argument('--config', '-c', required=True,
help='Path to unified config JSON file')
parser.add_argument('--merge-mode', '-m',
choices=['rule-based', 'claude-enhanced'],
help='Override config merge mode')
parser.add_argument('--skip-codebase-analysis',
action='store_true',
help='Skip C3.x codebase analysis for GitHub sources (default: enabled)')
parser.add_argument("--config", "-c", required=True, help="Path to unified config JSON file")
parser.add_argument(
"--merge-mode", "-m", choices=["rule-based", "claude-enhanced"], help="Override config merge mode"
)
parser.add_argument(
"--skip-codebase-analysis",
action="store_true",
help="Skip C3.x codebase analysis for GitHub sources (default: enabled)",
)
args = parser.parse_args()
@@ -862,14 +851,14 @@ Examples:
# Disable codebase analysis if requested
if args.skip_codebase_analysis:
for source in scraper.config.get('sources', []):
if source['type'] == 'github':
source['enable_codebase_analysis'] = False
for source in scraper.config.get("sources", []):
if source["type"] == "github":
source["enable_codebase_analysis"] = False
logger.info(f"⏭️ Skipping codebase analysis for GitHub source: {source.get('repo', 'unknown')}")
# Run scraper
scraper.run()
if __name__ == '__main__':
if __name__ == "__main__":
main()