run ruff
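The diff below is a mechanical formatting and lint-autofix pass; the exact command is not recorded in the commit message. Assuming the project's Ruff configuration enables the import-sorting (I) and pyupgrade (UP) rule sets alongside the formatter, a run roughly like the following sketch would reproduce these changes:

    # Hypothetical reproduction of the formatting pass (exact invocation unknown).
    # Assumes ruff is installed and the project config enables the I (import
    # sorting) and UP (pyupgrade) rules that account for the rewrites below.
    import subprocess

    for cmd in (["ruff", "check", "--fix", "."], ["ruff", "format", "."]):
        # "check --fix" applies lint autofixes (import order, typing modernization,
        # dropping redundant f-prefixes); "format" normalizes quoting and wrapping.
        subprocess.run(cmd, check=True)

Net effect in this file: imports sorted, typing.Dict/List/Optional replaced by built-in generics and X | None, single quotes normalized to double quotes, f-strings without placeholders downgraded to plain strings, and IOError folded into OSError.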
@@ -12,31 +12,28 @@ Usage:
skill-seekers unified --config configs/react_unified.json --merge-mode claude-enhanced
"""

import os
import sys
import argparse
import json
import logging
import argparse
import subprocess
import os
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Dict, List, Any, Optional
from typing import Any

# Import validators and scrapers
try:
from skill_seekers.cli.config_validator import ConfigValidator, validate_config
from skill_seekers.cli.conflict_detector import ConflictDetector
from skill_seekers.cli.merge_sources import RuleBasedMerger, ClaudeEnhancedMerger
from skill_seekers.cli.merge_sources import ClaudeEnhancedMerger, RuleBasedMerger
from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
except ImportError as e:
print(f"Error importing modules: {e}")
print("Make sure you're running from the project root directory")
sys.exit(1)

logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
@@ -52,7 +49,7 @@ class UnifiedScraper:
5. Build unified skill
"""

def __init__(self, config_path: str, merge_mode: Optional[str] = None):
def __init__(self, config_path: str, merge_mode: str | None = None):
"""
Initialize unified scraper.

@@ -68,21 +65,21 @@ class UnifiedScraper:
self.config = self.validator.config

# Determine merge mode
self.merge_mode = merge_mode or self.config.get('merge_mode', 'rule-based')
self.merge_mode = merge_mode or self.config.get("merge_mode", "rule-based")
logger.info(f"Merge mode: {self.merge_mode}")

# Storage for scraped data - use lists to support multiple sources of same type
self.scraped_data = {
'documentation': [], # List of doc sources
'github': [], # List of github sources
'pdf': [] # List of pdf sources
"documentation": [], # List of doc sources
"github": [], # List of github sources
"pdf": [], # List of pdf sources
}

# Track source index for unique naming (multi-source support)
self._source_counters = {'documentation': 0, 'github': 0, 'pdf': 0}
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0}

# Output paths - cleaner organization
self.name = self.config['name']
self.name = self.config["name"]
self.output_dir = f"output/{self.name}" # Final skill only

# Use hidden cache directory for intermediate files
@@ -107,17 +104,16 @@ class UnifiedScraper:
from datetime import datetime

# Create log filename with timestamp
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_file = f"{self.logs_dir}/unified_{timestamp}.log"

# Add file handler to root logger
file_handler = logging.FileHandler(log_file, encoding='utf-8')
file_handler = logging.FileHandler(log_file, encoding="utf-8")
file_handler.setLevel(logging.DEBUG)

# Create formatter
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
"%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)
file_handler.setFormatter(formatter)

@@ -141,18 +137,18 @@ class UnifiedScraper:
logger.warning("Config is not unified format, converting...")
self.config = self.validator.convert_legacy_to_unified()

sources = self.config.get('sources', [])
sources = self.config.get("sources", [])

for i, source in enumerate(sources):
source_type = source['type']
logger.info(f"\n[{i+1}/{len(sources)}] Scraping {source_type} source...")
source_type = source["type"]
logger.info(f"\n[{i + 1}/{len(sources)}] Scraping {source_type} source...")

try:
if source_type == 'documentation':
if source_type == "documentation":
self._scrape_documentation(source)
elif source_type == 'github':
elif source_type == "github":
self._scrape_github(source)
elif source_type == 'pdf':
elif source_type == "pdf":
self._scrape_pdf(source)
else:
logger.warning(f"Unknown source type: {source_type}")
@@ -162,40 +158,40 @@ class UnifiedScraper:

logger.info(f"\n✅ Scraped {len(self.scraped_data)} sources successfully")

def _scrape_documentation(self, source: Dict[str, Any]):
def _scrape_documentation(self, source: dict[str, Any]):
"""Scrape documentation website."""
# Create temporary config for doc scraper
doc_config = {
'name': f"{self.name}_docs",
'base_url': source['base_url'],
'selectors': source.get('selectors', {}),
'url_patterns': source.get('url_patterns', {}),
'categories': source.get('categories', {}),
'rate_limit': source.get('rate_limit', 0.5),
'max_pages': source.get('max_pages', 100)
"name": f"{self.name}_docs",
"base_url": source["base_url"],
"selectors": source.get("selectors", {}),
"url_patterns": source.get("url_patterns", {}),
"categories": source.get("categories", {}),
"rate_limit": source.get("rate_limit", 0.5),
"max_pages": source.get("max_pages", 100),
}

# Pass through llms.txt settings (so unified configs behave the same as doc_scraper configs)
if 'llms_txt_url' in source:
doc_config['llms_txt_url'] = source.get('llms_txt_url')
if "llms_txt_url" in source:
doc_config["llms_txt_url"] = source.get("llms_txt_url")

if 'skip_llms_txt' in source:
doc_config['skip_llms_txt'] = source.get('skip_llms_txt')
if "skip_llms_txt" in source:
doc_config["skip_llms_txt"] = source.get("skip_llms_txt")

# Optional: support overriding start URLs
if 'start_urls' in source:
doc_config['start_urls'] = source.get('start_urls')
if "start_urls" in source:
doc_config["start_urls"] = source.get("start_urls")

# Write temporary config
temp_config_path = os.path.join(self.data_dir, 'temp_docs_config.json')
with open(temp_config_path, 'w', encoding='utf-8') as f:
temp_config_path = os.path.join(self.data_dir, "temp_docs_config.json")
with open(temp_config_path, "w", encoding="utf-8") as f:
json.dump(doc_config, f, indent=2)

# Run doc_scraper as subprocess
logger.info(f"Scraping documentation from {source['base_url']}")

doc_scraper_path = Path(__file__).parent / "doc_scraper.py"
cmd = [sys.executable, str(doc_scraper_path), '--config', temp_config_path, '--fresh']
cmd = [sys.executable, str(doc_scraper_path), "--config", temp_config_path, "--fresh"]

result = subprocess.run(cmd, capture_output=True, text=True, stdin=subprocess.DEVNULL)
@@ -213,18 +209,20 @@ class UnifiedScraper:
docs_data_file = f"output/{doc_config['name']}_data/summary.json"

if os.path.exists(docs_data_file):
with open(docs_data_file, 'r', encoding='utf-8') as f:
with open(docs_data_file, encoding="utf-8") as f:
summary = json.load(f)

# Append to documentation list (multi-source support)
self.scraped_data['documentation'].append({
'source_id': doc_config['name'],
'base_url': source['base_url'],
'pages': summary.get('pages', []),
'total_pages': summary.get('total_pages', 0),
'data_file': docs_data_file,
'refs_dir': '' # Will be set after moving to cache
})
self.scraped_data["documentation"].append(
{
"source_id": doc_config["name"],
"base_url": source["base_url"],
"pages": summary.get("pages", []),
"total_pages": summary.get("total_pages", 0),
"data_file": docs_data_file,
"refs_dir": "", # Will be set after moving to cache
}
)

logger.info(f"✅ Documentation: {summary.get('total_pages', 0)} pages scraped")
else:
@@ -246,9 +244,9 @@ class UnifiedScraper:
logger.info(f"📦 Moved docs output to cache: {cache_docs_dir}")

# Update refs_dir in scraped_data with cache location
refs_dir_path = os.path.join(cache_docs_dir, 'references')
if self.scraped_data['documentation']:
self.scraped_data['documentation'][-1]['refs_dir'] = refs_dir_path
refs_dir_path = os.path.join(cache_docs_dir, "references")
if self.scraped_data["documentation"]:
self.scraped_data["documentation"][-1]["refs_dir"] = refs_dir_path

if os.path.exists(docs_data_dir):
cache_data_dir = os.path.join(self.data_dir, f"{doc_config['name']}_data")
@@ -257,7 +255,7 @@ class UnifiedScraper:
shutil.move(docs_data_dir, cache_data_dir)
logger.info(f"📦 Moved docs data to cache: {cache_data_dir}")

def _clone_github_repo(self, repo_name: str, idx: int = 0) -> Optional[str]:
def _clone_github_repo(self, repo_name: str, idx: int = 0) -> str | None:
"""
Clone GitHub repository to cache directory for C3.x analysis.
Reuses existing clone if already present.
@@ -274,9 +272,9 @@ class UnifiedScraper:
clone_path = os.path.join(self.repos_dir, repo_dir_name)

# Check if already cloned
if os.path.exists(clone_path) and os.path.isdir(os.path.join(clone_path, '.git')):
if os.path.exists(clone_path) and os.path.isdir(os.path.join(clone_path, ".git")):
logger.info(f"♻️ Found existing repository clone: {clone_path}")
logger.info(f" Reusing for C3.x analysis (skip re-cloning)")
logger.info(" Reusing for C3.x analysis (skip re-cloning)")
return clone_path

# repos_dir already created in __init__
@@ -285,18 +283,18 @@ class UnifiedScraper:
repo_url = f"https://github.com/{repo_name}.git"
logger.info(f"🔄 Cloning repository for C3.x analysis: {repo_url}")
logger.info(f" → {clone_path}")
logger.info(f" 💾 Clone will be saved for future reuse")
logger.info(" 💾 Clone will be saved for future reuse")

try:
result = subprocess.run(
['git', 'clone', repo_url, clone_path],
["git", "clone", repo_url, clone_path],
capture_output=True,
text=True,
timeout=600 # 10 minute timeout for full clone
timeout=600, # 10 minute timeout for full clone
)

if result.returncode == 0:
logger.info(f"✅ Repository cloned successfully")
logger.info("✅ Repository cloned successfully")
logger.info(f" 📁 Saved to: {clone_path}")
return clone_path
else:
@@ -307,7 +305,7 @@ class UnifiedScraper:
return None

except subprocess.TimeoutExpired:
logger.error(f"❌ Git clone timed out after 10 minutes")
logger.error("❌ Git clone timed out after 10 minutes")
if os.path.exists(clone_path):
shutil.rmtree(clone_path)
return None
@@ -317,7 +315,7 @@ class UnifiedScraper:
shutil.rmtree(clone_path)
return None

def _scrape_github(self, source: Dict[str, Any]):
def _scrape_github(self, source: dict[str, Any]):
"""Scrape GitHub repository."""
try:
from skill_seekers.cli.github_scraper import GitHubScraper
@@ -326,16 +324,16 @@ class UnifiedScraper:
return

# Multi-source support: Get unique index for this GitHub source
idx = self._source_counters['github']
self._source_counters['github'] += 1
idx = self._source_counters["github"]
self._source_counters["github"] += 1

# Extract repo identifier for unique naming
repo = source['repo']
repo_id = repo.replace('/', '_')
repo = source["repo"]
repo_id = repo.replace("/", "_")

# Check if we need to clone for C3.x analysis
enable_codebase_analysis = source.get('enable_codebase_analysis', True)
local_repo_path = source.get('local_repo_path')
enable_codebase_analysis = source.get("enable_codebase_analysis", True)
local_repo_path = source.get("local_repo_path")
cloned_repo_path = None

# Auto-clone if C3.x analysis is enabled but no local path provided
@@ -351,24 +349,24 @@ class UnifiedScraper:

# Create config for GitHub scraper
github_config = {
'repo': repo,
'name': f"{self.name}_github_{idx}_{repo_id}",
'github_token': source.get('github_token'),
'include_issues': source.get('include_issues', True),
'max_issues': source.get('max_issues', 100),
'include_changelog': source.get('include_changelog', True),
'include_releases': source.get('include_releases', True),
'include_code': source.get('include_code', True),
'code_analysis_depth': source.get('code_analysis_depth', 'surface'),
'file_patterns': source.get('file_patterns', []),
'local_repo_path': local_repo_path # Use cloned path if available
"repo": repo,
"name": f"{self.name}_github_{idx}_{repo_id}",
"github_token": source.get("github_token"),
"include_issues": source.get("include_issues", True),
"max_issues": source.get("max_issues", 100),
"include_changelog": source.get("include_changelog", True),
"include_releases": source.get("include_releases", True),
"include_code": source.get("include_code", True),
"code_analysis_depth": source.get("code_analysis_depth", "surface"),
"file_patterns": source.get("file_patterns", []),
"local_repo_path": local_repo_path, # Use cloned path if available
}

# Pass directory exclusions if specified (optional)
if 'exclude_dirs' in source:
github_config['exclude_dirs'] = source['exclude_dirs']
if 'exclude_dirs_additional' in source:
github_config['exclude_dirs_additional'] = source['exclude_dirs_additional']
if "exclude_dirs" in source:
github_config["exclude_dirs"] = source["exclude_dirs"]
if "exclude_dirs_additional" in source:
github_config["exclude_dirs_additional"] = source["exclude_dirs_additional"]

# Scrape
logger.info(f"Scraping GitHub repository: {source['repo']}")
@@ -381,13 +379,14 @@ class UnifiedScraper:
try:
c3_data = self._run_c3_analysis(local_repo_path, source)
if c3_data:
github_data['c3_analysis'] = c3_data
github_data["c3_analysis"] = c3_data
logger.info("✅ C3.x analysis complete")
else:
logger.warning("⚠️ C3.x analysis returned no data")
except Exception as e:
logger.warning(f"⚠️ C3.x analysis failed: {e}")
import traceback

logger.debug(f"Traceback: {traceback.format_exc()}")
# Continue without C3.x data - graceful degradation

@@ -396,32 +395,29 @@ class UnifiedScraper:
logger.info(f"📁 Repository clone saved for future use: {cloned_repo_path}")

# Save data to unified location with unique filename
github_data_file = os.path.join(self.data_dir, f'github_data_{idx}_{repo_id}.json')
with open(github_data_file, 'w', encoding='utf-8') as f:
github_data_file = os.path.join(self.data_dir, f"github_data_{idx}_{repo_id}.json")
with open(github_data_file, "w", encoding="utf-8") as f:
json.dump(github_data, f, indent=2, ensure_ascii=False)

# ALSO save to the location GitHubToSkillConverter expects (with C3.x data!)
converter_data_file = f"output/{github_config['name']}_github_data.json"
with open(converter_data_file, 'w', encoding='utf-8') as f:
with open(converter_data_file, "w", encoding="utf-8") as f:
json.dump(github_data, f, indent=2, ensure_ascii=False)

# Append to list instead of overwriting (multi-source support)
self.scraped_data['github'].append({
'repo': repo,
'repo_id': repo_id,
'idx': idx,
'data': github_data,
'data_file': github_data_file
})
self.scraped_data["github"].append(
{"repo": repo, "repo_id": repo_id, "idx": idx, "data": github_data, "data_file": github_data_file}
)

# Build standalone SKILL.md for synthesis using GitHubToSkillConverter
try:
from skill_seekers.cli.github_scraper import GitHubToSkillConverter

# Use github_config which has the correct name field
# Converter will load from output/{name}_github_data.json which now has C3.x data
converter = GitHubToSkillConverter(config=github_config)
converter.build_skill()
logger.info(f"✅ GitHub: Standalone SKILL.md created")
logger.info("✅ GitHub: Standalone SKILL.md created")
except Exception as e:
logger.warning(f"⚠️ Failed to build standalone GitHub SKILL.md: {e}")
@@ -430,7 +426,7 @@ class UnifiedScraper:
github_data_file_path = f"output/{github_config['name']}_github_data.json"

if os.path.exists(github_output_dir):
cache_github_dir = os.path.join(self.sources_dir, github_config['name'])
cache_github_dir = os.path.join(self.sources_dir, github_config["name"])
if os.path.exists(cache_github_dir):
shutil.rmtree(cache_github_dir)
shutil.move(github_output_dir, cache_github_dir)
@@ -443,9 +439,9 @@ class UnifiedScraper:
shutil.move(github_data_file_path, cache_github_data)
logger.info(f"📦 Moved GitHub data to cache: {cache_github_data}")

logger.info(f"✅ GitHub: Repository scraped successfully")
logger.info("✅ GitHub: Repository scraped successfully")

def _scrape_pdf(self, source: Dict[str, Any]):
def _scrape_pdf(self, source: dict[str, Any]):
"""Scrape PDF document."""
try:
from skill_seekers.cli.pdf_scraper import PDFToSkillConverter
@@ -454,20 +450,20 @@ class UnifiedScraper:
return

# Multi-source support: Get unique index for this PDF source
idx = self._source_counters['pdf']
self._source_counters['pdf'] += 1
idx = self._source_counters["pdf"]
self._source_counters["pdf"] += 1

# Extract PDF identifier for unique naming (filename without extension)
pdf_path = source['path']
pdf_path = source["path"]
pdf_id = os.path.splitext(os.path.basename(pdf_path))[0]

# Create config for PDF scraper
pdf_config = {
'name': f"{self.name}_pdf_{idx}_{pdf_id}",
'pdf': source['path'],
'extract_tables': source.get('extract_tables', False),
'ocr': source.get('ocr', False),
'password': source.get('password')
"name": f"{self.name}_pdf_{idx}_{pdf_id}",
"pdf": source["path"],
"extract_tables": source.get("extract_tables", False),
"ocr": source.get("ocr", False),
"password": source.get("password"),
}

# Scrape
@@ -476,29 +472,25 @@ class UnifiedScraper:
pdf_data = converter.extract_all()

# Save data
pdf_data_file = os.path.join(self.data_dir, f'pdf_data_{idx}_{pdf_id}.json')
with open(pdf_data_file, 'w', encoding='utf-8') as f:
pdf_data_file = os.path.join(self.data_dir, f"pdf_data_{idx}_{pdf_id}.json")
with open(pdf_data_file, "w", encoding="utf-8") as f:
json.dump(pdf_data, f, indent=2, ensure_ascii=False)

# Append to list instead of overwriting
self.scraped_data['pdf'].append({
'pdf_path': pdf_path,
'pdf_id': pdf_id,
'idx': idx,
'data': pdf_data,
'data_file': pdf_data_file
})
self.scraped_data["pdf"].append(
{"pdf_path": pdf_path, "pdf_id": pdf_id, "idx": idx, "data": pdf_data, "data_file": pdf_data_file}
)

# Build standalone SKILL.md for synthesis
try:
converter.build_skill()
logger.info(f"✅ PDF: Standalone SKILL.md created")
logger.info("✅ PDF: Standalone SKILL.md created")
except Exception as e:
logger.warning(f"⚠️ Failed to build standalone PDF SKILL.md: {e}")

logger.info(f"✅ PDF: {len(pdf_data.get('pages', []))} pages extracted")

def _load_json(self, file_path: Path) -> Dict:
def _load_json(self, file_path: Path) -> dict:
"""
Load JSON file safely.

@@ -513,13 +505,13 @@ class UnifiedScraper:
return {}

try:
with open(file_path, 'r', encoding='utf-8') as f:
with open(file_path, encoding="utf-8") as f:
return json.load(f)
except (json.JSONDecodeError, IOError) as e:
except (OSError, json.JSONDecodeError) as e:
logger.warning(f"Failed to load JSON {file_path}: {e}")
return {}

def _load_guide_collection(self, tutorials_dir: Path) -> Dict:
def _load_guide_collection(self, tutorials_dir: Path) -> dict:
"""
Load how-to guide collection from tutorials directory.
@@ -531,22 +523,22 @@ class UnifiedScraper:
"""
if not tutorials_dir.exists():
logger.warning(f"Tutorials directory not found: {tutorials_dir}")
return {'guides': []}
return {"guides": []}

collection_file = tutorials_dir / 'guide_collection.json'
collection_file = tutorials_dir / "guide_collection.json"
if collection_file.exists():
return self._load_json(collection_file)

# Fallback: scan for individual guide JSON files
guides = []
for guide_file in tutorials_dir.glob('guide_*.json'):
for guide_file in tutorials_dir.glob("guide_*.json"):
guide_data = self._load_json(guide_file)
if guide_data:
guides.append(guide_data)

return {'guides': guides, 'total_count': len(guides)}
return {"guides": guides, "total_count": len(guides)}

def _load_api_reference(self, api_dir: Path) -> Dict[str, Any]:
def _load_api_reference(self, api_dir: Path) -> dict[str, Any]:
"""
Load API reference markdown files from api_reference directory.

@@ -561,16 +553,16 @@ class UnifiedScraper:
return {}

api_refs = {}
for md_file in api_dir.glob('*.md'):
for md_file in api_dir.glob("*.md"):
try:
module_name = md_file.stem
api_refs[module_name] = md_file.read_text(encoding='utf-8')
except IOError as e:
api_refs[module_name] = md_file.read_text(encoding="utf-8")
except OSError as e:
logger.warning(f"Failed to read API reference {md_file}: {e}")

return api_refs

def _run_c3_analysis(self, local_repo_path: str, source: Dict[str, Any]) -> Dict[str, Any]:
def _run_c3_analysis(self, local_repo_path: str, source: dict[str, Any]) -> dict[str, Any]:
"""
Run comprehensive C3.x codebase analysis.
@@ -592,7 +584,7 @@ class UnifiedScraper:
return {}

# Create temp output dir for C3.x analysis
temp_output = Path(self.data_dir) / 'c3_analysis_temp'
temp_output = Path(self.data_dir) / "c3_analysis_temp"
temp_output.mkdir(parents=True, exist_ok=True)

logger.info(f" Analyzing codebase: {local_repo_path}")
@@ -602,37 +594,37 @@ class UnifiedScraper:
results = analyze_codebase(
directory=Path(local_repo_path),
output_dir=temp_output,
depth='deep',
depth="deep",
languages=None, # Analyze all languages
file_patterns=source.get('file_patterns'),
build_api_reference=True, # C2.5: API Reference
extract_comments=False, # Not needed
file_patterns=source.get("file_patterns"),
build_api_reference=True, # C2.5: API Reference
extract_comments=False, # Not needed
build_dependency_graph=True, # C2.6: Dependency Graph
detect_patterns=True, # C3.1: Design patterns
extract_test_examples=True, # C3.2: Test examples
build_how_to_guides=True, # C3.3: How-to guides
detect_patterns=True, # C3.1: Design patterns
extract_test_examples=True, # C3.2: Test examples
build_how_to_guides=True, # C3.3: How-to guides
extract_config_patterns=True, # C3.4: Config patterns
enhance_with_ai=source.get('ai_mode', 'auto') != 'none',
ai_mode=source.get('ai_mode', 'auto')
enhance_with_ai=source.get("ai_mode", "auto") != "none",
ai_mode=source.get("ai_mode", "auto"),
)

# Load C3.x outputs into memory
c3_data = {
'patterns': self._load_json(temp_output / 'patterns' / 'detected_patterns.json'),
'test_examples': self._load_json(temp_output / 'test_examples' / 'test_examples.json'),
'how_to_guides': self._load_guide_collection(temp_output / 'tutorials'),
'config_patterns': self._load_json(temp_output / 'config_patterns' / 'config_patterns.json'),
'architecture': self._load_json(temp_output / 'architecture' / 'architectural_patterns.json'),
'api_reference': self._load_api_reference(temp_output / 'api_reference'), # C2.5
'dependency_graph': self._load_json(temp_output / 'dependencies' / 'dependency_graph.json') # C2.6
"patterns": self._load_json(temp_output / "patterns" / "detected_patterns.json"),
"test_examples": self._load_json(temp_output / "test_examples" / "test_examples.json"),
"how_to_guides": self._load_guide_collection(temp_output / "tutorials"),
"config_patterns": self._load_json(temp_output / "config_patterns" / "config_patterns.json"),
"architecture": self._load_json(temp_output / "architecture" / "architectural_patterns.json"),
"api_reference": self._load_api_reference(temp_output / "api_reference"), # C2.5
"dependency_graph": self._load_json(temp_output / "dependencies" / "dependency_graph.json"), # C2.6
}

# Log summary
total_patterns = sum(len(f.get('patterns', [])) for f in c3_data.get('patterns', []))
total_examples = c3_data.get('test_examples', {}).get('total_examples', 0)
total_guides = len(c3_data.get('how_to_guides', {}).get('guides', []))
total_configs = len(c3_data.get('config_patterns', {}).get('config_files', []))
arch_patterns = len(c3_data.get('architecture', {}).get('patterns', []))
total_patterns = sum(len(f.get("patterns", [])) for f in c3_data.get("patterns", []))
total_examples = c3_data.get("test_examples", {}).get("total_examples", 0)
total_guides = len(c3_data.get("how_to_guides", {}).get("guides", []))
total_configs = len(c3_data.get("config_patterns", {}).get("config_files", []))
arch_patterns = len(c3_data.get("architecture", {}).get("patterns", []))

logger.info(f" ✓ Design Patterns: {total_patterns}")
logger.info(f" ✓ Test Examples: {total_examples}")
@@ -645,6 +637,7 @@ class UnifiedScraper:
except Exception as e:
logger.error(f"C3.x analysis failed: {e}")
import traceback

traceback.print_exc()
return {}

@@ -656,7 +649,7 @@ class UnifiedScraper:
except Exception as e:
logger.warning(f"Failed to clean up temp directory: {e}")

def detect_conflicts(self) -> List:
def detect_conflicts(self) -> list:
"""
Detect conflicts between documentation and code.

@@ -674,18 +667,18 @@ class UnifiedScraper:
return []

# Get documentation and GitHub data
docs_data = self.scraped_data.get('documentation', {})
github_data = self.scraped_data.get('github', {})
docs_data = self.scraped_data.get("documentation", {})
github_data = self.scraped_data.get("github", {})

if not docs_data or not github_data:
logger.warning("Missing documentation or GitHub data for conflict detection")
return []

# Load data files
with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
with open(docs_data["data_file"], encoding="utf-8") as f:
docs_json = json.load(f)

with open(github_data['data_file'], 'r', encoding='utf-8') as f:
with open(github_data["data_file"], encoding="utf-8") as f:
github_json = json.load(f)

# Detect conflicts
@@ -693,26 +686,26 @@ class UnifiedScraper:
conflicts = detector.detect_all_conflicts()

# Save conflicts
conflicts_file = os.path.join(self.data_dir, 'conflicts.json')
conflicts_file = os.path.join(self.data_dir, "conflicts.json")
detector.save_conflicts(conflicts, conflicts_file)

# Print summary
summary = detector.generate_summary(conflicts)
logger.info(f"\n📊 Conflict Summary:")
logger.info("\n📊 Conflict Summary:")
logger.info(f" Total: {summary['total']}")
logger.info(f" By Type:")
for ctype, count in summary['by_type'].items():
logger.info(" By Type:")
for ctype, count in summary["by_type"].items():
if count > 0:
logger.info(f" - {ctype}: {count}")
logger.info(f" By Severity:")
for severity, count in summary['by_severity'].items():
logger.info(" By Severity:")
for severity, count in summary["by_severity"].items():
if count > 0:
emoji = '🔴' if severity == 'high' else '🟡' if severity == 'medium' else '🟢'
emoji = "🔴" if severity == "high" else "🟡" if severity == "medium" else "🟢"
logger.info(f" {emoji} {severity}: {count}")

return conflicts

def merge_sources(self, conflicts: List):
def merge_sources(self, conflicts: list):
"""
Merge data from multiple sources.

@@ -728,18 +721,18 @@ class UnifiedScraper:
return None

# Get data files
docs_data = self.scraped_data.get('documentation', {})
github_data = self.scraped_data.get('github', {})
docs_data = self.scraped_data.get("documentation", {})
github_data = self.scraped_data.get("github", {})

# Load data
with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
with open(docs_data["data_file"], encoding="utf-8") as f:
docs_json = json.load(f)

with open(github_data['data_file'], 'r', encoding='utf-8') as f:
with open(github_data["data_file"], encoding="utf-8") as f:
github_json = json.load(f)

# Choose merger
if self.merge_mode == 'claude-enhanced':
if self.merge_mode == "claude-enhanced":
merger = ClaudeEnhancedMerger(docs_json, github_json, conflicts)
else:
merger = RuleBasedMerger(docs_json, github_json, conflicts)
@@ -748,15 +741,15 @@ class UnifiedScraper:
merged_data = merger.merge_all()

# Save merged data
merged_file = os.path.join(self.data_dir, 'merged_data.json')
with open(merged_file, 'w', encoding='utf-8') as f:
merged_file = os.path.join(self.data_dir, "merged_data.json")
with open(merged_file, "w", encoding="utf-8") as f:
json.dump(merged_data, f, indent=2, ensure_ascii=False)

logger.info(f"✅ Merged data saved: {merged_file}")

return merged_data

def build_skill(self, merged_data: Optional[Dict] = None):
def build_skill(self, merged_data: dict | None = None):
"""
Build final unified skill.

@@ -769,20 +762,14 @@ class UnifiedScraper:

# Load conflicts if they exist
conflicts = []
conflicts_file = os.path.join(self.data_dir, 'conflicts.json')
conflicts_file = os.path.join(self.data_dir, "conflicts.json")
if os.path.exists(conflicts_file):
with open(conflicts_file, 'r', encoding='utf-8') as f:
with open(conflicts_file, encoding="utf-8") as f:
conflicts_data = json.load(f)
conflicts = conflicts_data.get('conflicts', [])
conflicts = conflicts_data.get("conflicts", [])

# Build skill
builder = UnifiedSkillBuilder(
self.config,
self.scraped_data,
merged_data,
conflicts,
cache_dir=self.cache_dir
)
builder = UnifiedSkillBuilder(self.config, self.scraped_data, merged_data, conflicts, cache_dir=self.cache_dir)

builder.build()
@@ -824,6 +811,7 @@ class UnifiedScraper:
except Exception as e:
logger.error(f"\n\n❌ Error during scraping: {e}")
import traceback

traceback.print_exc()
sys.exit(1)

@@ -831,7 +819,7 @@ class UnifiedScraper:
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description='Unified multi-source scraper',
description="Unified multi-source scraper",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
@@ -843,17 +831,18 @@ Examples:

# Backward compatible with legacy configs
skill-seekers unified --config configs/react.json
"""
""",
)

parser.add_argument('--config', '-c', required=True,
help='Path to unified config JSON file')
parser.add_argument('--merge-mode', '-m',
choices=['rule-based', 'claude-enhanced'],
help='Override config merge mode')
parser.add_argument('--skip-codebase-analysis',
action='store_true',
help='Skip C3.x codebase analysis for GitHub sources (default: enabled)')
parser.add_argument("--config", "-c", required=True, help="Path to unified config JSON file")
parser.add_argument(
"--merge-mode", "-m", choices=["rule-based", "claude-enhanced"], help="Override config merge mode"
)
parser.add_argument(
"--skip-codebase-analysis",
action="store_true",
help="Skip C3.x codebase analysis for GitHub sources (default: enabled)",
)

args = parser.parse_args()
@@ -862,14 +851,14 @@ Examples:

# Disable codebase analysis if requested
if args.skip_codebase_analysis:
for source in scraper.config.get('sources', []):
if source['type'] == 'github':
source['enable_codebase_analysis'] = False
for source in scraper.config.get("sources", []):
if source["type"] == "github":
source["enable_codebase_analysis"] = False
logger.info(f"⏭️ Skipping codebase analysis for GitHub source: {source.get('repo', 'unknown')}")

# Run scraper
scraper.run()


if __name__ == '__main__':
if __name__ == "__main__":
main()