Files
antigravity-skills-reference/scripts/analyze_remaining_similar_skills.py
sck_0 76f43ef8ee docs: add analysis and implementation reports for VoltAgent integration
- VoltAgent repository analysis and validation reports
- Similar skills analysis and implementation tracking
- HTML to markdown conversion report
- Final skills count verification
2026-01-30 09:15:27 +01:00

216 lines
7.5 KiB
Python

#!/usr/bin/env python3
"""
Analyze remaining similar skills to determine if they are truly new
and worth adding to the repository.
"""
import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.error import HTTPError, URLError
from urllib.parse import urlparse
from urllib.request import Request, urlopen
def normalize_skill_name(name: str) -> str:
    """Normalize a skill name into kebab-case.

    Lowercases the input, drops anything that is not a word character,
    whitespace, or hyphen, converts runs of whitespace/underscores into a
    single hyphen, collapses repeated hyphens, and trims hyphens from
    both ends.
    """
    transforms = (
        (r'[^\w\s-]', ''),   # strip special characters
        (r'[\s_]+', '-'),    # spaces/underscores -> hyphen
        (r'-+', '-'),        # collapse hyphen runs
    )
    result = name.lower()
    for pattern, replacement in transforms:
        result = re.sub(pattern, replacement, result)
    return result.strip('-')
def check_url_accessible(url: str) -> bool:
    """Return True if *url* answers a HEAD request with HTTP 200.

    Best-effort probe: any failure (malformed URL, DNS error, timeout,
    non-2xx status raising HTTPError) is reported as False rather than
    raised, so callers can batch-check URLs without try/except.
    """
    try:
        req = Request(url, method='HEAD')
        # 10s timeout keeps a batch of probes from hanging on one slow host.
        with urlopen(req, timeout=10) as response:
            return response.status == 200
    except Exception:
        # The original caught (URLError, HTTPError, Exception) — a redundant
        # tuple, since HTTPError subclasses URLError and Exception subsumes
        # both. A single broad catch expresses the same best-effort intent.
        return False
def get_repo_base_url(github_url: str) -> str:
"""Extract base GitHub repository URL."""
# Handle various GitHub URL formats
patterns = [
r'https://github\.com/([^/]+/[^/]+)',
r'github\.com/([^/]+/[^/]+)',
]
for pattern in patterns:
match = re.search(pattern, github_url)
if match:
return f"https://github.com/{match.group(1)}"
return None
def check_skill_file_exists(repo_url: str, skill_path: str = None) -> Tuple[bool, str]:
    """Probe a GitHub repository for a SKILL.md file.

    Tries a handful of likely locations (main/master branches, an optional
    ``skills/`` prefix, raw and blob views) and returns ``(True, url)`` for
    the first one that responds, or ``(False, None)`` when the repository
    URL cannot be parsed or no candidate answers.
    """
    base_url = get_repo_base_url(repo_url)
    if base_url is None:
        return False, None
    # Relative location of the skill file within the repo tree.
    suffix = f"{skill_path}/SKILL.md" if skill_path else "SKILL.md"
    candidates = [
        f"{base_url}/raw/main/{suffix}",
        f"{base_url}/raw/main/skills/{skill_path}/SKILL.md" if skill_path else None,
        f"{base_url}/raw/master/{suffix}",
        f"{base_url}/blob/main/{suffix}",
    ]
    for candidate in candidates:
        if candidate and check_url_accessible(candidate):
            return True, candidate
    return False, None
def analyze_similarity(skill_name: str, similar_skills: List[str], existing_skills: Dict) -> Dict:
    """Score how close *skill_name* is to skills already in the catalog.

    Args:
        skill_name: Candidate skill name.
        similar_skills: Names previously flagged as similar to the candidate.
        existing_skills: Mapping of lowercased existing skill name -> record.

    Returns:
        Dict with keys 'is_duplicate', 'is_complementary',
        'similarity_score', 'closest_match', and a list of human-readable
        'reasoning' entries explaining the verdict.
    """
    analysis = {
        'is_duplicate': False,
        'is_complementary': False,
        'similarity_score': 0.0,
        'closest_match': None,
        'reasoning': []
    }
    skill_lower = skill_name.lower()
    # Hard signal: exact / near-exact name collision with the catalog.
    # (The original iterated .items() but never used the value; fixed.)
    for existing_name in existing_skills:
        existing_lower = existing_name.lower()
        if skill_lower == existing_lower:
            analysis['is_duplicate'] = True
            analysis['closest_match'] = existing_name
            analysis['reasoning'].append(f"Exact match with existing skill: {existing_name}")
            return analysis
        # One name containing the other with near-equal length is treated
        # as the same skill spelled slightly differently.
        if skill_lower in existing_lower or existing_lower in skill_lower:
            if abs(len(skill_lower) - len(existing_lower)) <= 3:
                analysis['is_duplicate'] = True
                analysis['closest_match'] = existing_name
                analysis['similarity_score'] = 0.9
                analysis['reasoning'].append(f"Near-exact match: '{skill_name}' vs '{existing_name}'")
                return analysis
    # Soft signal: a pre-flagged similar skill already exists in the catalog.
    # (Also dropped the unused `existing_data` lookup from the original.)
    for similar in similar_skills:
        if similar.lower() in existing_skills:
            analysis['similarity_score'] = 0.7
            analysis['closest_match'] = similar
            analysis['reasoning'].append(f"Similar to existing skill: {similar}")
    # Low similarity means the skill likely adds something new.
    if analysis['similarity_score'] < 0.5:
        analysis['is_complementary'] = True
        analysis['reasoning'].append("Low similarity - likely complementary skill")
    return analysis
def main():
    """Classify remaining similar skills as duplicates, complementary, or review-needed.

    Reads ``remaining_similar_skills.json`` and ``data/catalog.json`` from
    the repository root, probes each candidate's source URL for a SKILL.md,
    scores name similarity against the catalog, and writes the report to
    ``similar_skills_analysis.json``.
    """
    base_dir = Path(__file__).parent.parent
    # Load remaining similar skills (output of the earlier analysis step).
    remaining_file = base_dir / "remaining_similar_skills.json"
    if not remaining_file.exists():
        print("❌ remaining_similar_skills.json not found. Run the analysis first.")
        return
    # encoding='utf-8' throughout: the JSON contains non-ASCII text and the
    # platform default encoding (e.g. cp1252 on Windows) would crash.
    with open(remaining_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Load existing skills; fail with a friendly message instead of a raw
    # traceback if the catalog is missing (mirrors the check above, which
    # the original only applied to remaining_file).
    catalog_file = base_dir / "data" / "catalog.json"
    if not catalog_file.exists():
        print("❌ data/catalog.json not found. Run the analysis first.")
        return
    with open(catalog_file, 'r', encoding='utf-8') as f:
        catalog = json.load(f)
    existing_skills = {s['name'].lower(): s for s in catalog.get('skills', [])}
    print(f"🔍 Analyzing {len(data['skills'])} remaining similar skills...\n")
    # NOTE(review): 'truly_new' is never populated — the summary below
    # reports the complementary count as "truly new". The key is kept so
    # the report schema stays stable for downstream consumers.
    results = {
        'truly_new': [],
        'duplicates': [],
        'complementary': [],
        'needs_review': [],
        'invalid_sources': []
    }
    for skill in data['skills']:
        skill_name = skill['name']
        print(f"Analyzing: {skill_name}")
        # Skip anything already present in the catalog or on disk.
        if skill['exists_in_catalog'] or skill['folder_exists']:
            results['duplicates'].append({
                'name': skill_name,
                'reason': 'Already exists in repository',
                'url': skill['url']
            })
            continue
        # Verify the upstream source actually serves a SKILL.md.
        exists, raw_url = check_skill_file_exists(skill['url'], skill.get('skill_part'))
        if not exists:
            results['invalid_sources'].append({
                'name': skill_name,
                'url': skill['url'],
                'reason': 'SKILL.md not found or URL inaccessible'
            })
            continue
        # Score name similarity against the existing catalog.
        similarity_analysis = analyze_similarity(
            skill_name,
            skill['similar_to'],
            existing_skills
        )
        skill_result = {
            'name': skill_name,
            'url': skill['url'],
            'raw_url': raw_url,
            'description': skill['description'],
            'org': skill['org'],
            'category': skill['category'],
            'similar_to': skill['similar_to'],
            'similarity_analysis': similarity_analysis
        }
        if similarity_analysis['is_duplicate']:
            results['duplicates'].append(skill_result)
        elif similarity_analysis['is_complementary']:
            results['complementary'].append(skill_result)
        else:
            results['needs_review'].append(skill_result)
    # Assemble and persist the report.
    report = {
        'summary': {
            'total_analyzed': len(data['skills']),
            'truly_new': len(results['complementary']),
            'duplicates': len(results['duplicates']),
            'needs_review': len(results['needs_review']),
            'invalid_sources': len(results['invalid_sources'])
        },
        'results': results
    }
    output_file = base_dir / "similar_skills_analysis.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)
    print(f"\n✅ Analysis complete!")
    print(f"📊 Summary:")
    print(f" - Truly new (complementary): {len(results['complementary'])}")
    print(f" - Duplicates: {len(results['duplicates'])}")
    print(f" - Needs review: {len(results['needs_review'])}")
    print(f" - Invalid sources: {len(results['invalid_sources'])}")
    print(f"\n📄 Full report saved to: {output_file}")
if __name__ == "__main__":
main()