refactor: flatten Microsoft skills from nested to flat directory structure

Rewrote sync_microsoft_skills.py (v4) to use each SKILL.md's frontmatter
'name' field as the flat directory name under skills/, replacing the nested
skills/official/microsoft/<lang>/<category>/<service>/ hierarchy.

This fixes CI failures caused by the indexing, validation, and catalog
scripts expecting skills/<id>/SKILL.md (depth 1).

Changes:
- Rewrite scripts/sync_microsoft_skills.py for flat output with collision detection
- Update scripts/tests/inspect_microsoft_repo.py for flat name mapping
- Update scripts/tests/test_comprehensive_coverage.py for name uniqueness checks
- Delete skills/official/ nested directory
- Add 129 Microsoft skills as flat directories (e.g. skills/azure-mgmt-botservice-dotnet/)
- Move attribution files to docs/ (LICENSE-MICROSOFT, microsoft-skills-attribution.json)
- Rebuild skills_index.json, CATALOG.md, README.md (845 total skills)
This commit is contained in:
Ahmed Rehan
2026-02-12 00:07:15 +05:00
parent e06454dafd
commit e7ae616385
142 changed files with 5683 additions and 6097 deletions

View File

@@ -1,9 +1,10 @@
#!/usr/bin/env python3
"""
Test Script: Verify Microsoft Skills Sync Coverage
Tests all possible skill locations and structures
Test Script: Verify Microsoft Skills Sync Coverage and Flat Name Uniqueness
Ensures all skills are captured and no directory name collisions exist.
"""
import re
import subprocess
import tempfile
from pathlib import Path
@@ -11,204 +12,177 @@ from collections import defaultdict
MS_REPO = "https://github.com/microsoft/skills.git"
def extract_skill_name(skill_md_path: Path) -> str | None:
"""Extract the 'name' field from SKILL.md YAML frontmatter."""
try:
content = skill_md_path.read_text(encoding="utf-8")
except Exception:
return None
fm_match = re.search(r"^---\s*\n(.*?)\n---", content, re.DOTALL)
if not fm_match:
return None
for line in fm_match.group(1).splitlines():
match = re.match(r"^name:\s*(.+)$", line)
if match:
value = match.group(1).strip().strip("\"'")
if value:
return value
return None
def analyze_skill_locations():
    """
    Comprehensive analysis of all skill locations in Microsoft repo.
    Verifies flat name uniqueness and coverage.

    Shallow-clones MS_REPO into a temporary directory, finds every SKILL.md,
    groups the files by where they live, derives the flat directory name for
    each one (frontmatter 'name', or a path-based 'ms-...' fallback when the
    name is missing), and reports collisions and names that are not valid
    directory names. Progress and results are printed to stdout.

    Returns:
        A dict with the keys "total", "unique", "collisions",
        "missing_names", "invalid_names" (all counts) and "passed"
        (True when there are no collisions and no invalid names).

    Raises:
        subprocess.CalledProcessError: if the git clone exits non-zero
            (the clone is run with check=True).
    """
    print("🔬 Comprehensive Skill Coverage & Uniqueness Analysis")
    print("=" * 60)

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        print("\n1⃣ Cloning repository...")
        subprocess.run(
            ["git", "clone", "--depth", "1", MS_REPO, str(temp_path)],
            check=True,
            capture_output=True,
        )

        # Find ALL SKILL.md files
        all_skill_files = list(temp_path.rglob("SKILL.md"))
        print(f"\n2⃣ Total SKILL.md files found: {len(all_skill_files)}")

        # Categorize by location
        location_types = defaultdict(list)
        for skill_file in all_skill_files:
            # as_posix() keeps the "/"-based substring checks below valid
            # regardless of the platform's path separator.
            path_str = skill_file.as_posix()
            if ".github/skills" in path_str:
                location_types["github_skills"].append(skill_file)
            elif ".github/plugins" in path_str:
                location_types["github_plugins"].append(skill_file)
            elif "/skills/" in path_str:
                location_types["skills_dir"].append(skill_file)
            else:
                location_types["other"].append(skill_file)

        # Display results
        print("\n3⃣ Skills by Location Type:")
        print("-" * 60)
        for loc_type, files in sorted(location_types.items()):
            print(f" 📍 {loc_type}: {len(files)} skills")

        # Flat name uniqueness check
        print("\n4⃣ Flat Name Uniqueness Check:")
        print("-" * 60)

        # flat name -> every repo-relative skill directory that maps to it
        name_map: dict[str, list[str]] = {}
        missing_names = []

        for skill_file in all_skill_files:
            try:
                rel = skill_file.parent.relative_to(temp_path)
            except ValueError:
                rel = skill_file.parent

            name = extract_skill_name(skill_file)
            if not name:
                missing_names.append(str(rel))
                # Generate fallback from the path, dropping the container
                # directories that carry no identifying information.
                parts = [p for p in rel.parts if p not in (
                    ".github", "skills", "plugins")]
                name = "ms-" + "-".join(parts) if parts else str(rel)

            name_map.setdefault(name, []).append(str(rel))

        # Report results
        collisions = {n: paths for n, paths in name_map.items()
                      if len(paths) > 1}
        unique_names = {n: paths for n, paths in name_map.items()
                        if len(paths) == 1}

        print(f"\n ✅ Unique names: {len(unique_names)}")

        if missing_names:
            print(
                f"\n ⚠️ Skills missing frontmatter 'name' ({len(missing_names)}):")
            for path in missing_names[:5]:
                print(f" - {path}")
            if len(missing_names) > 5:
                print(f" ... and {len(missing_names) - 5} more")

        if collisions:
            print(f"\n ❌ Name collisions ({len(collisions)}):")
            for name, paths in collisions.items():
                print(f" '{name}':")
                for p in paths:
                    print(f" - {p}")
        else:
            print("\n ✅ No collisions detected!")

        # Validate all names are valid directory names
        print("\n5⃣ Directory Name Validation:")
        valid_name = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]*$")
        invalid_names = [name for name in name_map
                         if not valid_name.match(name)]

        if invalid_names:
            print(f" ❌ Invalid directory names ({len(invalid_names)}):")
            for name in invalid_names[:5]:
                print(f" - '{name}'")
        else:
            print(f" ✅ All {len(name_map)} names are valid directory names!")

        # Summary
        print("\n6️⃣ Summary:")
        print("-" * 60)

        total = len(all_skill_files)

        print(f" Total SKILL.md files: {total}")
        print(f" Unique flat names: {len(unique_names)}")
        print(f" Collisions: {len(collisions)}")
        print(f" Missing names: {len(missing_names)}")

        # Missing names alone do not fail the run: they get a fallback name
        # which is still subject to the collision and validity checks.
        is_pass = len(collisions) == 0 and len(invalid_names) == 0
        if is_pass:
            print("\nALL CHECKS PASSED")
        else:
            print("\n ⚠️ SOME CHECKS NEED ATTENTION")

        print("\n✨ Analysis complete!")

        return {
            "total": total,
            "unique": len(unique_names),
            "collisions": len(collisions),
            "missing_names": len(missing_names),
            "invalid_names": len(invalid_names),
            "passed": is_pass,
        }
if __name__ == "__main__":
try:
results = analyze_skill_locations()
print("\n" + "=" * 60)
print("FINAL VERDICT")
print("=" * 60)
coverage_pct = (results['handled'] / results['total'] * 100) if results['total'] > 0 else 0
print(f"\nCoverage: {coverage_pct:.1f}%")
print(f"Skills handled: {results['handled']}/{results['total']}")
if coverage_pct >= 99:
print("\n✅ V3 SCRIPT IS COMPREHENSIVE")
print(" All skill locations are properly handled!")
if results["passed"]:
print("\n✅ V4 FLAT STRUCTURE IS VALID")
print(" All names are unique and valid directory names!")
else:
print("\n⚠️ V3 SCRIPT MAY NEED ENHANCEMENT")
print(" Some edge cases might be missed")
print("\n⚠️ V4 FLAT STRUCTURE NEEDS FIXES")
if results["collisions"] > 0:
print(f" {results['collisions']} name collisions to resolve")
if results["invalid_names"] > 0:
print(f" {results['invalid_names']} invalid directory names")
except Exception as e:
print(f"\n❌ Error: {e}")
import traceback