refactor: flatten Microsoft skills from nested to flat directory structure

Rewrite sync_microsoft_skills.py (v4) to use each SKILL.md's frontmatter 'name' field as the flat directory name under skills/, replacing the nested skills/official/microsoft/<lang>/<category>/<service>/ hierarchy. This fixes CI failures caused by the indexing, validation, and catalog scripts, which expect skills/<id>/SKILL.md (depth 1).

Changes:
- Rewrite scripts/sync_microsoft_skills.py for flat output with collision detection
- Update scripts/tests/inspect_microsoft_repo.py for flat name mapping
- Update scripts/tests/test_comprehensive_coverage.py for name uniqueness checks
- Delete skills/official/ nested directory
- Add 129 Microsoft skills as flat directories (e.g. skills/azure-mgmt-botservice-dotnet/)
- Move attribution files to docs/ (LICENSE-MICROSOFT, microsoft-skills-attribution.json)
- Rebuild skills_index.json, CATALOG.md, README.md (845 total skills)
2026-02-12 00:07:15 +05:00
parent e06454dafd
commit e7ae616385
142 changed files with 5683 additions and 6097 deletions
--- a/scripts/tests/test_comprehensive_coverage.py
+++ b/scripts/tests/test_comprehensive_coverage.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python3
 """
-Test Script: Verify Microsoft Skills Sync Coverage
-Tests all possible skill locations and structures
+Test Script: Verify Microsoft Skills Sync Coverage and Flat Name Uniqueness
+Ensures all skills are captured and no directory name collisions exist.
 """

+import re
 import subprocess
 import tempfile
 from pathlib import Path
@@ -11,204 +12,177 @@ from collections import defaultdict

 MS_REPO = "https://github.com/microsoft/skills.git"

+
+def extract_skill_name(skill_md_path: Path) -> str | None:
+    """Extract the 'name' field from SKILL.md YAML frontmatter."""
+    try:
+        content = skill_md_path.read_text(encoding="utf-8")
+    except Exception:
+        return None
+
+    fm_match = re.search(r"^---\s*\n(.*?)\n---", content, re.DOTALL)
+    if not fm_match:
+        return None
+
+    for line in fm_match.group(1).splitlines():
+        match = re.match(r"^name:\s*(.+)$", line)
+        if match:
+            value = match.group(1).strip().strip("\"'")
+            if value:
+                return value
+    return None
+
+
 def analyze_skill_locations():
    """
    Comprehensive analysis of all skill locations in Microsoft repo.
-    Verifies that v3 script will catch everything.
+    Verifies flat name uniqueness and coverage.
    """
-    print("🔬 Comprehensive Skill Location Analysis")
+    print("🔬 Comprehensive Skill Coverage & Uniqueness Analysis")
    print("=" * 60)
-    
+
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
-        
+
        print("\n1️⃣ Cloning repository...")
        subprocess.run(
            ["git", "clone", "--depth", "1", MS_REPO, str(temp_path)],
            check=True,
-            capture_output=True
+            capture_output=True,
        )
-        
-        # Find ALL SKILL.md files in the entire repo
+
+        # Find ALL SKILL.md files
        all_skill_files = list(temp_path.rglob("SKILL.md"))
        print(f"\n2️⃣ Total SKILL.md files found: {len(all_skill_files)}")
-        
-        # Categorize by location type
+
+        # Categorize by location
        location_types = defaultdict(list)
-        
        for skill_file in all_skill_files:
-            skill_dir = skill_file.parent
-            
-            # Determine location type
-            if ".github/skills" in str(skill_file):
+            path_str = str(skill_file)
+            if ".github/skills" in path_str:
                location_types["github_skills"].append(skill_file)
-            elif ".github/plugins" in str(skill_file):
+            elif ".github/plugins" in path_str:
                location_types["github_plugins"].append(skill_file)
-            elif "/skills/" in str(skill_file):
-                # This is in the skills/ directory structure
-                # Check if it's via symlink or actual file
-                try:
-                    skills_root = temp_path / "skills"
-                    if skills_root in skill_file.parents:
-                        # This skill is somewhere under skills/
-                        # But is it a symlink or actual?
-                        if skill_dir.is_symlink():
-                            location_types["skills_symlinked"].append(skill_file)
-                        else:
-                            # Check if any parent is a symlink
-                            has_symlink_parent = False
-                            for parent in skill_file.parents:
-                                if parent == skills_root:
-                                    break
-                                if parent.is_symlink():
-                                    has_symlink_parent = True
-                                    break
-                            
-                            if has_symlink_parent:
-                                location_types["skills_via_symlink_parent"].append(skill_file)
-                            else:
-                                location_types["skills_direct"].append(skill_file)
-                except:
-                    location_types["unknown"].append(skill_file)
+            elif "/skills/" in path_str:
+                location_types["skills_dir"].append(skill_file)
            else:
                location_types["other"].append(skill_file)
-        
-        # Display results
+
        print("\n3️⃣ Skills by Location Type:")
-        print("-" * 60)
-        
        for loc_type, files in sorted(location_types.items()):
-            print(f"\n  📍 {loc_type}: {len(files)} skills")
-            if len(files) <= 5:
-                for f in files:
-                    try:
-                        rel = f.relative_to(temp_path)
-                        print(f"      - {rel}")
-                    except:
-                        print(f"      - {f.name}")
-            else:
-                for f in files[:3]:
-                    try:
-                        rel = f.relative_to(temp_path)
-                        print(f"      - {rel}")
-                    except:
-                        print(f"      - {f.name}")
-                print(f"      ... and {len(files) - 3} more")
-        
-        # Verify v3 coverage
-        print("\n4️⃣ V3 Script Coverage Analysis:")
+            print(f"  📍 {loc_type}: {len(files)} skills")
+
+        # Flat name uniqueness check
+        print("\n4️⃣ Flat Name Uniqueness Check:")
        print("-" * 60)
-        
-        github_skills_count = len(location_types["github_skills"])
-        github_plugins_count = len(location_types["github_plugins"])
-        skills_symlinked_count = len(location_types["skills_symlinked"])
-        skills_direct_count = len(location_types["skills_direct"])
-        skills_via_symlink_parent_count = len(location_types["skills_via_symlink_parent"])
-        
-        print(f"\n  ✅ .github/skills/: {github_skills_count}")
-        print(f"     └─ Handled by: find_all_skills() function")
-        
-        print(f"\n  ✅ .github/plugins/: {github_plugins_count}")
-        print(f"     └─ Handled by: find_plugin_skills() function")
-        
-        print(f"\n  ✅ skills/ (symlinked dirs): {skills_symlinked_count}")
-        print(f"     └─ Handled by: sync_skills_preserve_structure() lines 76-83")
-        
-        if skills_direct_count > 0:
-            print(f"\n  ✅ skills/ (direct, non-symlink): {skills_direct_count}")
-            print(f"     └─ Handled by: sync_skills_preserve_structure() lines 84-86")
+
+        name_map: dict[str, list[str]] = {}
+        missing_names = []
+
+        for skill_file in all_skill_files:
+            try:
+                rel = skill_file.parent.relative_to(temp_path)
+            except ValueError:
+                rel = skill_file.parent
+
+            name = extract_skill_name(skill_file)
+            if not name:
+                missing_names.append(str(rel))
+                # Generate fallback
+                parts = [p for p in rel.parts if p not in (
+                    ".github", "skills", "plugins")]
+                name = "ms-" + "-".join(parts) if parts else str(rel)
+
+            if name not in name_map:
+                name_map[name] = []
+            name_map[name].append(str(rel))
+
+        # Report results
+        collisions = {n: paths for n, paths in name_map.items()
+                      if len(paths) > 1}
+        unique_names = {n: paths for n,
+                        paths in name_map.items() if len(paths) == 1}
+
+        print(f"\n  ✅ Unique names: {len(unique_names)}")
+
+        if missing_names:
+            print(
+                f"\n  ⚠️  Skills missing frontmatter 'name' ({len(missing_names)}):")
+            for path in missing_names[:5]:
+                print(f"     - {path}")
+            if len(missing_names) > 5:
+                print(f"     ... and {len(missing_names) - 5} more")
+
+        if collisions:
+            print(f"\n  ❌ Name collisions ({len(collisions)}):")
+            for name, paths in collisions.items():
+                print(f"     '{name}':")
+                for p in paths:
+                    print(f"       - {p}")
        else:
-            print(f"\n  ℹ️  skills/ (direct, non-symlink): 0")
-            print(f"     └─ No direct skills found, but v3 would handle them (lines 84-86)")
-        
-        if skills_via_symlink_parent_count > 0:
-            print(f"\n  ⚠️  skills/ (via symlink parent): {skills_via_symlink_parent_count}")
-            print(f"     └─ May need special handling")
-        
+            print(f"\n  ✅ No collisions detected!")
+
+        # Validate all names are valid directory names
+        print("\n5️⃣ Directory Name Validation:")
+        invalid_names = []
+        for name in name_map:
+            if not re.match(r"^[a-zA-Z0-9][a-zA-Z0-9._-]*$", name):
+                invalid_names.append(name)
+
+        if invalid_names:
+            print(f"  ❌ Invalid directory names ({len(invalid_names)}):")
+            for name in invalid_names[:5]:
+                print(f"     - '{name}'")
+        else:
+            print(f"  ✅ All {len(name_map)} names are valid directory names!")
+
        # Summary
-        print("\n5️⃣ Summary:")
+        print("\n6️⃣ Summary:")
        print("-" * 60)
-        
-        total_handled = (github_skills_count + github_plugins_count + 
-                        skills_symlinked_count + skills_direct_count)
-        
-        print(f"\n  Total SKILL.md files: {len(all_skill_files)}")
-        print(f"  Handled by v3 script: {total_handled}")
-        
-        if total_handled == len(all_skill_files):
-            print(f"\n  ✅ 100% Coverage - All skills will be synced!")
-        elif total_handled >= len(all_skill_files) * 0.99:
-            print(f"\n  ✅ ~100% Coverage - Script handles all skills!")
-            print(f"     ({len(all_skill_files) - total_handled} skills may be duplicates)")
+        total = len(all_skill_files)
+        unique = len(unique_names) + len(collisions)
+
+        print(f"  Total SKILL.md files: {total}")
+        print(f"  Unique flat names: {len(unique_names)}")
+        print(f"  Collisions: {len(collisions)}")
+        print(f"  Missing names: {len(missing_names)}")
+
+        is_pass = len(collisions) == 0 and len(invalid_names) == 0
+        if is_pass:
+            print(f"\n  ✅ ALL CHECKS PASSED")
        else:
-            print(f"\n  ⚠️  Partial Coverage - Missing {len(all_skill_files) - total_handled} skills")
-            print(f"\n  Skills not covered:")
-            for loc_type, files in location_types.items():
-                if loc_type not in ["github_skills", "github_plugins", "skills_symlinked", "skills_direct"]:
-                    print(f"    - {loc_type}: {len(files)}")
-        
-        # Test specific cases
-        print("\n6️⃣ Testing Specific Edge Cases:")
-        print("-" * 60)
-        
-        skills_dir = temp_path / "skills"
-        if skills_dir.exists():
-            # Check for any non-symlink directories with SKILL.md
-            print("\n  Checking for non-symlinked skills in skills/...")
-            non_symlink_skills = []
-            
-            for item in skills_dir.rglob("*"):
-                if item.is_dir() and not item.is_symlink():
-                    if (item / "SKILL.md").exists():
-                        # Check if any parent is a symlink
-                        has_symlink_parent = False
-                        for parent in item.parents:
-                            if parent == skills_dir:
-                                break
-                            if parent.is_symlink():
-                                has_symlink_parent = True
-                                break
-                        
-                        if not has_symlink_parent:
-                            non_symlink_skills.append(item)
-            
-            if non_symlink_skills:
-                print(f"  ✅ Found {len(non_symlink_skills)} non-symlinked skills:")
-                for skill in non_symlink_skills[:5]:
-                    print(f"     - {skill.relative_to(skills_dir)}")
-                print(f"     These WILL be synced by v3 (lines 84-86)")
-            else:
-                print(f"  ℹ️  No non-symlinked skills found in skills/")
-                print(f"     But v3 is ready to handle them if they exist!")
-        
+            print(f"\n  ⚠️  SOME CHECKS NEED ATTENTION")
+
        print("\n✨ Analysis complete!")
-        
+
        return {
-            'total': len(all_skill_files),
-            'handled': total_handled,
-            'breakdown': {k: len(v) for k, v in location_types.items()}
+            "total": total,
+            "unique": len(unique_names),
+            "collisions": len(collisions),
+            "missing_names": len(missing_names),
+            "invalid_names": len(invalid_names),
+            "passed": is_pass,
        }

+
 if __name__ == "__main__":
    try:
        results = analyze_skill_locations()
-        
+
        print("\n" + "=" * 60)
        print("FINAL VERDICT")
        print("=" * 60)
-        
-        coverage_pct = (results['handled'] / results['total'] * 100) if results['total'] > 0 else 0
-        
-        print(f"\nCoverage: {coverage_pct:.1f}%")
-        print(f"Skills handled: {results['handled']}/{results['total']}")
-        
-        if coverage_pct >= 99:
-            print("\n✅ V3 SCRIPT IS COMPREHENSIVE")
-            print("   All skill locations are properly handled!")
+
+        if results["passed"]:
+            print("\n✅ V4 FLAT STRUCTURE IS VALID")
+            print("   All names are unique and valid directory names!")
        else:
-            print("\n⚠️  V3 SCRIPT MAY NEED ENHANCEMENT")
-            print("   Some edge cases might be missed")
-        
+            print("\n⚠️  V4 FLAT STRUCTURE NEEDS FIXES")
+            if results["collisions"] > 0:
+                print(f"   {results['collisions']} name collisions to resolve")
+            if results["invalid_names"] > 0:
+                print(f"   {results['invalid_names']} invalid directory names")
+
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback