Rewrote sync_microsoft_skills.py (v4) to use each SKILL.md's frontmatter `name` field as the flat directory name under skills/, replacing the nested skills/official/microsoft/<lang>/<category>/<service>/ hierarchy. This fixes CI failures caused by the indexing, validation, and catalog scripts expecting skills/<id>/SKILL.md (depth 1).

Changes:
- Rewrite scripts/sync_microsoft_skills.py for flat output with collision detection
- Update scripts/tests/inspect_microsoft_repo.py for flat name mapping
- Update scripts/tests/test_comprehensive_coverage.py for name uniqueness checks
- Delete skills/official/ nested directory
- Add 129 Microsoft skills as flat directories (e.g. skills/azure-mgmt-botservice-dotnet/)
- Move attribution files to docs/ (LICENSE-MICROSOFT, microsoft-skills-attribution.json)
- Rebuild skills_index.json, CATALOG.md, README.md (845 total skills)
668 lines
12 KiB
JavaScript
668 lines
12 KiB
JavaScript
const fs = require("fs");
const path = require("path");
const {
  listSkillIdsRecursive,
  readSkill,
  tokenize,
  unique,
} = require("../lib/skill-utils");

// Repository root: the parent of the directory that contains this script.
const ROOT = path.resolve(__dirname, "..");
// Directory handed to listSkillIdsRecursive/readSkill when building the catalog.
const SKILLS_DIR = path.join(ROOT, "skills");

/**
 * Words that buildTriggers() drops from the tokenized skill name and
 * description before they can become triggers: common English function
 * words plus generic marketing/role vocabulary.
 */
const STOPWORDS = new Set([
  // English function words
  "a",
  "an",
  "and",
  "are",
  "as",
  "at",
  "be",
  "but",
  "by",
  "for",
  "from",
  "has",
  "have",
  "in",
  "into",
  "is",
  "it",
  "its",
  "of",
  "on",
  "or",
  "our",
  "out",
  "over",
  "that",
  "the",
  "their",
  "they",
  "this",
  "to",
  "use",
  "when",
  "with",
  "you",
  "your",
  "will",
  "can",
  "if",
  "not",
  "only",
  "also",
  "more",
  // Generic descriptive vocabulary
  "best",
  "practice",
  "practices",
  "expert",
  "specialist",
  "focused",
  "focus",
  "master",
  "modern",
  "advanced",
  "comprehensive",
  "production",
  "production-ready",
  "ready",
  "build",
  "create",
  "deliver",
  "design",
  "implement",
  "implementation",
  "strategy",
  "strategies",
  "patterns",
  "pattern",
  "workflow",
  "workflows",
  "guide",
  "template",
  "templates",
  "tool",
  "tools",
  "project",
  "projects",
  "support",
  "manage",
  "management",
  "system",
  "systems",
  "services",
  "service",
  "across",
  "end",
  "end-to-end",
  "using",
  "based",
  "ensure",
  "help",
  "needs",
  "need",
  "focuses",
  "handles",
  "builds",
  "make",
]);

/**
 * Id segments that deriveTags() ignores when it has to derive tags from a
 * skill id (i.e. when the skill declares no tags of its own).
 */
const TAG_STOPWORDS = new Set([
  "pro",
  "expert",
  "patterns",
  "pattern",
  "workflow",
  "workflows",
  "templates",
  "template",
  "toolkit",
  "tools",
  "tool",
  "project",
  "projects",
  "guide",
  "management",
  "engineer",
  "architect",
  "developer",
  "specialist",
  "assistant",
  "analysis",
  "review",
  "reviewer",
  "automation",
  "orchestration",
  "scaffold",
  "scaffolding",
  "implementation",
  "strategy",
  "context",
  "feature",
  "features",
  "smart",
  "system",
  "systems",
  "design",
  "development",
  "test",
  "testing",
]);

/**
 * Ordered category rules consumed by detectCategory(): the first rule with a
 * keyword present in a skill's tokens decides the category, so the order of
 * this array is significant.
 *
 * Because of that order, a keyword repeated in a later rule never decides the
 * outcome there ("incident" is claimed by "security" before "infrastructure";
 * "pipeline" by "infrastructure" before "data-ai").
 */
const CATEGORY_RULES = [
  {
    name: "security",
    keywords: [
      "security",
      "sast",
      "compliance",
      "privacy",
      "threat",
      "vulnerability",
      "owasp",
      "pci",
      "gdpr",
      "secrets",
      "risk",
      "malware",
      "forensics",
      "attack",
      "incident",
      "auth",
      "mtls",
      "zero",
      "trust",
    ],
  },
  {
    name: "infrastructure",
    keywords: [
      "kubernetes",
      "k8s",
      "helm",
      "terraform",
      "cloud",
      "network",
      "devops",
      "gitops",
      "prometheus",
      "grafana",
      "observability",
      "monitoring",
      "logging",
      "tracing",
      "deployment",
      "istio",
      "linkerd",
      "service",
      "mesh",
      "slo",
      "sre",
      "oncall",
      "incident",
      "pipeline",
      "cicd",
      "ci",
      "cd",
      "kafka",
    ],
  },
  {
    name: "data-ai",
    keywords: [
      "data",
      "database",
      "db",
      "sql",
      "postgres",
      "mysql",
      "analytics",
      "etl",
      "warehouse",
      "dbt",
      "ml",
      "ai",
      "llm",
      "rag",
      "vector",
      "embedding",
      "spark",
      "airflow",
      "cdc",
      "pipeline",
    ],
  },
  {
    name: "development",
    keywords: [
      "python",
      "javascript",
      "typescript",
      "java",
      "golang",
      "go",
      "rust",
      "csharp",
      "dotnet",
      "php",
      "ruby",
      "node",
      "react",
      "frontend",
      "backend",
      "mobile",
      "ios",
      "android",
      "flutter",
      "fastapi",
      "django",
      "nextjs",
      "vue",
      "api",
    ],
  },
  {
    name: "architecture",
    keywords: [
      "architecture",
      "c4",
      "microservices",
      "event",
      "cqrs",
      "saga",
      "domain",
      "ddd",
      "patterns",
      "decision",
      "adr",
    ],
  },
  {
    name: "testing",
    keywords: ["testing", "tdd", "unit", "e2e", "qa", "test"],
  },
  {
    name: "business",
    keywords: [
      "business",
      "market",
      "sales",
      "finance",
      "startup",
      "legal",
      "hr",
      "product",
      "customer",
      "seo",
      "marketing",
      "kpi",
      "contract",
      "employment",
    ],
  },
  {
    name: "workflow",
    keywords: [
      "workflow",
      "orchestration",
      "conductor",
      "automation",
      "process",
      "collaboration",
    ],
  },
];

/**
 * Bundle definitions consumed by buildBundles(), keyed by bundle name.
 * A skill is placed in a bundle when at least one of the bundle's keywords
 * appears among the skill's tokens; a skill may land in several bundles.
 */
const BUNDLE_RULES = {
  "core-dev": {
    description:
      "Core development skills across languages, frameworks, and backend/frontend fundamentals.",
    keywords: [
      "python",
      "javascript",
      "typescript",
      "go",
      "golang",
      "rust",
      "java",
      "node",
      "frontend",
      "backend",
      "react",
      "fastapi",
      "django",
      "nextjs",
      "api",
      "mobile",
      "ios",
      "android",
      "flutter",
      "php",
      "ruby",
    ],
  },
  "security-core": {
    description: "Security, privacy, and compliance essentials.",
    keywords: [
      "security",
      "sast",
      "compliance",
      "threat",
      "risk",
      "privacy",
      "secrets",
      "owasp",
      "gdpr",
      "pci",
      "vulnerability",
      "auth",
    ],
  },
  "k8s-core": {
    description: "Kubernetes and service mesh essentials.",
    keywords: [
      "kubernetes",
      "k8s",
      "helm",
      "istio",
      "linkerd",
      "service",
      "mesh",
    ],
  },
  "data-core": {
    description: "Data engineering and analytics foundations.",
    keywords: [
      "data",
      "database",
      "sql",
      "dbt",
      "airflow",
      "spark",
      "analytics",
      "etl",
      "warehouse",
      "postgres",
      "mysql",
      "kafka",
    ],
  },
  "ops-core": {
    description: "Operations, observability, and delivery pipelines.",
    keywords: [
      "observability",
      "monitoring",
      "logging",
      "tracing",
      "prometheus",
      "grafana",
      "devops",
      "gitops",
      "deployment",
      "cicd",
      "pipeline",
      "slo",
      "sre",
      "incident",
    ],
  },
};

/**
 * Hand-picked skill ids. buildBundles() keeps the ones that exist in the
 * current catalog and returns them as the `common` list.
 */
const CURATED_COMMON = [
  "bash-pro",
  "python-pro",
  "javascript-pro",
  "typescript-pro",
  "golang-pro",
  "rust-pro",
  "java-pro",
  "frontend-developer",
  "backend-architect",
  "nodejs-backend-patterns",
  "fastapi-pro",
  "api-design-principles",
  "sql-pro",
  "database-architect",
  "kubernetes-architect",
  "terraform-specialist",
  "observability-engineer",
  "security-auditor",
  "sast-configuration",
  "gitops-workflow",
];

/**
 * Lower-cases tokens, de-duplicates them with `unique`, and drops empty
 * entries.
 *
 * `null`/`undefined` entries are skipped and other non-string values are
 * stringified, so one malformed token cannot abort the whole catalog build.
 *
 * @param {Array<*>} tokens - Raw tokens, expected to be strings.
 * @returns {string[]} Lower-cased, de-duplicated, non-empty tokens.
 */
function normalizeTokens(tokens) {
  const lowered = tokens
    .filter((token) => token != null)
    .map((token) => String(token).toLowerCase());
  return unique(lowered).filter(Boolean);
}

/**
 * Determines the tags of a skill.
 *
 * Tags declared on the skill (`skill.tags`, when it is an array) are used
 * as-is after lower-casing. When none are declared, tags are derived from the
 * dash-separated segments of `skill.id`, skipping TAG_STOPWORDS.
 *
 * Declared tags that are `null`/`undefined` are skipped and other non-string
 * values are stringified instead of throwing.
 *
 * @param {{id: string, tags?: Array<*>}} skill - Skill record; only `id` and
 *   `tags` are read here.
 * @returns {string[]} Normalized tags (see normalizeTokens).
 */
function deriveTags(skill) {
  const declared = Array.isArray(skill.tags) ? skill.tags : [];
  let tags = declared
    .filter((tag) => tag != null)
    .map((tag) => String(tag).toLowerCase())
    .filter(Boolean);

  if (tags.length === 0) {
    // No usable declared tags: fall back to the meaningful parts of the id.
    tags = skill.id
      .split("-")
      .map((segment) => segment.toLowerCase())
      .filter((segment) => segment && !TAG_STOPWORDS.has(segment));
  }

  return normalizeTokens(tags);
}

/**
 * Picks the category of a skill from CATEGORY_RULES.
 *
 * The skill's tags plus the tokens of its name and description form the
 * haystack; the first rule (in CATEGORY_RULES order) with any keyword in the
 * haystack wins.
 *
 * @param {{name: string, description: string}} skill - Skill record; only
 *   `name` and `description` are read here.
 * @param {string[]} tags - Tags already derived for the skill.
 * @returns {string} The matching rule name, or "general" when nothing matches.
 */
function detectCategory(skill, tags) {
  const haystack = new Set(
    normalizeTokens([
      ...tags,
      ...tokenize(skill.name),
      ...tokenize(skill.description),
    ]),
  );

  const matchedRule = CATEGORY_RULES.find((rule) =>
    rule.keywords.some((keyword) => haystack.has(keyword)),
  );

  return matchedRule ? matchedRule.name : "general";
}

/**
 * Builds the trigger words of a skill: its tags followed by the tokens of its
 * name and description that are at least two characters long and not in
 * STOPWORDS, de-duplicated and capped at 12 entries.
 *
 * A missing name or description is left out of the text to tokenize, so it
 * can no longer surface as a literal "undefined"/"null" trigger.
 *
 * @param {{name?: string, description?: string}} skill - Skill record; only
 *   `name` and `description` are read here.
 * @param {string[]} tags - Tags already derived for the skill.
 * @returns {string[]} Up to 12 trigger words, tags first.
 */
function buildTriggers(skill, tags) {
  const maxTriggers = 12;
  const text = [skill.name, skill.description]
    .filter((part) => part != null)
    .join(" ");
  const words = tokenize(text).filter(
    (word) => word.length >= 2 && !STOPWORDS.has(word),
  );
  return unique([...tags, ...words]).slice(0, maxTriggers);
}

/**
 * Builds an alias -> skill id map.
 *
 * Two kinds of aliases are produced per skill, in this order:
 *  1. the lower-cased `name`, when it is a string that differs from the id;
 *  2. for long ids (at least 28 characters and at least 4 dash-separated
 *     segments), a short form made of the first two distinct segments plus
 *     the last distinct one.
 *
 * An alias is never registered when it equals an existing skill id or was
 * already claimed by an earlier skill (first come, first served).
 *
 * @param {Array<{id: string, name?: string}>} skills - Catalog skills.
 * @returns {Object<string, string>} Map from alias to skill id.
 */
function buildAliases(skills) {
  const existingIds = new Set(skills.map((skill) => skill.id));
  const aliases = {};
  const used = new Set();

  // Registers `alias` for `id` unless it is empty, redundant, or taken.
  const claim = (alias, id) => {
    if (!alias || alias === id) return;
    if (existingIds.has(alias) || used.has(alias)) return;
    aliases[alias] = id;
    used.add(alias);
  };

  for (const skill of skills) {
    // Non-string names are ignored rather than crashing on toLowerCase().
    if (typeof skill.name === "string" && skill.name !== skill.id) {
      claim(skill.name.toLowerCase(), skill.id);
    }

    const segments = skill.id.split("-").filter(Boolean);
    if (skill.id.length < 28 || segments.length < 4) continue;

    // A Set keeps first-occurrence order, so this is an ordered de-dup.
    const distinct = [...new Set(segments)];
    const shortForm =
      distinct.length > 3
        ? [distinct[0], distinct[1], distinct[distinct.length - 1]]
        : distinct;

    claim(shortForm.join("-"), skill.id);
  }

  return aliases;
}

/**
 * Groups skills into the bundles described by BUNDLE_RULES and selects the
 * curated common skills.
 *
 * A skill's token set is its tags plus the tokens of its name and
 * description. It joins a bundle when any of the bundle's (lower-cased)
 * keywords is in that set. Each bundle's skill ids are sorted.
 *
 * @param {Array<{id: string, name: string, description: string, tags: string[]}>} skills
 *   Catalog skills.
 * @returns {{bundles: Object<string, {description: string, skills: string[]}>, common: string[]}}
 *   The bundles keyed by name, and the CURATED_COMMON ids present in `skills`.
 */
function buildBundles(skills) {
  // Token set per skill id, computed once and reused for every bundle.
  const tokensById = new Map();
  for (const skill of skills) {
    const tokens = normalizeTokens([
      ...skill.tags,
      ...tokenize(skill.name),
      ...tokenize(skill.description),
    ]);
    tokensById.set(skill.id, new Set(tokens));
  }

  const bundles = {};
  for (const [bundleName, rule] of Object.entries(BUNDLE_RULES)) {
    const keywords = rule.keywords.map((keyword) => keyword.toLowerCase());
    const memberIds = skills
      .filter((skill) => {
        const tokenSet = tokensById.get(skill.id) || new Set();
        return keywords.some((keyword) => tokenSet.has(keyword));
      })
      .map((skill) => skill.id);

    bundles[bundleName] = {
      description: rule.description,
      skills: memberIds.sort(),
    };
  }

  const common = CURATED_COMMON.filter((skillId) => tokensById.has(skillId));

  return { bundles, common };
}

/**
 * Shortens `value` to at most `limit` UTF-16 code units, ending with "..."
 * when something was cut.
 *
 * - Falsy values yield "".
 * - When `limit` is below 3 there is no room for the ellipsis, so the value
 *   is hard-cut to `limit` characters (previously the result could be longer
 *   than `limit`).
 * - The cut never leaves half of a surrogate pair at the end.
 *
 * @param {string} [value] - Text to shorten.
 * @param {number} limit - Maximum length of the result.
 * @returns {string} The original text, or its shortened form.
 */
function truncate(value, limit) {
  if (!value || value.length <= limit) return value || "";

  const ellipsisFits = !(limit < 3);
  const keep = ellipsisFits ? limit - 3 : Math.max(0, limit);
  let head = value.slice(0, keep);

  // Drop a trailing high surrogate so the output stays well-formed.
  const lastUnit = head.charCodeAt(head.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    head = head.slice(0, -1);
  }

  return ellipsisFits ? `${head}...` : head;
}

/**
 * Makes text safe for a single Markdown table cell: line breaks (with the
 * whitespace around them) collapse to one space and pipes are escaped, since
 * either would otherwise split the row.
 */
function escapeTableCell(text) {
  return String(text)
    .replace(/\s*[\r\n]+\s*/g, " ")
    .replace(/\|/g, "\\|");
}

/**
 * Renders the catalog as Markdown: a header with the generation timestamp and
 * the total, then one table per category (categories sorted, skills in
 * catalog order) with the columns Skill, Description, Tags and Triggers.
 *
 * Descriptions are shortened to 160 characters with truncate(). Every cell is
 * escaped so that pipes or line breaks in the data cannot break the table.
 *
 * @param {{generatedAt: string, total: number, skills: Array<{id: string,
 *   description: string, category: string, tags: string[], triggers: string[]}>}} catalog
 *   Catalog to render.
 * @returns {string} Markdown document.
 */
function renderCatalogMarkdown(catalog) {
  const lines = [
    "# Skill Catalog",
    "",
    `Generated at: ${catalog.generatedAt}`,
    "",
    `Total skills: ${catalog.total}`,
    "",
  ];

  // Group once instead of re-filtering the whole catalog per category.
  const skillsByCategory = new Map();
  for (const skill of catalog.skills) {
    const bucket = skillsByCategory.get(skill.category);
    if (bucket) {
      bucket.push(skill);
    } else {
      skillsByCategory.set(skill.category, [skill]);
    }
  }

  for (const category of Array.from(skillsByCategory.keys()).sort()) {
    const grouped = skillsByCategory.get(category);
    lines.push(`## ${category} (${grouped.length})`);
    lines.push("");
    lines.push("| Skill | Description | Tags | Triggers |");
    lines.push("| --- | --- | --- | --- |");

    for (const skill of grouped) {
      const description = escapeTableCell(truncate(skill.description, 160));
      const tags = escapeTableCell(skill.tags.join(", "));
      const triggers = escapeTableCell(skill.triggers.join(", "));
      lines.push(
        `| \`${skill.id}\` | ${description} | ${tags} | ${triggers} |`,
      );
    }

    lines.push("");
  }

  return lines.join("\n");
}

/**
 * Resolves the `generatedAt` timestamp: SOURCE_DATE_EPOCH (Unix seconds) when
 * set, otherwise a fixed date so that repeated runs produce identical files.
 *
 * @returns {string} ISO-8601 timestamp.
 * @throws {RangeError} When SOURCE_DATE_EPOCH is set but is not a usable
 *   timestamp.
 */
function resolveGeneratedAt() {
  const epoch = process.env.SOURCE_DATE_EPOCH;
  if (!epoch) return "2026-02-08T00:00:00.000Z";

  const date = new Date(Number(epoch) * 1000);
  if (Number.isNaN(date.getTime())) {
    throw new RangeError(
      `SOURCE_DATE_EPOCH must be a Unix timestamp in seconds, got "${epoch}"`,
    );
  }
  return date.toISOString();
}

/**
 * Reads every skill under SKILLS_DIR, derives tags, category and triggers for
 * each, and writes the generated artifacts:
 *
 *  - data/catalog.json  (the catalog itself)
 *  - CATALOG.md         (Markdown rendering of the catalog)
 *  - data/bundles.json  (bundles and curated common skills)
 *  - data/aliases.json  (alias -> skill id map)
 *
 * Skills are sorted by id. The data/ directory is created when missing.
 *
 * @returns {{generatedAt: string, total: number, skills: Array<Object>}} The
 *   catalog that was written.
 * @throws {RangeError} When SOURCE_DATE_EPOCH is set to an invalid value.
 */
function buildCatalog() {
  const skillRelPaths = listSkillIdsRecursive(SKILLS_DIR);
  const skills = skillRelPaths.map((relPath) => readSkill(SKILLS_DIR, relPath));

  const catalogSkills = skills.map((skill) => {
    const tags = deriveTags(skill);
    return {
      id: skill.id,
      name: skill.name,
      description: skill.description,
      category: detectCategory(skill, tags),
      tags,
      triggers: buildTriggers(skill, tags),
      path: path.relative(ROOT, skill.path),
    };
  });

  // Code-unit comparison (not locale-aware) keeps the order stable everywhere.
  catalogSkills.sort((a, b) => (a.id < b.id ? -1 : a.id > b.id ? 1 : 0));

  const catalog = {
    generatedAt: resolveGeneratedAt(),
    total: catalogSkills.length,
    skills: catalogSkills,
  };

  const aliases = buildAliases(catalog.skills);
  const bundleData = buildBundles(catalog.skills);

  const dataDir = path.join(ROOT, "data");
  const catalogPath = path.join(dataDir, "catalog.json");
  const catalogMarkdownPath = path.join(ROOT, "CATALOG.md");
  const bundlesPath = path.join(dataDir, "bundles.json");
  const aliasesPath = path.join(dataDir, "aliases.json");

  // writeFileSync does not create parent directories.
  fs.mkdirSync(dataDir, { recursive: true });

  fs.writeFileSync(catalogPath, JSON.stringify(catalog, null, 2));
  fs.writeFileSync(catalogMarkdownPath, renderCatalogMarkdown(catalog));
  fs.writeFileSync(
    bundlesPath,
    JSON.stringify(
      { generatedAt: catalog.generatedAt, ...bundleData },
      null,
      2,
    ),
  );
  fs.writeFileSync(
    aliasesPath,
    JSON.stringify({ generatedAt: catalog.generatedAt, aliases }, null, 2),
  );

  return catalog;
}

if (require.main === module) {
|
|
const catalog = buildCatalog();
|
|
console.log(`Generated catalog for ${catalog.total} skills.`);
|
|
}
|
|
|
|
module.exports = {
|
|
buildCatalog,
|
|
};
|