Files

330 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Skill Matching Algorithm for Agent Orchestrator.
Scores and ranks skills against a user query to determine
which agents are relevant for the current request.
Scoring:
- Skill name appears in query: +15
- Exact trigger keyword match: +10 per keyword
- Capability category match: +5 per category
- Description word overlap: +1 per shared word of 3+ letters (capped at 10)
- Project assignment boost: +20 if skill is assigned to active project
Usage:
python match_skills.py "raspar dados de um site"
python match_skills.py "coletar precos e enviar por whatsapp"
python match_skills.py --project myproject "query here"
"""
import json
import sys
import os
import re
import subprocess
from pathlib import Path
# ── Configuration ──────────────────────────────────────────────────────────
# Resolve paths relative to this script's location
# Directory that contains this script file.
_SCRIPT_DIR = Path(__file__).resolve().parent
# Parent of the script directory.
ORCHESTRATOR_DIR = _SCRIPT_DIR.parent
# Parent of ORCHESTRATOR_DIR; presumably the folder holding all skills
# (not referenced by the code visible in this file — confirm before removing).
SKILLS_ROOT = ORCHESTRATOR_DIR.parent
# Folder with the JSON data files read below.
DATA_DIR = ORCHESTRATOR_DIR / "data"
# Skill registry read by load_registry(); expected to hold a "skills" list.
REGISTRY_PATH = DATA_DIR / "registry.json"
# Project assignments read by load_projects(); expected to hold a "projects" list.
PROJECTS_PATH = DATA_DIR / "projects.json"
# Script that ensure_registry() runs when registry.json is missing.
SCAN_SCRIPT = _SCRIPT_DIR / "scan_registry.py"
# Capability keywords for query -> category matching (PT + EN)
#
# Maps a capability category id to the keywords that signal it in a query.
# Conventions used by the entries below:
#   * every keyword is lowercase and written without diacritics
#     ("leilao", "acao", ...);
#   * a keyword containing a space is a phrase and is matched as a substring
#     of the lowercased query; a keyword without a space is matched against
#     whole query words (see query_to_capabilities);
#   * the same keyword may appear under several categories (e.g. "publicar",
#     "leiloeiro"), in which case the query maps to each of them.
CAPABILITY_KEYWORDS = {
    "data-extraction": [
        "scrape", "extract", "crawl", "parse", "harvest", "collect", "data",
        "raspar", "extrair", "coletar", "dados", "tabela", "table", "csv",
        "web data", "pull info", "get data",
    ],
    "messaging": [
        "whatsapp", "message", "send", "chat", "notify", "notification", "sms",
        "mensagem", "enviar", "notificar", "notificacao", "atendimento",
        "comunicar", "avisar",
    ],
    "social-media": [
        "instagram", "facebook", "twitter", "post", "stories", "reels",
        "social", "feed", "follower", "publicar", "rede social", "engajamento",
    ],
    "government-data": [
        "junta", "leiloeiro", "cadastro", "governo", "comercial", "tribunal",
        "diario oficial", "certidao", "registro", "uf", "estado",
    ],
    "web-automation": [
        "browser", "selenium", "playwright", "automate", "click", "fill form",
        "navegador", "automatizar", "automacao", "preencher",
    ],
    "api-integration": [
        "api", "endpoint", "webhook", "rest", "graph", "oauth", "token",
        "integracao", "integrar", "conectar",
    ],
    "analytics": [
        "insight", "analytics", "metrics", "dashboard", "report", "stats",
        "relatorio", "metricas", "analise", "estatistica",
    ],
    "content-management": [
        "publish", "schedule", "template", "content", "media", "upload",
        "publicar", "agendar", "conteudo", "midia",
    ],
    # Brazilian legal vocabulary (areas of law, procedural terms, courts).
    "legal": [
        "advogado", "direito", "juridico", "lei", "processo",
        "acao", "peticao", "recurso", "sentenca", "juiz",
        "divorcio", "guarda", "alimentos", "pensao", "alimenticia", "inventario", "heranca", "partilha",
        "acidente de trabalho", "acidente",
        "familia", "criminal", "penal", "crime", "feminicidio", "maria da penha",
        "violencia domestica", "medida protetiva", "stalking",
        "danos morais", "responsabilidade civil", "indenizacao", "dano",
        "consumidor", "cdc", "plano de saude",
        "trabalhista", "clt", "rescisao", "fgts", "horas extras",
        "previdenciario", "aposentadoria", "aposentar", "inss",
        "imobiliario", "usucapiao", "despejo", "inquilinato",
        "alienacao fiduciaria", "bem de familia",
        "tributario", "imposto", "icms", "execucao fiscal",
        "administrativo", "licitacao", "improbidade", "mandado de seguranca",
        "empresarial", "societario", "falencia", "recuperacao judicial",
        "empresa", "ltda", "cnpj", "mei", "eireli", "contrato social",
        "contrato", "clausula", "contestacao", "apelacao", "agravo",
        "habeas corpus", "mandado", "liminar", "tutela",
        "cpc", "stj", "stf", "sumula", "jurisprudencia",
        "oab", "honorarios", "custas",
    ],
    # Auction (leilao) vocabulary.
    "auction": [
        "leilao", "leilao judicial", "leilao extrajudicial", "hasta publica",
        "arrematacao", "arrematar", "arrematante", "lance", "desagio",
        "edital leilao", "penhora", "adjudicacao", "praca",
        "imissao na posse", "carta arrematacao", "vil preco",
        "avaliacao imovel", "laudo", "perito", "matricula",
        "leiloeiro", "comissao leiloeiro",
    ],
    "security": [
        "seguranca", "security", "owasp", "vulnerability", "incident",
        "pentest", "firewall", "malware", "phishing", "cve",
        "autenticacao", "criptografia", "encryption",
    ],
    "image-generation": [
        "imagem", "image", "gerar imagem", "generate image",
        "stable diffusion", "comfyui", "midjourney", "dall-e",
        "foto", "ilustracao", "arte", "design",
    ],
    "monitoring": [
        "monitor", "monitorar", "health", "status",
        "audit", "auditoria", "sentinel", "check",
    ],
    "context-management": [
        "contexto", "context", "sessao", "session", "compactacao", "compaction",
        "comprimir", "compress", "snapshot", "checkpoint", "briefing",
        "continuidade", "continuity", "preservar", "preserve",
        "memoria", "memory", "resumo", "summary",
        "salvar estado", "save state", "context window", "janela de contexto",
        "perda de dados", "data loss", "backup",
    ],
}
# ── Functions ──────────────────────────────────────────────────────────────
def ensure_registry():
    """Run the registry scan script when registry.json does not exist yet.

    This is best-effort: the function never raises on a failed scan, because
    load_registry() already copes with a registry that is still missing.
    Failures are reported on stderr (stdout is reserved for the JSON result
    printed by the CLI) instead of being silently discarded.
    """
    if REGISTRY_PATH.exists():
        return

    if not SCAN_SCRIPT.exists():
        print(
            f"warning: registry scan script not found: {SCAN_SCRIPT}",
            file=sys.stderr,
        )
        return

    try:
        completed = subprocess.run(
            [sys.executable, str(SCAN_SCRIPT)],
            capture_output=True, text=True
        )
    except OSError as exc:
        # The interpreter could not be launched at all.
        print(f"warning: could not run registry scan: {exc}", file=sys.stderr)
        return

    if completed.returncode != 0:
        detail = (completed.stderr or "").strip()
        message = f"warning: registry scan exited with code {completed.returncode}"
        if detail:
            message += f": {detail}"
        print(message, file=sys.stderr)
def load_registry() -> list[dict]:
    """Load the skill entries from registry.json.

    Triggers a registry scan first when the file is missing.

    Returns:
        The list stored under the top-level "skills" key, restricted to dict
        entries. An empty list is returned when the file is missing,
        unreadable, not valid JSON, or does not have the expected shape, so
        callers can always iterate the result and treat each item as a dict.
    """
    ensure_registry()
    if not REGISTRY_PATH.exists():
        return []

    try:
        data = json.loads(REGISTRY_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return []

    skills = data.get("skills") if isinstance(data, dict) else None
    if not isinstance(skills, list):
        # e.g. {"skills": null} or a top-level JSON array.
        return []
    return [entry for entry in skills if isinstance(entry, dict)]
def load_projects() -> dict:
    """Load project assignments from projects.json.

    Returns:
        The parsed JSON object. When the file is missing, unreadable, not
        valid JSON, or its top level is not a JSON object, the fallback
        ``{"projects": []}`` is returned so callers can always use ``.get``.
    """
    if not PROJECTS_PATH.exists():
        return {"projects": []}

    try:
        data = json.loads(PROJECTS_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return {"projects": []}

    if not isinstance(data, dict):
        # A top-level array/string/number would break callers using .get().
        return {"projects": []}
    return data
def get_project_skills(project_name: str) -> set:
    """Return the set of skill names assigned to a project.

    The project is looked up by name, case-insensitively; the first matching
    entry wins.

    Args:
        project_name: Name of the project to look up.

    Returns:
        The skill names listed for the project, or an empty set when the
        project is unknown or its data is malformed (entry is not an object,
        "name" is not a string, "skills" is not a list).
    """
    projects = load_projects()
    entries = projects.get("projects") if isinstance(projects, dict) else None
    if not isinstance(entries, list):
        return set()

    wanted = project_name.lower()
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name")
        # "name": null (or a non-string) must not raise on .lower().
        if not isinstance(name, str) or name.lower() != wanted:
            continue
        skills = entry.get("skills")
        if not isinstance(skills, (list, tuple, set)):
            return set()
        return {skill for skill in skills if isinstance(skill, str)}
    return set()
def query_to_capabilities(query: str, keywords: dict = None) -> list[str]:
    """Map a query to the capability categories whose keywords it mentions.

    Matching rules, applied to the lowercased, accent-folded query:
      * a keyword containing a space is a phrase and matches as a substring;
      * any other keyword must appear as a whole token, i.e. not glued to a
        neighbouring letter or digit. This also covers tokens that contain
        non-letters such as "dall-e", which a letters-only word split can
        never produce.

    Diacritics are stripped from both the query and the keywords, so
    "leilão" matches the keyword "leilao".

    Args:
        query: Free-text user request.
        keywords: Optional mapping of category -> list of keywords. Defaults
            to the module-level CAPABILITY_KEYWORDS table.

    Returns:
        Matching category ids, in the iteration order of the mapping, each
        listed at most once.
    """
    import unicodedata  # local import: only this function needs it

    def _fold(text: str) -> str:
        # Lowercase, then drop combining marks left by canonical decomposition.
        decomposed = unicodedata.normalize("NFD", text.lower())
        return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    if keywords is None:
        keywords = CAPABILITY_KEYWORDS

    q_folded = _fold(query)
    q_words = set(re.findall(r'[a-zA-ZÀ-ÿ]+', q_folded))

    def _mentions(keyword: str) -> bool:
        kw = _fold(keyword)
        if not kw.strip():
            return False
        if " " in kw:
            return kw in q_folded
        if kw in q_words:
            return True
        # Fallback for tokens the letters-only split cannot yield
        # (hyphens, digits): require a non-alphanumeric boundary on each side.
        bounded = r'(?<![0-9a-zA-ZÀ-ÿ])' + re.escape(kw) + r'(?![0-9a-zA-ZÀ-ÿ])'
        return re.search(bounded, q_folded) is not None

    return [
        cap
        for cap, cap_keywords in keywords.items()
        if any(_mentions(kw) for kw in cap_keywords)
    ]
def normalize(text: str) -> set[str]:
    """Return the distinct lowercase words of at least three letters in *text*."""
    lowered = text.lower()
    return {found.group(0) for found in re.finditer(r'[a-zA-ZÀ-ÿ]{3,}', lowered)}
def score_skill(skill: dict, query: str, project_skills: set = None) -> dict:
    """
    Score a skill's relevance to a query.

    Scoring:
      1. +15 when the skill name (as-is, or with hyphens read as spaces)
         occurs in the query. Skills without a name never get this bonus.
      2. +10 per trigger found in the query. Triggers containing a space
         match as substrings; other triggers must appear as a whole token
         (not glued to a neighbouring letter or digit), so triggers such as
         "dall-e" or "s3" can match too.
      3. +5 per skill capability that the query maps to.
      4. +1 per word shared by query and description (capped at 10).
      5. +20 when the skill is in ``project_skills``.

    Args:
        skill: Registry entry; the keys read are "name", "description",
            "triggers", "capabilities", "location", "skill_md" and "status".
            Missing keys and explicit nulls are treated as empty.
        query: Free-text user request.
        project_skills: Names of skills assigned to the active project.

    Returns:
        Dict with "name", "score", "reasons", "location", "skill_md",
        "capabilities" and "status".
    """
    q_lower = query.lower()
    score = 0
    reasons = []

    # `or` fallbacks: registry JSON may carry explicit nulls for these keys.
    name = skill.get("name") or ""
    description = skill.get("description") or ""
    triggers = skill.get("triggers") or []
    capabilities = skill.get("capabilities") or []

    # 1. Skill name in query (+15)
    # An empty name is a substring of every string, so it must be excluded.
    name_lower = name.lower()
    if name_lower and (
        name_lower in q_lower or name_lower.replace("-", " ") in q_lower
    ):
        score += 15
        reasons.append(f"name:{name}")

    # 2. Trigger keyword matches (+10 each) - word boundary matching
    q_words = set(re.findall(r'[a-zA-ZÀ-ÿ]+', q_lower))
    for trigger in triggers:
        if not isinstance(trigger, str):
            continue
        trigger_lower = trigger.lower()
        if not trigger_lower.strip():
            continue  # blank triggers would match almost any query
        if " " in trigger_lower:
            # Multi-word triggers: substring match.
            hit = trigger_lower in q_lower
        elif trigger_lower in q_words:
            # Single-word triggers: exact word match.
            hit = True
        else:
            # Tokens with digits/hyphens never show up in q_words; look for
            # them directly, bounded by non-alphanumeric characters.
            bounded = (
                r'(?<![0-9a-zA-ZÀ-ÿ])'
                + re.escape(trigger_lower)
                + r'(?![0-9a-zA-ZÀ-ÿ])'
            )
            hit = re.search(bounded, q_lower) is not None
        if hit:
            score += 10
            reasons.append(f"trigger:{trigger}")

    # 3. Capability category match (+5 each)
    # Skip the keyword scan entirely for skills that declare no capabilities.
    if capabilities:
        query_caps = query_to_capabilities(query)
        for cap in capabilities:
            if cap in query_caps:
                score += 5
                reasons.append(f"capability:{cap}")

    # 4. Description word overlap (+1 each, max 10)
    # An empty description cannot overlap with anything.
    if description:
        overlap = normalize(query) & normalize(description)
        overlap_score = min(len(overlap), 10)
        if overlap_score > 0:
            score += overlap_score
            reasons.append(f"word_overlap:{overlap_score}")

    # 5. Project assignment boost (+20)
    if project_skills and name in project_skills:
        score += 20
        reasons.append("project_boost")

    return {
        "name": name,
        "score": score,
        "reasons": reasons,
        "location": skill.get("location", ""),
        "skill_md": skill.get("skill_md", ""),
        "capabilities": capabilities,
        "status": skill.get("status", "unknown"),
    }
def match(query: str, project: str = None, top_n: int = 5, threshold: int = 5) -> list[dict]:
    """
    Rank the registered skills against a query.

    Every registry entry is scored; entries scoring below ``threshold`` are
    dropped, the rest are ordered from highest to lowest score and the first
    ``top_n`` are returned. An empty registry yields an empty list.
    """
    registry = load_registry()
    if not registry:
        return []

    # Skills assigned to the active project receive a boost while scoring.
    if project:
        boosted = get_project_skills(project)
    else:
        boosted = set()

    scored = (score_skill(entry, query, boosted) for entry in registry)
    kept = [item for item in scored if item["score"] >= threshold]
    ranked = sorted(kept, key=lambda item: item["score"], reverse=True)
    return ranked[:top_n]
# ── CLI Entry Point ────────────────────────────────────────────────────────
def main():
    """CLI entry point: parse arguments, run the match, print JSON to stdout.

    Arguments:
        --project NAME   optional; boosts skills assigned to that project.
        everything else  joined with spaces to form the query.

    Exits with status 1 (after printing a JSON error object) when no query is
    given or when ``--project`` is the last argument and has no value.
    """
    project = None
    query_parts = []

    remaining = iter(sys.argv[1:])
    for arg in remaining:
        if arg != "--project":
            query_parts.append(arg)
            continue
        project = next(remaining, None)
        if project is None:
            # A dangling flag used to be swallowed into the query text.
            print(json.dumps({
                "error": "--project requires a value",
                "usage": 'python match_skills.py --project myproject "your query here"'
            }, indent=2))
            sys.exit(1)

    # Strip so that a whitespace-only query is reported as missing.
    query = " ".join(query_parts).strip()
    if not query:
        print(json.dumps({
            "error": "No query provided",
            "usage": 'python match_skills.py "your query here"'
        }, indent=2))
        sys.exit(1)

    results = match(query, project=project)
    output = {
        "query": query,
        "project": project,
        "matched": len(results),
        "skills": results,
    }

    # Tell the caller how to proceed based on how many skills matched.
    if not results:
        output["recommendation"] = "No skills matched. Operate without skills or suggest creating a new one."
    elif len(results) == 1:
        output["recommendation"] = f"Single skill match: use '{results[0]['name']}' directly."
        output["action"] = "load_skill"
    else:
        output["recommendation"] = f"Multiple skills matched ({len(results)}). Use orchestration."
        output["action"] = "orchestrate"

    print(json.dumps(output, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()