New skills covering 10 categories: **Security & Audit**: 007 (STRIDE/PASTA/OWASP), cred-omega (secrets management) **AI Personas**: Karpathy, Hinton, Sutskever, LeCun (4 sub-skills), Altman, Musk, Gates, Jobs, Buffett **Multi-agent Orchestration**: agent-orchestrator, task-intelligence, multi-advisor **Code Analysis**: matematico-tao (Terence Tao-inspired mathematical code analysis) **Social & Messaging**: Instagram Graph API, Telegram Bot, WhatsApp Cloud API, social-orchestrator **Image Generation**: AI Studio (Gemini), Stability AI, ComfyUI Gateway, image-studio router **Brazilian Domain**: 6 auction specialist modules, 2 legal advisors, auctioneers data scraper **Product & Growth**: design, invention, monetization, analytics, growth engine **DevOps & LLM Ops**: Docker/CI-CD/AWS, RAG/embeddings/fine-tuning **Skill Governance**: installer, sentinel auditor, context management Each skill includes: - Standardized YAML frontmatter (name, description, risk, source, tags, tools) - Structured sections (Overview, When to Use, How it Works, Best Practices) - Python scripts and reference documentation where applicable - Cross-platform compatibility (Claude Code, Antigravity, Cursor, Gemini CLI, Codex CLI) Co-authored-by: ProgramadorBrasil <214873561+ProgramadorBrasil@users.noreply.github.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
212 lines
8.1 KiB
Python
212 lines
8.1 KiB
Python
"""
|
||
Scraper JUCEMA — Junta Comercial do Estado do Maranhao
|
||
URL: https://portal.jucema.ma.gov.br/ (React SPA - dados via REST API)
|
||
Metodo: httpx GET para API REST do CMS
|
||
Mecanismo real descoberto em 2026-02-25:
|
||
- Portal e React SPA (Create React App), sem conteudo server-side
|
||
- API base: https://api.jucema.ma.gov.br/api/public/
|
||
- Leiloeiros estao em: GET /api/public/posts/11
|
||
(post_id 11 = "Leiloeiro" no menu /api/public/menus?with=menus.menus)
|
||
- Conteudo e HTML dentro do campo data.content
|
||
- Ultima atualizacao: 2025-10-20 18:29:59
|
||
Total: 53 leiloeiros (39 Regular + 12 Irregular + 2 Cancelada)
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
import re
|
||
from typing import List
|
||
|
||
import httpx
|
||
from bs4 import BeautifulSoup
|
||
|
||
from .base_scraper import AbstractJuntaScraper, Leiloeiro
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
RE_MATRICULA = re.compile(r"[Mm]atr[íi]cula\s+N[º°o]?\s*(\d+(?:/\d+)?)\s*[–\-]\s*[Ee]m:\s*(\d{2}/\d{2}/\d{4})")
|
||
RE_SITUACAO = re.compile(r"SITUA[ÇC][ÃA]O:\s*(.+)", re.IGNORECASE)
|
||
RE_CONTATO = re.compile(r"[Cc]ontato:\s*(.+)")
|
||
RE_EMAIL = re.compile(r"[Ee]-?[Mm]ail:\s*(.+)")
|
||
RE_ENDERECO = re.compile(r"[Ee]ndere[çc]o:\s*(.+)")
|
||
|
||
|
||
class JucemaScraper(AbstractJuntaScraper):
|
||
estado = "MA"
|
||
junta = "JUCEMA"
|
||
url = "https://portal.jucema.ma.gov.br/"
|
||
_API_URL = "https://api.jucema.ma.gov.br/api/public/posts/11"
|
||
|
||
async def _fetch_api(self) -> List[dict]:
|
||
"""
|
||
Busca dados do post de leiloeiros via API REST do CMS.
|
||
GET /api/public/posts/11 retorna JSON com campo 'content' em HTML.
|
||
"""
|
||
try:
|
||
async with httpx.AsyncClient(
|
||
headers={**self.HEADERS, "Accept": "application/json"},
|
||
verify=True,
|
||
follow_redirects=True,
|
||
timeout=30.0,
|
||
) as client:
|
||
resp = await client.get(self._API_URL)
|
||
if resp.status_code >= 400:
|
||
logger.warning("[MA] API post/11 retornou HTTP %d", resp.status_code)
|
||
return []
|
||
|
||
data = resp.json()
|
||
# O campo pode estar em data.content ou diretamente em content
|
||
# API retorna { success: true, data: { content: "..." }, message: "..." }
|
||
inner = data.get("data") or {}
|
||
content_html = (
|
||
inner.get("content") or
|
||
data.get("content") or
|
||
""
|
||
)
|
||
if not content_html:
|
||
logger.warning("[MA] Campo 'content' vazio na API")
|
||
return []
|
||
|
||
logger.info("[MA] API retornou %d bytes de HTML", len(content_html))
|
||
soup = BeautifulSoup(content_html, "lxml")
|
||
return self._parse_cms_content(soup)
|
||
|
||
except Exception as exc:
|
||
logger.error("[MA] Erro na API: %s", exc)
|
||
return []
|
||
|
||
def _parse_cms_content(self, soup) -> List[dict]:
|
||
"""
|
||
Parseia conteudo HTML do CMS da JUCEMA.
|
||
Formato dos paragrafos:
|
||
<p>NOME COMPLETO</p>
|
||
<p>SITUACAO: REGULAR</p>
|
||
<p>Matricula No 010/1993 - Em: 29/04/1993</p>
|
||
<p>Endereco: ...</p>
|
||
<p>Contato: ...</p>
|
||
<p>E-mail: ...</p>
|
||
"""
|
||
records = []
|
||
paragraphs = [self.clean(p.get_text()) for p in soup.find_all("p") if self.clean(p.get_text())]
|
||
|
||
# Tambem tentar com outros elementos se nao houver <p>
|
||
if len(paragraphs) < 3:
|
||
paragraphs = [
|
||
self.clean(el.get_text())
|
||
for el in soup.find_all(["p", "li", "div", "span"])
|
||
if self.clean(el.get_text()) and len(self.clean(el.get_text())) > 3
|
||
]
|
||
|
||
current: dict | None = None
|
||
|
||
for text in paragraphs:
|
||
if not text:
|
||
continue
|
||
|
||
m_matr = RE_MATRICULA.search(text)
|
||
m_sit = RE_SITUACAO.search(text)
|
||
m_cont = RE_CONTATO.search(text)
|
||
m_email = RE_EMAIL.search(text)
|
||
m_end = RE_ENDERECO.search(text)
|
||
|
||
if m_matr:
|
||
if current:
|
||
current["matricula"] = m_matr.group(1)
|
||
current["data_registro"] = m_matr.group(2)
|
||
continue
|
||
if m_sit:
|
||
if current:
|
||
current["situacao"] = self.clean(m_sit.group(1))
|
||
continue
|
||
if m_cont:
|
||
if current:
|
||
current.setdefault("telefone", self.clean(m_cont.group(1)))
|
||
continue
|
||
if m_email:
|
||
if current:
|
||
current["email"] = self.clean(m_email.group(1))
|
||
continue
|
||
if m_end:
|
||
if current:
|
||
current["endereco"] = self.clean(m_end.group(1))
|
||
m_cidade = re.search(r"([A-ZÁÉÍÓÚÀÃÕÇ][A-Za-záéíóúàãõç\s]+)/MA", text)
|
||
if m_cidade:
|
||
current["municipio"] = m_cidade.group(1).strip()
|
||
continue
|
||
|
||
# Detectar inicio de nova entrada: nome em maiusculas
|
||
# Excluir titulos de secao como "RELACAO DOS LEILOEIROS", "CEP:", linhas curtas
|
||
is_nome = (
|
||
len(text) > 8 and
|
||
not re.match(r"(SITUA|Matr|MATR|[Ee]ndere|[Cc]ontato|[Ee]-?mail|www\.|http|^\d|Site:|CEP:|RELA[ÇC])", text) and
|
||
not re.search(r"^(RELA[ÇC][ÃA]O|LISTA|CADASTRO|JUNTA|COMERCIAL|LEILOEIROS\s*$)", text, re.IGNORECASE) and
|
||
sum(1 for c in text if c.isupper()) > len(text) * 0.3 and
|
||
" " in text and
|
||
len(text.split()) >= 2 and
|
||
len(text) < 120
|
||
)
|
||
|
||
if is_nome:
|
||
if current and current.get("nome"):
|
||
records.append(current)
|
||
current = {"nome": text, "municipio": "Sao Luis"}
|
||
elif current and text and re.search(r"\(\d{2}\)", text):
|
||
current.setdefault("telefone", text)
|
||
|
||
if current and current.get("nome"):
|
||
records.append(current)
|
||
|
||
return records
|
||
|
||
async def fetch_insecure(self, url: str):
|
||
"""Fetch com verify=False para sites com SSL problematico."""
|
||
try:
|
||
async with httpx.AsyncClient(
|
||
headers=self.HEADERS,
|
||
timeout=30.0,
|
||
follow_redirects=True,
|
||
verify=False,
|
||
) as client:
|
||
resp = await client.get(url)
|
||
if resp.status_code < 400:
|
||
return BeautifulSoup(resp.text, "lxml")
|
||
except Exception:
|
||
pass
|
||
return None
|
||
|
||
async def parse_leiloeiros(self) -> List[Leiloeiro]:
|
||
# Estrategia 1: API REST (direto ao dado, sem renderizacao JS)
|
||
records = await self._fetch_api()
|
||
|
||
if not records:
|
||
# Estrategia 2: Playwright para renderizar o SPA React
|
||
logger.info("[MA] API falhou, tentando Playwright no SPA")
|
||
for spa_url in [
|
||
"https://portal.jucema.ma.gov.br/leiloeiro",
|
||
"https://portal.jucema.ma.gov.br/leiloeiros",
|
||
]:
|
||
soup = await self.fetch_page_js(url=spa_url, wait_ms=5000)
|
||
if soup:
|
||
records = self._parse_cms_content(soup)
|
||
if records:
|
||
break
|
||
|
||
if not records:
|
||
# Estrategia 3: URLs alternativas com httpx
|
||
logger.info("[MA] Tentando URLs alternativas")
|
||
for url in [
|
||
"http://www.jucema.ma.gov.br/leiloeiros",
|
||
"https://www.jucema.ma.gov.br/leiloeiros",
|
||
"http://portal.jucema.ma.gov.br/pagina/11",
|
||
]:
|
||
soup = await self.fetch_insecure(url)
|
||
if soup:
|
||
text = soup.get_text()
|
||
if any(kw in text.lower() for kw in ["leiloeiro", "matr", "nome"]):
|
||
records = self._parse_cms_content(soup)
|
||
if records:
|
||
break
|
||
|
||
logger.info("[MA] Total de registros encontrados: %d", len(records))
|
||
return [self.make_leiloeiro(**r) for r in records if r.get("nome")]
|