#!/usr/bin/env python3
"""
schema_validator.py — Extracts and validates JSON-LD structured data from HTML.

Usage:
    python3 schema_validator.py [file.html]
    cat page.html | python3 schema_validator.py

If no file is provided, runs on embedded sample HTML for demonstration.

Output: Human-readable validation report + JSON summary.
Scoring: 0-100 per schema block based on required/recommended field coverage.
"""

import json
import sys
import re
import select
from html.parser import HTMLParser
from typing import List, Dict, Any, Optional

# ─── Required and recommended fields per schema type ─────────────────────────

# Maps a schema type name to its field rules. Every entry carries exactly two
# keys: "required" and "recommended", each a list of property names (the
# "recommended" list may be empty).
SCHEMA_RULES: Dict[str, Dict[str, List[str]]] = {
    "Article": {
        "required": ["headline", "image", "datePublished", "author"],
        "recommended": [
            "dateModified",
            "publisher",
            "description",
            "url",
            "mainEntityOfPage",
        ],
    },
    "BlogPosting": {
        "required": ["headline", "image", "datePublished", "author"],
        "recommended": [
            "dateModified",
            "publisher",
            "description",
            "url",
            "mainEntityOfPage",
        ],
    },
    "NewsArticle": {
        "required": ["headline", "image", "datePublished", "author"],
        "recommended": ["dateModified", "publisher", "description", "url"],
    },
    "HowTo": {
        "required": ["name", "step"],
        "recommended": [
            "description",
            "image",
            "totalTime",
            "tool",
            "supply",
            "estimatedCost",
        ],
    },
    "FAQPage": {
        "required": ["mainEntity"],
        "recommended": [],
    },
    "Product": {
        "required": ["name", "offers"],
        "recommended": [
            "description",
            "image",
            "sku",
            "brand",
            "aggregateRating",
        ],
    },
    "Organization": {
        "required": ["name", "url"],
        "recommended": [
            "logo",
            "sameAs",
            "contactPoint",
            "description",
            "foundingDate",
        ],
    },
    "LocalBusiness": {
        "required": ["name", "address"],
        "recommended": [
            "telephone",
            "openingHoursSpecification",
            "geo",
            "priceRange",
            "image",
            "url",
        ],
    },
    "BreadcrumbList": {
        "required": ["itemListElement"],
        "recommended": [],
    },
    "VideoObject": {
        "required": ["name", "description", "thumbnailUrl", "uploadDate"],
        "recommended": [
            "duration",
            "contentUrl",
            "embedUrl",
            "interactionStatistic",
            "hasPart",
        ],
    },
    "WebSite": {
        "required": ["url"],
        "recommended": ["name", "potentialAction"],
    },
    "Event": {
        "required": ["name", "startDate", "location"],
        "recommended": [
            "endDate",
            "description",
            "image",
            "organizer",
            "offers",
        ],
    },
    "Recipe": {
        "required": ["name", "image", "author", "datePublished"],
        "recommended": [
            "description",
            "cookTime",
            "prepTime",
            "totalTime",
            "recipeYield",
            "recipeIngredient",
            "recipeInstructions",
            "aggregateRating",
        ],
    },
}

# Names of every schema type that has an entry in SCHEMA_RULES.
KNOWN_TYPES = set(SCHEMA_RULES)

# ─── HTML Parser to extract JSON-LD blocks ───────────────────────────────────
# NOTE(review): `class JSONLDExtractor(HTMLParser)` opens at this point, but its
# docstring and body are cut off in the visible source, so the definition is
# not reproduced here — restore it from the complete file.
Cold email works when it sounds human...