#!/usr/bin/env python3
"""
schema_validator.py — Extracts and validates JSON-LD structured data from HTML.

Usage:
    python3 schema_validator.py [file.html]
    cat page.html | python3 schema_validator.py -

Pass "-" as the file argument to read the HTML from stdin.
If no file is provided, runs on embedded sample HTML for demonstration.

Output: Human-readable validation report + JSON summary.
Scoring: 0-100 per schema block based on required/recommended field coverage.
"""

import json
import sys
# NOTE(review): `re`, `select`, `Any` and `Optional` are not referenced in the
# portion of the file visible here — confirm they are used elsewhere before
# pruning.
import re
import select
from html.parser import HTMLParser
from typing import List, Dict, Any, Optional

# ─── Required and recommended fields per schema type ─────────────────────────

# Maps a schema type name to two lists of property names:
#   "required"    — properties listed as mandatory for that type
#   "recommended" — properties listed as optional-but-encouraged for that type
# An empty "recommended" list means no optional properties are tracked for
# that type.
SCHEMA_RULES: Dict[str, Dict[str, List[str]]] = {
    "Article": {
        "required": ["headline", "image", "datePublished", "author"],
        "recommended": ["dateModified", "publisher", "description", "url", "mainEntityOfPage"],
    },
    # Same field lists as "Article".
    "BlogPosting": {
        "required": ["headline", "image", "datePublished", "author"],
        "recommended": ["dateModified", "publisher", "description", "url", "mainEntityOfPage"],
    },
    # Same as "Article" except "mainEntityOfPage" is not in the recommended list.
    "NewsArticle": {
        "required": ["headline", "image", "datePublished", "author"],
        "recommended": ["dateModified", "publisher", "description", "url"],
    },
    "HowTo": {
        "required": ["name", "step"],
        "recommended": ["description", "image", "totalTime", "tool", "supply", "estimatedCost"],
    },
    "FAQPage": {
        "required": ["mainEntity"],
        "recommended": [],
    },
    "Product": {
        "required": ["name", "offers"],
        "recommended": ["description", "image", "sku", "brand", "aggregateRating"],
    },
    "Organization": {
        "required": ["name", "url"],
        "recommended": ["logo", "sameAs", "contactPoint", "description", "foundingDate"],
    },
    "LocalBusiness": {
        "required": ["name", "address"],
        "recommended": ["telephone", "openingHoursSpecification", "geo", "priceRange", "image", "url"],
    },
    "BreadcrumbList": {
        "required": ["itemListElement"],
        "recommended": [],
    },
    "VideoObject": {
        "required": ["name", "description", "thumbnailUrl", "uploadDate"],
        "recommended": ["duration", "contentUrl", "embedUrl", "interactionStatistic",
                        "hasPart"],
    },
    "WebSite": {
        "required": ["url"],
        "recommended": ["name", "potentialAction"],
    },
    "Event": {
        "required": ["name", "startDate", "location"],
        "recommended": ["endDate", "description", "image", "organizer", "offers"],
    },
    "Recipe": {
        "required": ["name", "image", "author", "datePublished"],
        "recommended": ["description", "cookTime", "prepTime", "totalTime", "recipeYield",
                        "recipeIngredient", "recipeInstructions", "aggregateRating"],
    },
}

# Set of every schema type name that has an entry in SCHEMA_RULES.
KNOWN_TYPES = set(SCHEMA_RULES.keys())


# ─── HTML Parser to extract JSON-LD blocks ───────────────────────────────────

class JSONLDExtractor(HTMLParser):
    """Extracts all

How to Write Cold Emails That Get Replies

Cold email works when it sounds human...

""" # ─── Main ───────────────────────────────────────────────────────────────────── def main(): import argparse parser = argparse.ArgumentParser( description="Extracts and validates JSON-LD structured data from HTML. " "Scores 0-100 per schema block based on required/recommended field coverage." ) parser.add_argument( "file", nargs="?", default=None, help="Path to an HTML file to validate. " "Use '-' to read from stdin. If omitted, runs embedded sample." ) args = parser.parse_args() if args.file: if args.file == "-": html = sys.stdin.read() else: try: with open(args.file, "r", encoding="utf-8") as f: html = f.read() except FileNotFoundError: print(f"Error: File not found: {args.file}", file=sys.stderr) sys.exit(1) else: print("No file provided — running on embedded sample HTML.\n") html = SAMPLE_HTML extractor = JSONLDExtractor() extractor.feed(html) all_results = [] for i, block in enumerate(extractor.blocks, start=1): results = validate_block(block, i) all_results.extend(results) print_report(all_results, html) # JSON output for programmatic use summary = { "blocks_found": len(extractor.blocks), "schemas_validated": len(all_results), "average_score": (sum(r.get("score", 0) for r in all_results) // len(all_results)) if all_results else 0, "results": all_results, } print("\n── JSON Output ──") print(json.dumps(summary, indent=2)) if __name__ == "__main__": main()