Files
claude-skills-reference/marketing-skill/site-architecture/scripts/sitemap_analyzer.py
Reza Rezvani 5add886197 fix: repair 25 Python scripts failing --help across all domains
- Fix Python 3.10+ syntax (float | None → Optional[float]) in 2 scripts
- Add argparse CLI handling to 9 marketing scripts using raw sys.argv
- Fix 10 scripts crashing at module level (wrap in __main__, add argparse)
- Make yaml/prefect/mcp imports conditional with stdlib fallbacks (4 scripts)
- Fix f-string backslash syntax in project_bootstrapper.py
- Fix -h flag conflict in pr_analyzer.py
- Fix tech-debt.md description (score → prioritize)

All 237 scripts now pass python3 --help verification.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 05:51:27 +01:00

376 lines
15 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
sitemap_analyzer.py — Analyzes sitemap.xml files for structure, depth, and potential issues.
Usage:
python3 sitemap_analyzer.py [sitemap.xml]
python3 sitemap_analyzer.py https://example.com/sitemap.xml (fetches via urllib)
cat sitemap.xml | python3 sitemap_analyzer.py -   (pass "-" to read the sitemap from stdin)
If no file is provided, runs on embedded sample sitemap for demonstration.
Output: Structural analysis with depth distribution, URL patterns, orphan candidates,
duplicate path detection, and JSON summary.
Stdlib only — no external dependencies.
"""
import json
import sys
import re
import select
import urllib.request
import urllib.error
from collections import Counter, defaultdict
from urllib.parse import urlparse
import xml.etree.ElementTree as ET
# ─── Namespaces used in sitemaps ─────────────────────────────────────────────
# Prefix -> namespace-URI table covering the core sitemap schema ("sm") and the
# image, video, news and xhtml extension schemas.
# NOTE(review): no code in the visible part of this file reads this mapping —
# confirm whether it is still needed before relying on it.
SITEMAP_NAMESPACES = {
    "sm": "http://www.sitemaps.org/schemas/sitemap/0.9",
    "image": "http://www.google.com/schemas/sitemap-image/1.1",
    "video": "http://www.google.com/schemas/sitemap-video/1.1",
    "news": "http://www.google.com/schemas/sitemap-news/0.9",
    "xhtml": "http://www.w3.org/1999/xhtml",
}
# ─── Sample sitemap (embedded) ────────────────────────────────────────────────
# Demonstration fixture analysed by main() when no file or URL is supplied.
# It deliberately contains the situations the report flags: two URLs six
# segments deep, the slug "email-tips" under two different parents, and one
# URL carrying a query string.
SAMPLE_SITEMAP = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<!-- Homepage -->
<url>
<loc>https://example.com/</loc>
<changefreq>daily</changefreq>
<priority>1.0</priority>
</url>
<!-- Top-level pages -->
<url><loc>https://example.com/pricing</loc></url>
<url><loc>https://example.com/about</loc></url>
<url><loc>https://example.com/contact</loc></url>
<url><loc>https://example.com/blog</loc></url>
<!-- Features section -->
<url><loc>https://example.com/features</loc></url>
<url><loc>https://example.com/features/email-automation</loc></url>
<url><loc>https://example.com/features/crm-integration</loc></url>
<url><loc>https://example.com/features/analytics</loc></url>
<!-- Solutions section -->
<url><loc>https://example.com/solutions/sales-teams</loc></url>
<url><loc>https://example.com/solutions/marketing-teams</loc></url>
<!-- Blog posts (various topics) -->
<url><loc>https://example.com/blog/cold-email-guide</loc></url>
<url><loc>https://example.com/blog/email-open-rates</loc></url>
<url><loc>https://example.com/blog/crm-comparison</loc></url>
<url><loc>https://example.com/blog/sales-process-optimization</loc></url>
<!-- Deeply nested pages (potential issue) -->
<url><loc>https://example.com/resources/guides/email/cold-outreach/advanced/templates</loc></url>
<url><loc>https://example.com/resources/guides/email/cold-outreach/advanced/scripts</loc></url>
<!-- Duplicate path patterns (potential issue) -->
<url><loc>https://example.com/blog/email-tips</loc></url>
<url><loc>https://example.com/resources/email-tips</loc></url>
<!-- Dynamic-looking URL (potential issue) -->
<url><loc>https://example.com/search?q=cold+email&amp;sort=recent</loc></url>
<!-- Case studies -->
<url><loc>https://example.com/customers/acme-corp</loc></url>
<url><loc>https://example.com/customers/globex</loc></url>
<!-- Legal pages (often over-linked) -->
<url><loc>https://example.com/privacy</loc></url>
<url><loc>https://example.com/terms</loc></url>
</urlset>
"""
# ─── URL Analysis ─────────────────────────────────────────────────────────────
def get_depth(path: str) -> int:
    """Count the non-empty segments of a URL path.

    "/" has depth 0, "/blog" depth 1, "/blog/post" depth 2, and so on.
    Empty segments produced by doubled slashes are not counted.
    """
    return sum(1 for segment in path.strip("/").split("/") if segment)
def get_path_pattern(path: str) -> str:
    """Collapse a URL path into a pattern for grouping similar URLs.

    A segment is kept verbatim when it is shorter than 30 characters and
    consists only of lowercase letters and hyphens (starting with a letter,
    at least two characters long); any other segment becomes "{slug}".
    Empty segments are dropped. A path without segments yields "/".
    """
    def _normalize(segment: str) -> str:
        looks_static = len(segment) < 30 and re.match(r'^[a-z][-a-z]+$', segment)
        return segment if looks_static else "{slug}"

    pieces = [_normalize(segment) for segment in path.strip("/").split("/") if segment]
    if not pieces:
        return "/"
    return "/" + "/".join(pieces)
def has_query_params(url: str) -> bool:
    """Return True when the raw URL string contains a "?" anywhere."""
    _, marker, _ = url.partition("?")
    return bool(marker)
def looks_like_dynamic_url(url: str) -> bool:
    """Return True when the parsed URL has a non-empty query component."""
    query = urlparse(url).query
    return len(query) > 0
def detect_path_siblings(urls: list) -> list:
    """Report slugs that occur under more than one parent directory.

    The slug is the last path segment of a URL. URLs are grouped by slug, and
    a group is reported only when its members' parent paths differ, so the
    same slug repeated inside one directory is not flagged.

    Returns a list of {"slug": ..., "urls": [...]} dicts in first-seen order.
    """
    by_slug = defaultdict(list)
    for candidate in urls:
        final_segment = urlparse(candidate).path.strip("/").split("/")[-1]
        if final_segment:
            by_slug[final_segment].append(candidate)

    flagged = []
    for slug, members in by_slug.items():
        if len(members) < 2:
            continue
        # Everything before the last "/" of the trimmed path is the parent.
        parent_dirs = {
            urlparse(member).path.strip("/").rpartition("/")[0] for member in members
        }
        if len(parent_dirs) > 1:
            flagged.append({"slug": slug, "urls": members})
    return flagged
# ─── Sitemap Parser ──────────────────────────────────────────────────────────
def _local_name(tag) -> str:
    """Return an element tag with any leading ``{namespace-uri}`` removed."""
    return tag.rsplit("}", 1)[-1] if isinstance(tag, str) else ""


def _child_text(parent, name: str):
    """Return the stripped text of the first direct child named *name*, or None."""
    for child in parent:
        if _local_name(child.tag) == name:
            return (child.text or "").strip() or None
    return None


def parse_sitemap(content: str) -> list:
    """Parse sitemap XML and return a list of URL dicts.

    Elements are matched by local name, so the document may use the default
    sitemap namespace, no namespace, or extension namespaces (image, video,
    news, xhtml) without affecting the result.

    Args:
        content: The sitemap document as text.

    Returns:
        One dict per <url> element that has a non-empty <loc>, with keys
        "url" (str), "lastmod" (str or None) and "priority" (float or None).
        An empty list is returned when the XML cannot be parsed (the error is
        printed to stderr) or when the document is a sitemap index (its child
        sitemap locations are printed to stdout instead).
    """
    try:
        # A leading BOM or whitespace before the XML declaration is a parse
        # error for expat, so trim it first.
        root = ET.fromstring(content.lstrip("\ufeff \t\r\n"))
    except ET.ParseError as e:
        print(f"❌ XML parse error: {e}", file=sys.stderr)
        return []

    # Handle sitemap index (points to other sitemaps)
    if _local_name(root.tag) == "sitemapindex":
        print(" This is a sitemap index file — it points to child sitemaps.")
        print(" Child sitemaps:")
        for element in root.iter():
            if _local_name(element.tag) != "loc":
                continue
            location = (element.text or "").strip()
            if location:
                print(f" - {location}")
        print(" Run this tool on each child sitemap for full analysis.")
        return []

    # Regular urlset
    urls = []
    for url_el in root.iter():
        if url_el is root or _local_name(url_el.tag) != "url":
            continue
        # Only direct children are inspected, so a nested <image:loc> is never
        # mistaken for the page's own <loc>.
        location = _child_text(url_el, "loc")
        if location is None:
            continue
        priority_text = _child_text(url_el, "priority")
        try:
            priority = float(priority_text) if priority_text is not None else None
        except ValueError:
            # A malformed priority should not abort the whole analysis.
            priority = None
        urls.append({
            "url": location,
            "lastmod": _child_text(url_el, "lastmod"),
            "priority": priority,
        })
    return urls
# ─── Analysis Engine ─────────────────────────────────────────────────────────
def analyze_urls(urls: list) -> dict:
    """Compute structural statistics for parsed sitemap entries.

    Args:
        urls: Dicts that each carry a "url" key (other keys are ignored here).

    Returns:
        A dict with the total URL count, the depth histogram (sorted by
        depth), the 15 most common first path segments, the URLs with a query
        string, the (url, depth) pairs at depth 4 or more, the slugs found
        under several parents, and the 10 most common path patterns.
    """
    locations = [entry["url"] for entry in urls]
    paths = [urlparse(location).path for location in locations]
    depths = [get_depth(path) for path in paths]

    # First path segment of every URL that has one.
    first_segments = (path.strip("/").split("/")[0] for path in paths)
    directory_counts = Counter(segment for segment in first_segments if segment)

    pattern_counts = Counter(get_path_pattern(path) for path in paths)

    return {
        "total_urls": len(urls),
        "depth_distribution": dict(sorted(Counter(depths).items())),
        "top_directories": dict(directory_counts.most_common(15)),
        "dynamic_urls": [
            location for location in locations if looks_like_dynamic_url(location)
        ],
        "deep_pages": [
            (location, depth) for location, depth in zip(locations, depths) if depth >= 4
        ],
        "duplicate_slug_candidates": detect_path_siblings(locations),
        "top_url_patterns": [
            {"pattern": pattern, "count": count}
            for pattern, count in pattern_counts.most_common(10)
        ],
    }
# ─── Report Printer ──────────────────────────────────────────────────────────
def grade_depth_distribution(dist: dict) -> str:
    """Rate a depth histogram by the share of pages at depth 4 or more.

    Args:
        dist: Mapping of depth -> page count.

    Returns:
        "N/A" when the histogram holds no pages; otherwise a label chosen by
        the deep-page percentage: below 5 is excellent, below 15 acceptable,
        anything else too deep.
    """
    deep_count = sum(count for level, count in dist.items() if level >= 4)
    page_total = sum(dist.values())
    if page_total == 0:
        return "N/A"

    deep_share = deep_count / page_total * 100
    for ceiling, verdict in ((5, "🟢 Excellent"), (15, "🟡 Acceptable")):
        if deep_share < ceiling:
            return verdict
    return "🔴 Too many deep pages"
def print_report(analysis: dict) -> None:
    """Print the human-readable structure report for an analysis dict.

    Args:
        analysis: Mapping with the keys "total_urls", "depth_distribution",
            "top_directories", "top_url_patterns", "dynamic_urls",
            "deep_pages" ((url, depth) pairs) and "duplicate_slug_candidates"
            (dicts with "slug" and "urls").

    Everything is written to stdout; nothing is returned.
    """
    # Banner rule. Repeating an empty string (the previous `"" * 62`) printed
    # nothing, so a visible glyph is used.
    rule = "═" * 62
    total = analysis["total_urls"]
    dist = analysis["depth_distribution"]
    dynamic_urls = analysis["dynamic_urls"]
    deep_pages = analysis["deep_pages"]
    duplicates = analysis["duplicate_slug_candidates"]

    print("\n" + rule)
    print(" SITEMAP STRUCTURE ANALYSIS")
    print(rule)
    print(f"\n Total URLs: {total}")

    print("\n── Depth Distribution ──")
    for depth, count in sorted(dist.items()):
        pct = count / total * 100 if total else 0
        # One bar cell per two percentage points.
        bar = "█" * int(pct / 2)
        label = "homepage" if depth == 0 else f"{' ' * min(depth, 3)}/{'…/' * (depth - 1)}page"
        print(f" Depth {depth}: {count:4d} pages ({pct:5.1f}%) {bar} {label}")
    print(f"\n Rating: {grade_depth_distribution(dist)}")
    deep_pct = sum(v for k, v in dist.items() if k >= 4) / total * 100 if total else 0
    if deep_pct >= 5:
        print(" ⚠️ More than 5% of pages are 4+ levels deep.")
        print(" Consider flattening structure or adding shortcut links.")

    print("\n── Top-Level Directories ──")
    for directory, count in analysis["top_directories"].items():
        pct = count / total * 100 if total else 0
        print(f" /{directory:<30s} {count:4d} URLs ({pct:.1f}%)")

    print("\n── URL Pattern Analysis ──")
    for entry in analysis["top_url_patterns"]:
        print(f" {entry['pattern']:<45s} {entry['count']:4d} URLs")

    # Each issue section lists at most five examples, then a remainder count.
    if dynamic_urls:
        print(f"\n── Dynamic URLs Detected ({len(dynamic_urls)}) ──")
        print(" ⚠️ URLs with query parameters should usually be excluded from sitemap.")
        print(" Use canonical tags or robots.txt to prevent duplicate content indexing.")
        for url in dynamic_urls[:5]:
            print(f" {url}")
        if len(dynamic_urls) > 5:
            print(f" ... and {len(dynamic_urls) - 5} more")

    if deep_pages:
        print(f"\n── Deep Pages (4+ Levels) ({len(deep_pages)}) ──")
        print(" ⚠️ Pages this deep may have weak crawl equity. Add internal shortcuts.")
        for url, depth in deep_pages[:5]:
            print(f" Depth {depth}: {url}")
        if len(deep_pages) > 5:
            print(f" ... and {len(deep_pages) - 5} more")

    if duplicates:
        print(f"\n── Potential Duplicate Path Issues ({len(duplicates)}) ──")
        print(" ⚠️ Same slug appears in multiple directories — possible duplicate content.")
        for item in duplicates[:5]:
            print(f" Slug: '{item['slug']}'")
            for url in item["urls"]:
                print(f" - {url}")
        if len(duplicates) > 5:
            print(f" ... and {len(duplicates) - 5} more")

    print("\n── Recommendations ──")
    # Collect first, number afterwards: the list stays consecutive (1, 2, 3)
    # whichever subset of issues is present.
    recommendations = []
    if dynamic_urls:
        recommendations.append("Remove dynamic URLs (with ?) from sitemap.")
    if deep_pages:
        recommendations.append("Flatten deep URL structures or add internal shortcut links.")
    if duplicates:
        recommendations.append("Review duplicate slug paths — consolidate or add canonical tags.")

    if recommendations:
        for number, text in enumerate(recommendations, start=1):
            print(f" {number}. {text}")
    else:
        print(" ✅ No major structural issues detected in this sitemap.")
    print("\n" + rule)
# ─── Main ─────────────────────────────────────────────────────────────────────
def load_content(source: str) -> str:
    """Load sitemap text from a local file path or an http(s) URL.

    Gzip-compressed input (e.g. ``sitemap.xml.gz``) is detected by its magic
    bytes and decompressed transparently. A UTF-8 byte-order mark is removed.

    Args:
        source: A filesystem path, or a URL starting with "http://" or
            "https://".

    Returns:
        The decoded sitemap document.

    Raises:
        SystemExit: With exit code 1, after printing a message to stderr,
            when the source cannot be read, fetched, decompressed or decoded
            as UTF-8.
    """
    # Local imports keep this block self-contained, as main() does for argparse.
    import gzip
    import zlib

    is_url = source.startswith(("http://", "https://"))
    try:
        if is_url:
            with urllib.request.urlopen(source, timeout=10) as resp:
                raw = resp.read()
        else:
            with open(source, "rb") as f:
                raw = f.read()
    except OSError as e:
        # URLError/HTTPError, socket timeouts, permission errors and
        # "is a directory" are all OSError subclasses.
        if is_url:
            print(f"Error fetching URL: {e}", file=sys.stderr)
        elif isinstance(e, FileNotFoundError):
            print(f"Error: File not found: {source}", file=sys.stderr)
        else:
            print(f"Error reading file: {e}", file=sys.stderr)
        sys.exit(1)

    try:
        if raw[:2] == b"\x1f\x8b":  # gzip magic number
            raw = gzip.decompress(raw)
        # utf-8-sig strips a leading BOM, which would otherwise reach the parser.
        return raw.decode("utf-8-sig")
    except (OSError, EOFError, zlib.error, UnicodeDecodeError) as e:
        print(f"Error decoding {source}: {e}", file=sys.stderr)
        sys.exit(1)
def main():
    """Command-line entry point.

    Reads the sitemap named by the optional positional argument (a path, an
    http(s) URL, or "-" for stdin), falling back to the embedded sample when
    none is given. Prints the text report followed by a JSON summary, and
    exits with status 1 when no URLs could be extracted.
    """
    import argparse

    cli = argparse.ArgumentParser(
        description=(
            "Analyzes sitemap.xml files for structure, depth, and potential issues. "
            "Reports depth distribution, URL patterns, orphan candidates, and duplicates."
        )
    )
    cli.add_argument(
        "file",
        nargs="?",
        default=None,
        help=(
            "Path to a sitemap.xml file or URL (https://...). "
            "Use '-' to read from stdin. If omitted, runs embedded sample."
        ),
    )
    source = cli.parse_args().file

    if not source:
        print("No file or URL provided — running on embedded sample sitemap.\n")
        content = SAMPLE_SITEMAP
    elif source == "-":
        content = sys.stdin.read()
    else:
        content = load_content(source)

    entries = parse_sitemap(content)
    if not entries:
        print("No URLs found in sitemap.", file=sys.stderr)
        sys.exit(1)

    report = analyze_urls(entries)
    print_report(report)

    # Machine-readable digest: list-valued findings are reduced to counts.
    print("\n── JSON Summary ──")
    digest = {
        "total_urls": report["total_urls"],
        "depth_distribution": report["depth_distribution"],
        "dynamic_url_count": len(report["dynamic_urls"]),
        "deep_page_count": len(report["deep_pages"]),
        "duplicate_slug_count": len(report["duplicate_slug_candidates"]),
        "top_directories": report["top_directories"],
    }
    print(json.dumps(digest, indent=2))


if __name__ == "__main__":
    main()