#!/usr/bin/env python3
"""
topic_cluster_mapper.py — Groups keywords/topics into content clusters

Usage:
    python3 topic_cluster_mapper.py --file keywords.txt
    python3 topic_cluster_mapper.py --json
    python3 topic_cluster_mapper.py            # demo mode (20 marketing topics)
"""

import argparse
import json
import re
import sys
from collections import defaultdict

# ---------------------------------------------------------------------------
# Simple stemmer (no nltk)
# ---------------------------------------------------------------------------

# Common English function words (plus a few generic marketing verbs) that
# carry no topical signal and are dropped before stemming.
STOP_WORDS = {
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
    "of", "with", "by", "from", "is", "are", "was", "were", "be", "been",
    "how", "what", "why", "when", "where", "who", "which", "that", "this",
    "it", "its", "do", "does", "your", "our", "my", "their", "we", "you",
    "get", "make", "use", "using", "used", "can", "will", "should", "best",
}


def simple_stem(word: str) -> str:
    """Very simple suffix-stripping stemmer.

    Lowercases the word and strips the first matching suffix from an
    ordered list (longest suffixes first), provided the remaining stem
    is at least 3 characters long. Words of length <= 3 are returned
    unchanged. This is intentionally crude — no nltk dependency.
    """
    w = word.lower()
    if len(w) <= 3:
        return w
    # Order matters — try longer suffixes first so e.g. "ization" wins
    # over "ation" over "tion". (Duplicate entries removed: "iveness",
    # "ation" and "tion" each appeared twice; the repeats were dead code
    # because the first occurrence always matched first.)
    suffixes = [
        "ization", "isation", "ational", "fulness", "ousness", "iveness",
        "ingness",
        "ations", "nesses",
        "ators", "ation", "ating", "alism", "ality", "alize", "alise",
        "ator", "ness", "ment", "less", "tion", "sion",
        "ing", "ers", "ies", "ied", "ily", "ful", "ous", "ive", "ize",
        "ise", "est",
        "ed", "er", "ly", "al", "ic",
        "s",
    ]
    for sfx in suffixes:
        # Only strip when a meaningful stem (>= 3 chars) remains.
        if w.endswith(sfx) and len(w) - len(sfx) >= 3:
            return w[: -len(sfx)]
    return w


def extract_stems(topic: str) -> set:
    """Return the set of stems for all significant words in *topic*.

    Words are alphabetic tokens; stop words and words of <= 2 characters
    are discarded before stemming.
    """
    words = re.findall(r"\b[a-zA-Z]+\b", topic.lower())
    return {simple_stem(w) for w in words if w not in STOP_WORDS and len(w) > 2}


# ---------------------------------------------------------------------------
# Clustering
# ---------------------------------------------------------------------------

def compute_similarity(stems_a: set, stems_b: set) -> float:
    """Jaccard similarity between two stem sets (0.0 when either is empty)."""
    if not stems_a or not stems_b:
        return 0.0
    intersection = stems_a & stems_b
    union = stems_a | stems_b
    return len(intersection) / len(union)


def build_clusters(topics: list, threshold: float = 0.15) -> list:
    """
    Greedy clustering: assign each topic to the best-matching cluster if
    its similarity meets *threshold*; else start a new cluster.

    Returns a list of dicts: {"pillar": str, "topics": [str], "stems": set}.
    The cluster's stem set grows as topics join (a merged "centroid"),
    and the pillar is re-chosen afterwards as the topic sharing the most
    stems with the other topics in its cluster.
    """
    # Pre-compute stems once per topic.
    topic_stems = {t: extract_stems(t) for t in topics}

    clusters = []  # list of {"pillar": str, "topics": [str], "stems": set}

    for topic in topics:
        t_stems = topic_stems[topic]
        best_cluster = None
        best_score = 0.0
        for cluster in clusters:
            sim = compute_similarity(t_stems, cluster["stems"])
            if sim > best_score:
                best_score = sim
                best_cluster = cluster

        if best_cluster and best_score >= threshold:
            best_cluster["topics"].append(topic)
            best_cluster["stems"] |= t_stems  # grow cluster centroid
        else:
            clusters.append({
                "pillar": topic,
                "topics": [topic],
                "stems": set(t_stems),
            })

    # Identify best pillar: topic with most shared stems to others in cluster.
    for cluster in clusters:
        if len(cluster["topics"]) == 1:
            continue
        best_topic = cluster["topics"][0]
        best_conn = 0
        for i, topic in enumerate(cluster["topics"]):
            conn = sum(
                len(topic_stems[topic] & topic_stems[other])
                for j, other in enumerate(cluster["topics"])
                if i != j
            )
            if conn > best_conn:
                best_conn = conn
                best_topic = topic
        cluster["pillar"] = best_topic

    return clusters


def build_output(topics: list, clusters: list) -> dict:
    """Assemble the serializable result dict from raw clusters.

    Each cluster entry gets a 1-based id, its pillar topic, size,
    supporting (non-pillar) topics, and a URL slug derived from the
    pillar. Clusters are sorted by size, largest first.
    """
    cluster_output = []
    for i, c in enumerate(clusters, 1):
        supporting = [t for t in c["topics"] if t != c["pillar"]]
        cluster_output.append({
            "cluster_id": i,
            "pillar_topic": c["pillar"],
            "size": len(c["topics"]),
            "supporting_topics": supporting,
            "suggested_url_slug": re.sub(
                r"[^a-z0-9]+", "-", c["pillar"].lower()
            ).strip("-"),
        })

    # Sort by cluster size desc (stable, so insertion order breaks ties).
    cluster_output.sort(key=lambda x: x["size"], reverse=True)

    return {
        "total_topics": len(topics),
        "total_clusters": len(clusters),
        "clusters": cluster_output,
        "recommendations": _make_recommendations(cluster_output),
    }


def _make_recommendations(clusters: list) -> list:
    """Derive actionable content recommendations from the sorted clusters."""
    recs = []
    large = [c for c in clusters if c["size"] >= 3]
    singletons = [c for c in clusters if c["size"] == 1]
    if large:
        recs.append(f"Create {len(large)} pillar page(s) for clusters with 3+ topics")
    if singletons:
        recs.append(
            f"{len(singletons)} singleton topic(s) — consider merging or expanding to form mini-clusters"
        )
    if clusters:
        # clusters is sorted by size desc, so [0] is the biggest.
        biggest = clusters[0]
        recs.append(
            f"Highest-priority cluster: '{biggest['pillar_topic']}' "
            f"({biggest['size']} related topics) — start content here"
        )
    return recs


# ---------------------------------------------------------------------------
# Demo topics
# ---------------------------------------------------------------------------

DEMO_TOPICS = [
    "email marketing strategy",
    "email subject line tips",
    "email open rate optimization",
    "email automation workflows",
    "SEO keyword research",
    "on-page SEO optimization",
    "SEO content strategy",
    "technical SEO audit",
    "social media marketing",
    "social media content calendar",
    "Instagram marketing tips",
    "LinkedIn marketing for B2B",
    "content marketing ROI",
    "content strategy planning",
    "blog content ideas",
    "landing page conversion rate",
    "conversion rate optimization",
    "A/B testing landing pages",
    "paid ads budget allocation",
    "Google Ads campaign setup",
]


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """CLI entry point: parse args, cluster topics, print text or JSON."""
    parser = argparse.ArgumentParser(
        description="Topic cluster mapper — groups keywords into content clusters."
    )
    parser.add_argument("--file", help="Text file with one topic/keyword per line")
    parser.add_argument("--threshold", type=float, default=0.15,
                        help="Similarity threshold for clustering (default: 0.15)")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    if args.file:
        # Fail gracefully on a missing/unreadable file instead of a traceback.
        try:
            with open(args.file, "r", encoding="utf-8") as f:
                topics = [line.strip() for line in f if line.strip()]
        except OSError as e:
            print(f"Could not read {args.file}: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        topics = DEMO_TOPICS
        if not args.json:
            print("No input provided — running in demo mode with 20 marketing topics.\n")

    if not topics:
        print("No topics found.", file=sys.stderr)
        sys.exit(1)

    clusters = build_clusters(topics, threshold=args.threshold)
    output = build_output(topics, clusters)

    if args.json:
        print(json.dumps(output, indent=2))
        return

    print("=" * 62)
    print(f" TOPIC CLUSTER MAP {output['total_topics']} topics → {output['total_clusters']} clusters")
    print("=" * 62)
    for cluster in output["clusters"]:
        print(f"\n Cluster {cluster['cluster_id']} ({cluster['size']} topics)")
        print(f" ┌─ PILLAR: {cluster['pillar_topic']}")
        print(f" │ Slug: /{cluster['suggested_url_slug']}")
        for st in cluster["supporting_topics"]:
            print(f" └─ Supporting: {st}")

    print("\n" + "=" * 62)
    print(" RECOMMENDATIONS")
    print("=" * 62)
    for rec in output["recommendations"]:
        print(f" • {rec}")
    print()


if __name__ == "__main__":
    main()