Add Jupyter Notebook, Local HTML, OpenAPI/Swagger, AsciiDoc, PowerPoint, RSS/Atom, Man Pages, Confluence, Notion, and Slack/Discord Chat as new skill source types. Each type is fully integrated across: - Standalone CLI commands (skill-seekers <type>) - Auto-detection via 'skill-seekers create' (file extension + content sniffing) - Unified multi-source configs (scraped_data, dispatch, config validation) - Unified skill builder (generic merge + source-attributed synthesis) - MCP server (scrape_generic tool with per-type flag mapping) - pyproject.toml (entry points, optional deps, [all] group) Also fixes: EPUB unified pipeline gap, missing word/video config validators, OpenAPI yaml import guard, MCP flag mismatch for all 10 types, stale docstrings, and adds 77 integration tests + complex-merge workflow. 50 files changed, +20,201 lines
1088 lines
44 KiB
Python
1088 lines
44 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
RSS/Atom Feed to Skill Converter
|
|
|
|
Converts RSS 2.0, RSS 1.0 (RDF), and Atom feeds into AI-ready skills.
|
|
Uses feedparser for feed parsing, optionally follows article links to scrape
|
|
full content using requests + BeautifulSoup.
|
|
|
|
Supports both remote feed URLs and local feed XML files. Extracts article
|
|
metadata (title, author, published date, categories), feed-level metadata
|
|
(title, description, link, language), and optionally the full article text
|
|
from linked pages.
|
|
|
|
Usage:
|
|
skill-seekers rss --feed-url https://example.com/feed.xml --name myblog
|
|
skill-seekers rss --feed-path ./feed.xml --name myblog
|
|
skill-seekers rss --feed-url https://example.com/rss --no-follow-links --name myblog
|
|
skill-seekers rss --from-json myblog_extracted.json
|
|
python3 -m skill_seekers.cli.rss_scraper --feed-url https://example.com/atom.xml --name myblog
|
|
"""
|
|
|
|
import argparse
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
# Optional dependency guard — feedparser is not in core deps.
# FEEDPARSER_AVAILABLE is checked by _check_feedparser_deps() before any
# feed parsing, so importing this module never fails on a missing extra.
try:
    import feedparser  # noqa: F401

    FEEDPARSER_AVAILABLE = True
except ImportError:
    FEEDPARSER_AVAILABLE = False

# BeautifulSoup is a core dependency (always available)
from bs4 import BeautifulSoup, Comment, Tag

# Module-level logger; CLI output uses print(), diagnostics use this logger.
logger = logging.getLogger(__name__)
|
|
|
|
# Feed type constants — human-readable labels written into generated
# reference files and SKILL.md (see _detect_feed_type()).
FEED_TYPE_RSS_20 = "RSS 2.0"
FEED_TYPE_RSS_10 = "RSS 1.0 (RDF)"
FEED_TYPE_ATOM = "Atom"
FEED_TYPE_UNKNOWN = "Unknown"

# Default request headers for scraping article pages
_DEFAULT_HEADERS = {
    "User-Agent": "SkillSeekers/RSS-Scraper (https://github.com/skill-seekers)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}

# Tags to strip from scraped article HTML (non-content chrome)
_STRIP_TAGS = {"script", "style", "nav", "footer", "header", "aside", "noscript", "iframe"}

# Maximum length for a single article's scraped text (characters);
# longer pages are truncated with a "[Content truncated]" marker.
_MAX_ARTICLE_TEXT_LENGTH = 50_000

# Delay between HTTP requests when following links (seconds) — politeness delay
_REQUEST_DELAY = 1.0
|
|
|
|
|
|
def _check_feedparser_deps() -> None:
    """Raise RuntimeError if feedparser is not installed.

    No-op when the optional feedparser dependency imported successfully.
    """
    if FEEDPARSER_AVAILABLE:
        return
    raise RuntimeError(
        "feedparser is required for RSS/Atom feed support.\n"
        'Install with: pip install "skill-seekers[rss]"\n'
        "Or: pip install feedparser"
    )
|
|
|
|
|
|
def infer_description_from_feed(
|
|
feed_meta: dict[str, Any] | None = None,
|
|
name: str = "",
|
|
) -> str:
|
|
"""Infer skill description from feed-level metadata.
|
|
|
|
Tries to build a meaningful "Use when..." description from the feed
|
|
title and subtitle/description fields.
|
|
|
|
Args:
|
|
feed_meta: Feed metadata dict with title, description, link, etc.
|
|
name: Skill name for fallback.
|
|
|
|
Returns:
|
|
Description string suitable for "Use when..." format.
|
|
"""
|
|
if feed_meta:
|
|
desc = feed_meta.get("description", "")
|
|
if desc and len(desc) > 20:
|
|
if len(desc) > 150:
|
|
desc = desc[:147] + "..."
|
|
return f"Use when referencing {desc.lower()}"
|
|
title = feed_meta.get("title", "")
|
|
if title and len(title) > 5:
|
|
return f"Use when referencing articles from {title}"
|
|
return (
|
|
f"Use when referencing {name} feed content"
|
|
if name
|
|
else "Use when referencing this feed content"
|
|
)
|
|
|
|
|
|
class RssToSkillConverter:
|
|
"""Convert RSS/Atom feeds to AI-ready skills.
|
|
|
|
Parses RSS 2.0, RSS 1.0 (RDF), and Atom feeds using feedparser.
|
|
Optionally follows article links to scrape full page content via
|
|
requests + BeautifulSoup.
|
|
"""
|
|
|
|
def __init__(self, config: dict[str, Any]) -> None:
|
|
"""Initialize the converter with configuration.
|
|
|
|
Args:
|
|
config: Dictionary with name (required), feed_url, feed_path,
|
|
follow_links (default True), max_articles (default 50),
|
|
and description (optional).
|
|
"""
|
|
self.config = config
|
|
self.name: str = config["name"]
|
|
self.feed_url: str = config.get("feed_url", "")
|
|
self.feed_path: str = config.get("feed_path", "")
|
|
self.follow_links: bool = config.get("follow_links", True)
|
|
self.max_articles: int = config.get("max_articles", 50)
|
|
self.description: str = config.get(
|
|
"description", f"Use when referencing {self.name} feed content"
|
|
)
|
|
|
|
# Output paths
|
|
self.skill_dir: str = f"output/{self.name}"
|
|
self.data_file: str = f"output/{self.name}_extracted.json"
|
|
|
|
# Internal state
|
|
self.extracted_data: dict[str, Any] | None = None
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Public API
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
    def extract_feed(self) -> bool:
        """Parse the RSS/Atom feed and extract article data.

        Parses feed, extracts metadata and articles, optionally follows links
        to scrape full content, saves intermediate JSON.

        Side effects:
            Writes the extracted-data JSON to self.data_file, stores the
            result in self.extracted_data, and may update self.description
            from feed metadata when the config did not set one.

        Returns:
            True on success.

        Raises:
            RuntimeError: If feedparser is missing or the feed cannot be parsed.
        """
        _check_feedparser_deps()

        source = self.feed_url or self.feed_path
        print(f"\n🔍 Extracting RSS/Atom feed: {source}")

        # Parse the feed
        parsed = self._parse_feed()

        # Detect feed type
        feed_type = self._detect_feed_type(parsed)
        print(f" Feed type: {feed_type}")

        # Extract feed-level metadata
        feed_meta = self._extract_feed_metadata(parsed)
        print(f" Title: {feed_meta.get('title', 'Unknown')}")
        print(f" Link: {feed_meta.get('link', 'N/A')}")
        print(f" Language: {feed_meta.get('language', 'N/A')}")

        # Update description from feed metadata if not explicitly set
        # (checks the raw config, not self.description, so __init__'s
        # fallback default does not block the feed-derived description).
        if "description" not in self.config:
            self.description = infer_description_from_feed(feed_meta, self.name)

        # Extract articles (capped at self.max_articles by _extract_articles)
        articles = self._extract_articles(parsed)
        print(f" Articles found: {len(articles)}")

        # Optionally scrape full article content
        if self.follow_links:
            print(f"\n🌐 Following article links (max {len(articles)})...")
            scraped_count = 0
            for i, article in enumerate(articles):
                link = article.get("link", "")
                if not link:
                    continue
                print(f" [{i + 1}/{len(articles)}] {link[:80]}...")
                content = self._scrape_article_content(link)
                if content:
                    article["full_text"] = content
                    scraped_count += 1
                # Be polite — delay between requests
                if i < len(articles) - 1:
                    time.sleep(_REQUEST_DELAY)
            print(f" Scraped full content for {scraped_count}/{len(articles)} articles")
        else:
            print(" Skipping link following (--no-follow-links)")

        # Categorize articles by feed categories/tags
        all_categories = self._collect_all_categories(articles)

        # Build result data
        result_data: dict[str, Any] = {
            "source": source,
            "feed_type": feed_type,
            "feed_metadata": feed_meta,
            "total_articles": len(articles),
            "followed_links": self.follow_links,
            "all_categories": sorted(all_categories),
            "articles": articles,
        }

        # Persist extracted data; default=str stringifies any non-JSON values
        # feedparser may have left in the structures.
        os.makedirs(os.path.dirname(self.data_file) or ".", exist_ok=True)
        with open(self.data_file, "w", encoding="utf-8") as f:
            json.dump(result_data, f, indent=2, ensure_ascii=False, default=str)

        print(f"\n💾 Saved extracted data to: {self.data_file}")
        self.extracted_data = result_data
        print(
            f"✅ Extracted {len(articles)} articles ({len(all_categories)} unique categories/tags)"
        )
        return True
|
|
|
|
def load_extracted_data(self, json_path: str) -> bool:
|
|
"""Load previously extracted data from a JSON file."""
|
|
print(f"\n📂 Loading extracted data from: {json_path}")
|
|
if not os.path.exists(json_path):
|
|
raise FileNotFoundError(f"Extracted data file not found: {json_path}")
|
|
|
|
with open(json_path, encoding="utf-8") as f:
|
|
self.extracted_data = json.load(f)
|
|
|
|
total = self.extracted_data.get(
|
|
"total_articles", len(self.extracted_data.get("articles", []))
|
|
)
|
|
print(f"✅ Loaded {total} articles")
|
|
return True
|
|
|
|
def categorize_content(self) -> dict[str, dict[str, Any]]:
|
|
"""Categorize articles by their feed categories/tags."""
|
|
print("\n📋 Categorizing content by feed tags...")
|
|
|
|
if not self.extracted_data:
|
|
raise RuntimeError("No extracted data available. Call extract_feed() first.")
|
|
|
|
articles = self.extracted_data.get("articles", [])
|
|
categorized: dict[str, dict[str, Any]] = {}
|
|
|
|
for article in articles:
|
|
cats = article.get("categories", [])
|
|
if not cats:
|
|
cats = ["uncategorized"]
|
|
|
|
for cat in cats:
|
|
cat_key = self._sanitize_filename(cat)
|
|
if cat_key not in categorized:
|
|
categorized[cat_key] = {
|
|
"title": cat,
|
|
"articles": [],
|
|
}
|
|
# Avoid duplicates if an article has overlapping normalized keys
|
|
article_id = article.get("id", article.get("link", ""))
|
|
existing_ids = {
|
|
a.get("id", a.get("link", "")) for a in categorized[cat_key]["articles"]
|
|
}
|
|
if article_id not in existing_ids:
|
|
categorized[cat_key]["articles"].append(article)
|
|
|
|
# If no categories at all, put everything in one group
|
|
if not categorized:
|
|
categorized["all_articles"] = {
|
|
"title": "All Articles",
|
|
"articles": articles,
|
|
}
|
|
|
|
print(f"✅ Created {len(categorized)} categories")
|
|
for cat_key, cat_data in categorized.items():
|
|
print(f" - {cat_data['title']}: {len(cat_data['articles'])} articles")
|
|
|
|
return categorized
|
|
|
|
    def build_skill(self) -> None:
        """Build complete skill structure from extracted data.

        Creates the skill directory layout (references/, scripts/, assets/),
        then writes one reference file per category, the reference index,
        and SKILL.md.

        Raises:
            RuntimeError: If extract_feed()/load_extracted_data() was not
                called first.
        """
        print(f"\n🏗️ Building skill: {self.name}")

        if not self.extracted_data:
            raise RuntimeError("No extracted data available. Call extract_feed() first.")

        # Create directories
        os.makedirs(f"{self.skill_dir}/references", exist_ok=True)
        os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True)
        os.makedirs(f"{self.skill_dir}/assets", exist_ok=True)

        # Categorize content
        categorized = self.categorize_content()

        # Generate reference files
        print("\n📝 Generating reference files...")
        for cat_key, cat_data in categorized.items():
            self._generate_reference_file(cat_key, cat_data)

        # Generate index
        self._generate_index(categorized)

        # Generate SKILL.md
        self._generate_skill_md(categorized)

        print(f"\n✅ Skill built successfully: {self.skill_dir}/")
        print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/")
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Feed parsing internals
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
    def _parse_feed(self) -> "feedparser.FeedParserDict":
        """Parse feed from URL or local file using feedparser.

        Prefers feed_path over feed_url when both are set.

        Returns:
            The feedparser result object.

        Raises:
            FileNotFoundError: If feed_path is set but does not exist.
            RuntimeError: If neither source is configured, or parsing fails
                with no usable entries.
        """
        # Local import: module-level import is guarded (optional dependency);
        # callers run _check_feedparser_deps() before reaching this point.
        import feedparser as fp

        if self.feed_path:
            if not os.path.exists(self.feed_path):
                raise FileNotFoundError(f"Feed file not found: {self.feed_path}")
            logger.info("Parsing feed from local file: %s", self.feed_path)
            parsed = fp.parse(self.feed_path)
        elif self.feed_url:
            logger.info("Fetching feed from URL: %s", self.feed_url)
            parsed = fp.parse(
                self.feed_url,
                agent="SkillSeekers/RSS-Scraper",
            )
        else:
            raise RuntimeError(
                "No feed source provided. Use feed_url (remote URL) or feed_path (local file)."
            )

        # Check for parsing errors — bozo flags malformed feeds, but we only
        # fail hard when feedparser also recovered zero entries.
        if parsed.bozo and not parsed.entries:
            exc = parsed.get("bozo_exception", "Unknown parse error")
            raise RuntimeError(f"Failed to parse feed: {exc}")

        return parsed
|
|
|
|
def _detect_feed_type(self, parsed: "feedparser.FeedParserDict") -> str:
|
|
"""Detect RSS 2.0, RSS 1.0, or Atom from feedparser's version field."""
|
|
version = getattr(parsed, "version", "") or ""
|
|
version_lower = version.lower()
|
|
|
|
if "atom" in version_lower:
|
|
return FEED_TYPE_ATOM
|
|
if "rss20" in version_lower or version_lower == "rss20":
|
|
return FEED_TYPE_RSS_20
|
|
if "rss10" in version_lower or "rdf" in version_lower:
|
|
return FEED_TYPE_RSS_10
|
|
if version_lower.startswith("rss"):
|
|
return FEED_TYPE_RSS_20
|
|
|
|
# Fallback heuristic: check feed dict for version clues
|
|
feed = parsed.get("feed", {})
|
|
if feed.get("xmlns", "").startswith("http://www.w3.org/2005/Atom"):
|
|
return FEED_TYPE_ATOM
|
|
if feed.get("rss_version"):
|
|
return FEED_TYPE_RSS_20
|
|
|
|
return FEED_TYPE_UNKNOWN
|
|
|
|
def _extract_feed_metadata(self, parsed: "feedparser.FeedParserDict") -> dict[str, Any]:
|
|
"""Extract feed-level metadata (title, description, link, language, etc.)."""
|
|
feed = parsed.get("feed", {})
|
|
|
|
# feedparser normalizes subtitle (Atom) and description (RSS)
|
|
description = feed.get("subtitle", "") or feed.get("description", "")
|
|
|
|
# Published / updated dates
|
|
published = feed.get("published", "") or feed.get("updated", "")
|
|
|
|
# Feed image (RSS <image>, Atom <icon>/<logo>)
|
|
image_url = ""
|
|
image_data = feed.get("image", {})
|
|
if isinstance(image_data, dict):
|
|
image_url = image_data.get("href", "") or image_data.get("url", "")
|
|
elif isinstance(image_data, str):
|
|
image_url = image_data
|
|
|
|
return {
|
|
"title": feed.get("title", "Untitled Feed"),
|
|
"description": description,
|
|
"link": feed.get("link", ""),
|
|
"language": feed.get("language", ""),
|
|
"author": feed.get("author", ""),
|
|
"published": published,
|
|
"generator": feed.get("generator", ""),
|
|
"image_url": image_url,
|
|
"rights": feed.get("rights", ""),
|
|
}
|
|
|
|
    def _extract_articles(self, parsed: "feedparser.FeedParserDict") -> list[dict[str, Any]]:
        """Extract article entries (title, link, summary, date, author, categories).

        Processes at most self.max_articles entries, normalizing each into a
        plain dict with stable keys (id, title, link, summary, content,
        published, published_iso, author, categories).
        """
        articles: list[dict[str, Any]] = []

        for entry in parsed.entries[: self.max_articles]:
            # Unique identifier (Atom id, RSS guid, or link hash)
            entry_id = entry.get("id", "") or entry.get("link", "")
            if not entry_id:
                # Last resort: short hash of the title so dedup still works.
                entry_id = hashlib.sha256(entry.get("title", "").encode("utf-8")).hexdigest()[:16]

            # Published date normalization — *_parsed fields are presumably
            # time.struct_time tuples from feedparser (TODO confirm); the
            # first six elements map onto datetime(year..second).
            published = entry.get("published", "") or entry.get("updated", "")
            published_parsed = entry.get("published_parsed") or entry.get("updated_parsed")
            published_iso = ""
            if published_parsed:
                try:
                    dt = datetime(*published_parsed[:6])
                    published_iso = dt.isoformat()
                except (TypeError, ValueError):
                    # Unparseable struct — keep the raw date string instead.
                    published_iso = published

            # Categories / tags
            categories: list[str] = []
            for tag_data in entry.get("tags", []):
                term = tag_data.get("term", "")
                if term:
                    categories.append(term)

            # Summary — feedparser may provide HTML; clean it
            summary_raw = entry.get("summary", "") or entry.get("description", "")
            summary_text = self._html_to_text(summary_raw) if summary_raw else ""

            # Content — some feeds include full content inline
            content_text = ""
            content_list = entry.get("content", [])
            if content_list and isinstance(content_list, list):
                for content_block in content_list:
                    value = content_block.get("value", "")
                    if value:
                        content_text += self._html_to_text(value) + "\n\n"
                content_text = content_text.strip()

            # Author(s) — single author field, else join the authors list
            author = entry.get("author", "")
            if not author:
                authors_detail = entry.get("authors", [])
                if authors_detail:
                    author = ", ".join(a.get("name", "") for a in authors_detail if a.get("name"))

            article: dict[str, Any] = {
                "id": entry_id,
                "title": entry.get("title", "Untitled"),
                "link": entry.get("link", ""),
                "summary": summary_text,
                "content": content_text,
                "published": published,
                "published_iso": published_iso,
                "author": author,
                "categories": categories,
            }

            articles.append(article)

        return articles
|
|
|
|
    def _scrape_article_content(self, url: str) -> str:
        """Follow article URL, extract full page content using requests + BeautifulSoup.

        Best-effort: any failure (missing requests, HTTP error, non-HTML
        content) returns an empty string rather than raising, so one bad
        link cannot abort the whole extraction loop.

        Args:
            url: Absolute article URL from the feed entry.

        Returns:
            Cleaned article text, or "" on any failure.
        """
        try:
            # Local import keeps requests an optional runtime dependency here.
            import requests
        except ImportError:
            logger.warning(
                "requests library not available — cannot follow article links. "
                "Install with: pip install requests"
            )
            return ""

        try:
            response = requests.get(
                url,
                headers=_DEFAULT_HEADERS,
                timeout=15,
                allow_redirects=True,
            )
            response.raise_for_status()
        except Exception as e:
            # Deliberately broad: network errors of any kind downgrade to
            # "no full text" rather than failing the run.
            logger.debug("Failed to fetch %s: %s", url, e)
            return ""

        # Skip binary/JSON responses — only HTML/XHTML pages are parseable.
        content_type = response.headers.get("Content-Type", "")
        if "html" not in content_type.lower() and "xml" not in content_type.lower():
            logger.debug("Skipping non-HTML content at %s (type: %s)", url, content_type)
            return ""

        return self._extract_article_text(response.text)
|
|
|
|
    def _extract_article_text(self, html: str) -> str:
        """Clean article HTML to text/markdown. Finds <article>/<main>, strips nav/ads.

        Args:
            html: Full HTML page source.

        Returns:
            Markdown-ish text (headings, paragraphs, fenced code, lists,
            blockquotes), truncated at _MAX_ARTICLE_TEXT_LENGTH chars.
        """
        soup = BeautifulSoup(html, "html.parser")

        # Remove unwanted elements (scripts, navigation chrome, embeds)
        for tag_name in _STRIP_TAGS:
            for element in soup.find_all(tag_name):
                element.decompose()
        for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
            comment.extract()

        # Try to find the main article container — semantic tags first,
        # then common id/class naming conventions.
        main_content = (
            soup.find("article")
            or soup.find("main")
            or soup.find(attrs={"role": "main"})
            or soup.find(attrs={"id": re.compile(r"(content|article|post|entry)", re.I)})
            or soup.find(attrs={"class": re.compile(r"(content|article|post|entry)", re.I)})
        )

        if not main_content:
            main_content = soup.find("body") or soup

        # Convert to text with basic structure preservation.
        # NOTE(review): descendants visits nested tags too, so a
        # <pre><code>...</code></pre> block appears to emit its text twice
        # (once per matching tag) — confirm whether that is intended.
        text_parts: list[str] = []
        for element in main_content.descendants:
            if isinstance(element, Tag):
                if element.name in ("h1", "h2", "h3", "h4", "h5", "h6"):
                    level = int(element.name[1])
                    heading_text = element.get_text(strip=True)
                    if heading_text:
                        text_parts.append(f"\n{'#' * level} {heading_text}\n")
                elif element.name == "p":
                    para_text = element.get_text(separator=" ", strip=True)
                    if para_text:
                        text_parts.append(f"\n{para_text}\n")
                elif element.name in ("pre", "code"):
                    code_text = element.get_text()
                    if code_text and code_text.strip():
                        # Detect language from class if available
                        classes = element.get("class", [])
                        lang = ""
                        for cls in classes:
                            if isinstance(cls, str) and (
                                cls.startswith("language-") or cls.startswith("lang-")
                            ):
                                lang = cls.split("-", 1)[1]
                                break
                        text_parts.append(f"\n```{lang}\n{code_text.strip()}\n```\n")
                elif element.name == "li":
                    li_text = element.get_text(separator=" ", strip=True)
                    if li_text:
                        text_parts.append(f"- {li_text}")
                elif element.name == "blockquote":
                    bq_text = element.get_text(separator=" ", strip=True)
                    if bq_text:
                        text_parts.append(f"\n> {bq_text}\n")

        text = "\n".join(text_parts).strip()

        # Collapse excessive whitespace
        text = re.sub(r"\n{4,}", "\n\n\n", text)

        # Truncate if too long
        if len(text) > _MAX_ARTICLE_TEXT_LENGTH:
            text = text[:_MAX_ARTICLE_TEXT_LENGTH] + "\n\n[Content truncated]"

        return text
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Categorization helpers
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
def _collect_all_categories(self, articles: list[dict[str, Any]]) -> set[str]:
|
|
"""Collect all unique category/tag strings across articles."""
|
|
categories: set[str] = set()
|
|
for article in articles:
|
|
for cat in article.get("categories", []):
|
|
if cat:
|
|
categories.add(cat)
|
|
return categories
|
|
|
|
def _html_to_text(self, html_fragment: str) -> str:
|
|
"""Convert an HTML fragment to plain text, stripping all tags."""
|
|
if not html_fragment:
|
|
return ""
|
|
soup = BeautifulSoup(html_fragment, "html.parser")
|
|
text = soup.get_text(separator=" ", strip=True)
|
|
# Collapse multiple spaces
|
|
text = re.sub(r"\s+", " ", text).strip()
|
|
return text
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Skill generation — reference files
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
    def _generate_reference_file(self, cat_key: str, cat_data: dict[str, Any]) -> None:
        """Generate a reference markdown file for a category of articles.

        Writes {skill_dir}/references/{sanitized title}.md with one section
        per article (metadata, summary, inline content, scraped full text).

        Args:
            cat_key: Normalized category key (unused here; the filename is
                re-derived from the raw title).
            cat_data: {"title": ..., "articles": [...]} as produced by
                categorize_content().
        """
        safe_name = self._sanitize_filename(cat_data["title"])
        filepath = f"{self.skill_dir}/references/{safe_name}.md"

        articles = cat_data["articles"]

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"# {cat_data['title']}\n\n")
            f.write(f"**Articles:** {len(articles)}\n\n")
            f.write("---\n\n")

            for article in articles:
                f.write(f"## {article.get('title', 'Untitled')}\n\n")

                # Metadata block
                if article.get("author"):
                    f.write(f"**Author:** {article['author']}\n\n")
                if article.get("published"):
                    f.write(f"**Published:** {article['published']}\n\n")
                if article.get("link"):
                    f.write(f"**Link:** {article['link']}\n\n")
                if article.get("categories"):
                    tags = ", ".join(article["categories"])
                    f.write(f"**Tags:** {tags}\n\n")

                # Summary
                summary = article.get("summary", "")
                if summary:
                    f.write("### Summary\n\n")
                    f.write(f"{summary}\n\n")

                # Inline content from feed (if present and not a duplicate
                # of the summary)
                inline_content = article.get("content", "")
                if inline_content and inline_content != summary:
                    f.write("### Content\n\n")
                    f.write(f"{inline_content}\n\n")

                # Full scraped text (set only when links were followed)
                full_text = article.get("full_text", "")
                if full_text:
                    f.write("### Full Article\n\n")
                    f.write(f"{full_text}\n\n")

                f.write("---\n\n")

        print(f" Generated: {filepath}")
|
|
|
|
    def _generate_index(self, categorized: dict[str, dict[str, Any]]) -> None:
        """Generate the reference index file with category links and statistics.

        Writes {skill_dir}/references/index.md. Assumes self.extracted_data
        is populated (build_skill() checks this before calling).
        """
        filepath = f"{self.skill_dir}/references/index.md"

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"# {self.name.title()} Feed Reference Index\n\n")

            feed_meta = self.extracted_data.get("feed_metadata", {})
            if feed_meta.get("title"):
                f.write(f"**Feed:** {feed_meta['title']}\n\n")
            if feed_meta.get("link"):
                f.write(f"**Source:** {feed_meta['link']}\n\n")

            f.write("## Categories\n\n")

            # Note: total_articles here sums per-category counts, so an
            # article with several tags is counted once per category.
            total_articles = 0
            for cat_key, cat_data in sorted(categorized.items()):
                safe_name = self._sanitize_filename(cat_data["title"])
                count = len(cat_data["articles"])
                total_articles += count
                f.write(f"- [{cat_data['title']}]({safe_name}.md) ({count} articles)\n")

            f.write(f"\n**Total articles:** {total_articles}\n\n")

            # Statistics
            f.write("## Statistics\n\n")
            f.write(f"- Total articles: {self.extracted_data.get('total_articles', 0)}\n")
            f.write(f"- Feed type: {self.extracted_data.get('feed_type', FEED_TYPE_UNKNOWN)}\n")
            f.write(
                f"- Links followed: "
                f"{'Yes' if self.extracted_data.get('followed_links') else 'No'}\n"
            )

            all_cats = self.extracted_data.get("all_categories", [])
            if all_cats:
                f.write(f"- Unique tags: {len(all_cats)}\n")

            # Author summary — top 20 authors by article count
            author_counts = self._count_authors()
            if author_counts:
                f.write(f"\n## Authors ({len(author_counts)})\n\n")
                for author, count in sorted(
                    author_counts.items(), key=lambda x: x[1], reverse=True
                )[:20]:
                    f.write(f"- {author}: {count} articles\n")

        print(f" Generated: {filepath}")
|
|
|
|
    def _generate_skill_md(self, categorized: dict[str, dict[str, Any]]) -> None:
        """Generate the main SKILL.md file with feed overview and navigation.

        Writes YAML frontmatter (name/description), feed information, usage
        guidance, article overview, recent articles, author/tag summaries,
        statistics, and navigation links to the reference files. Assumes
        self.extracted_data is populated (build_skill() checks this).
        """
        filepath = f"{self.skill_dir}/SKILL.md"

        feed_meta = self.extracted_data.get("feed_metadata", {})
        feed_title = feed_meta.get("title", self.name.title())
        feed_type = self.extracted_data.get("feed_type", FEED_TYPE_UNKNOWN)

        # Skill name for frontmatter (lowercase, hyphens, max 64 chars)
        skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]

        # Truncate description to the 1024-char frontmatter limit
        desc = self.description[:1024] if len(self.description) > 1024 else self.description

        with open(filepath, "w", encoding="utf-8") as f:
            # YAML frontmatter
            f.write("---\n")
            f.write(f"name: {skill_name}\n")
            f.write(f"description: {desc}\n")
            f.write("---\n\n")

            # Header
            f.write(f"# {feed_title} Feed Skill\n\n")
            f.write(f"{self.description}\n\n")

            # Feed Information
            f.write("## 📡 Feed Information\n\n")
            f.write(f"**Feed Title:** {feed_title}\n\n")
            f.write(f"**Feed Type:** {feed_type}\n\n")
            if feed_meta.get("link"):
                f.write(f"**Website:** {feed_meta['link']}\n\n")
            if feed_meta.get("language"):
                f.write(f"**Language:** {feed_meta['language']}\n\n")
            if feed_meta.get("description"):
                feed_desc = feed_meta["description"]
                if len(feed_desc) > 300:
                    feed_desc = feed_desc[:297] + "..."
                f.write(f"**Description:** {feed_desc}\n\n")
            if feed_meta.get("generator"):
                f.write(f"**Generator:** {feed_meta['generator']}\n\n")
            if feed_meta.get("rights"):
                f.write(f"**Rights:** {feed_meta['rights']}\n\n")

            # When to Use
            f.write("## 💡 When to Use This Skill\n\n")
            f.write("Use this skill when you need to:\n")
            f.write(f"- Reference articles and content from {feed_title}\n")
            f.write("- Look up specific topics covered in the feed\n")
            f.write("- Find author perspectives and expert analysis\n")
            f.write("- Review recent posts and updates on the subject\n")
            f.write("- Explore categorized content by tags or topics\n\n")

            # Article Overview
            total_articles = self.extracted_data.get("total_articles", 0)
            f.write("## 📖 Article Overview\n\n")
            f.write(f"**Total Articles:** {total_articles}\n\n")

            # Category breakdown
            f.write("**Content by Category:**\n\n")
            for cat_key, cat_data in sorted(categorized.items()):
                count = len(cat_data["articles"])
                f.write(f"- **{cat_data['title']}**: {count} articles\n")
            f.write("\n")

            # Recent articles (top 10 in feed order — feeds are typically
            # newest-first, but this is not re-sorted by date here)
            articles = self.extracted_data.get("articles", [])
            recent = articles[:10]
            if recent:
                f.write("## 📰 Recent Articles\n\n")
                for article in recent:
                    title = article.get("title", "Untitled")
                    published = article.get("published", "")
                    author = article.get("author", "")
                    link = article.get("link", "")

                    f.write(f"### {title}\n\n")
                    meta_parts: list[str] = []
                    if published:
                        meta_parts.append(f"**Published:** {published}")
                    if author:
                        meta_parts.append(f"**Author:** {author}")
                    if meta_parts:
                        f.write(" | ".join(meta_parts) + "\n\n")

                    summary = article.get("summary", "")
                    if summary:
                        # Show first 200 chars of summary
                        short = summary[:200] + "..." if len(summary) > 200 else summary
                        f.write(f"{short}\n\n")

                    if link:
                        f.write(f"[Read more]({link})\n\n")

            # Authors — top 15 by article count
            author_counts = self._count_authors()
            if author_counts:
                f.write(f"## ✍️ Authors ({len(author_counts)})\n\n")
                for author, count in sorted(
                    author_counts.items(), key=lambda x: x[1], reverse=True
                )[:15]:
                    f.write(f"- **{author}**: {count} articles\n")
                f.write("\n")

            # All categories/tags (first 50, then a "... and N more" note)
            all_cats = self.extracted_data.get("all_categories", [])
            if all_cats:
                f.write(f"## 🏷️ Tags ({len(all_cats)})\n\n")
                f.write(", ".join(f"`{cat}`" for cat in all_cats[:50]))
                if len(all_cats) > 50:
                    f.write(f" ... and {len(all_cats) - 50} more")
                f.write("\n\n")

            # Statistics
            f.write("## 📊 Feed Statistics\n\n")
            f.write(f"- **Total Articles**: {total_articles}\n")
            f.write(f"- **Feed Type**: {feed_type}\n")
            f.write(f"- **Categories/Tags**: {len(all_cats)}\n")
            f.write(f"- **Authors**: {len(author_counts)}\n")
            followed = self.extracted_data.get("followed_links", False)
            f.write(f"- **Full Content Scraped**: {'Yes' if followed else 'No'}\n\n")

            # Date range
            date_range = self._get_date_range()
            if date_range:
                f.write(f"- **Date Range**: {date_range[0]} to {date_range[1]}\n\n")

            # Navigation
            f.write("## 🗺️ Navigation\n\n")
            f.write("**Reference Files:**\n\n")
            for cat_key, cat_data in sorted(categorized.items()):
                safe_name = self._sanitize_filename(cat_data["title"])
                f.write(
                    f"- `references/{safe_name}.md` - {cat_data['title']}"
                    f" ({len(cat_data['articles'])} articles)\n"
                )
            f.write("\n")
            f.write("See `references/index.md` for complete feed structure.\n\n")

            # Footer
            f.write("---\n\n")
            f.write("**Generated by Skill Seeker** | RSS/Atom Feed Scraper\n")

        # Re-read purely to report the generated line count
        with open(filepath, encoding="utf-8") as f:
            line_count = len(f.read().split("\n"))
        print(f" Generated: {filepath} ({line_count} lines)")
|
|
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
# Utility helpers
|
|
# ──────────────────────────────────────────────────────────────────────
|
|
|
|
def _count_authors(self) -> dict[str, int]:
|
|
"""Count articles per author."""
|
|
if not self.extracted_data:
|
|
return {}
|
|
counts: dict[str, int] = {}
|
|
for article in self.extracted_data.get("articles", []):
|
|
author = article.get("author", "").strip()
|
|
if author:
|
|
counts[author] = counts.get(author, 0) + 1
|
|
return counts
|
|
|
|
def _get_date_range(self) -> tuple[str, str] | None:
|
|
"""Get the date range (earliest, latest) of articles, or None."""
|
|
if not self.extracted_data:
|
|
return None
|
|
dates: list[str] = []
|
|
for article in self.extracted_data.get("articles", []):
|
|
iso = article.get("published_iso", "")
|
|
if iso:
|
|
dates.append(iso)
|
|
if not dates:
|
|
return None
|
|
dates.sort()
|
|
return (dates[0][:10], dates[-1][:10])
|
|
|
|
def _sanitize_filename(self, name: str) -> str:
|
|
"""Convert a string to a safe filename."""
|
|
safe = re.sub(r"[^\w\s-]", "", name.lower())
|
|
safe = re.sub(r"[-\s]+", "_", safe)
|
|
return safe or "unnamed"
|
|
|
|
|
|
# ──────────────────────────────────────────────────────────────────────────
|
|
# CLI entry point
|
|
# ──────────────────────────────────────────────────────────────────────────
|
|
|
|
|
|
def _override_enhance_level_default(parser: argparse.ArgumentParser) -> None:
    """Lower the shared --enhance-level default to 0 for RSS feeds.

    Reaches into ``parser._actions`` because the option is registered by the
    shared ``add_all_standard_arguments`` helper and argparse exposes no
    public API to rewrite an already-registered action's default *and* help
    text (``set_defaults`` would change the value but leave stale help).
    """
    for action in parser._actions:
        if hasattr(action, "dest") and action.dest == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for RSS), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )


def _add_rss_arguments(parser: argparse.ArgumentParser) -> None:
    """Register the RSS-specific CLI options on *parser*."""
    parser.add_argument(
        "--feed-url",
        type=str,
        help="URL of the RSS/Atom feed to scrape",
        metavar="URL",
    )
    parser.add_argument(
        "--feed-path",
        type=str,
        help="Local file path to an RSS/Atom XML file",
        metavar="PATH",
    )
    # --follow-links / --no-follow-links share one dest; default is enabled.
    parser.add_argument(
        "--follow-links",
        action="store_true",
        default=True,
        dest="follow_links",
        help="Follow article links to scrape full content (default: enabled)",
    )
    parser.add_argument(
        "--no-follow-links",
        action="store_false",
        dest="follow_links",
        help="Do not follow article links — use feed content only",
    )
    parser.add_argument(
        "--max-articles",
        type=int,
        default=50,
        metavar="N",
        help="Maximum number of articles to process (default: 50)",
    )
    parser.add_argument(
        "--from-json",
        type=str,
        help="Build skill from previously extracted JSON file",
        metavar="FILE",
    )


def _print_dry_run(args: argparse.Namespace) -> int:
    """Print the planned extraction without doing any work; return 0."""
    source = (
        getattr(args, "feed_url", None)
        or getattr(args, "feed_path", None)
        or getattr(args, "from_json", None)
        or "(none)"
    )
    print(f"\n{'=' * 60}")
    print("DRY RUN: RSS/Atom Feed Extraction")
    print(f"{'=' * 60}")
    print(f"Source: {source}")
    print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}")
    print(f"Follow links: {getattr(args, 'follow_links', True)}")
    print(f"Max articles: {getattr(args, 'max_articles', 50)}")
    print(f"Enhance level: {getattr(args, 'enhance_level', 0)}")
    print("\n✅ Dry run complete")
    return 0


def _build_from_json(args: argparse.Namespace) -> int:
    """Rebuild a skill from a previously extracted JSON file.

    Returns:
        0 on success, 1 on any build error.
    """
    # Recover a default skill name from the "<name>_extracted.json" pattern.
    name = Path(args.from_json).stem.replace("_extracted", "")
    config: dict[str, Any] = {
        "name": getattr(args, "name", None) or name,
        "description": getattr(args, "description", None)
        or f"Use when referencing {name} feed content",
    }
    try:
        converter = RssToSkillConverter(config)
        converter.load_extracted_data(args.from_json)
        converter.build_skill()
    except Exception as e:
        print(f"\n❌ Error: {e}", file=sys.stderr)
        return 1
    return 0


def _enhance_locally(skill_dir) -> None:
    """Run the LOCAL (Claude Code) enhancement pass on *skill_dir*.

    Shared by the no-API-key path and the API-import-failure fallback so the
    enhancer invocation exists in exactly one place.
    """
    from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer

    enhancer = LocalSkillEnhancer(Path(skill_dir))
    enhancer.run(headless=True)
    print("✅ Local enhancement complete!")


def _run_enhancement(args, skill_dir, workflow_executed, workflow_name) -> None:
    """Run traditional AI enhancement (API mode if a key is set, else LOCAL).

    Args:
        args: Parsed CLI namespace (reads api_key / enhance_level).
        skill_dir: Path to the generated skill directory.
        workflow_executed: Whether the workflow system already ran.
        workflow_name: Comma-joined workflow names, or None.
    """
    api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
    mode = "API" if api_key else "LOCAL"

    print("\n" + "=" * 80)
    print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
    print("=" * 80)
    if workflow_executed:
        print(f" Running after workflow: {workflow_name}")
        print(
            " (Workflow provides specialized analysis, "
            "enhancement provides general improvements)"
        )
        print("")

    if api_key:
        try:
            from skill_seekers.cli.enhance_skill import enhance_skill_md

            enhance_skill_md(skill_dir, api_key)
            print("✅ API enhancement complete!")
            return
        except ImportError:
            # Optional API module not installed — degrade gracefully.
            print("❌ API enhancement not available. Falling back to LOCAL mode...")
    _enhance_locally(skill_dir)


def _extract_and_build(args: argparse.Namespace) -> int:
    """Run the full pipeline: extract feed, build skill, run enhancements.

    Returns:
        0 on success, 1 on extraction/build failure.
    """
    if not getattr(args, "name", None):
        # Auto-detect name from the feed URL hostname or local file stem.
        if getattr(args, "feed_url", None):
            from urllib.parse import urlparse

            parsed_url = urlparse(args.feed_url)
            args.name = parsed_url.hostname.replace(".", "-") if parsed_url.hostname else "feed"
        elif getattr(args, "feed_path", None):
            args.name = Path(args.feed_path).stem

    config = {
        "name": args.name,
        "feed_url": getattr(args, "feed_url", "") or "",
        "feed_path": getattr(args, "feed_path", "") or "",
        "follow_links": getattr(args, "follow_links", True),
        "max_articles": getattr(args, "max_articles", 50),
        "description": getattr(args, "description", None),
    }

    try:
        converter = RssToSkillConverter(config)

        if not converter.extract_feed():
            print("\n❌ Feed extraction failed — see error above", file=sys.stderr)
            return 1

        converter.build_skill()

        # Enhancement Workflow Integration
        from skill_seekers.cli.workflow_runner import run_workflows

        workflow_executed, workflow_names = run_workflows(args)
        workflow_name = ", ".join(workflow_names) if workflow_names else None

        # Traditional enhancement (complements workflow system)
        if getattr(args, "enhance_level", 0) > 0:
            _run_enhancement(args, converter.skill_dir, workflow_executed, workflow_name)
    except RuntimeError as e:
        # Expected failure modes raised by the converter.
        print(f"\n❌ Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"\n❌ Unexpected error during feed processing: {e}", file=sys.stderr)
        import traceback

        traceback.print_exc()
        return 1

    return 0


def main() -> int:
    """CLI entry point for the RSS/Atom feed scraper.

    Parses arguments, then dispatches to one of three paths:
    dry run (print plan only), build-from-JSON, or full feed extraction.

    Returns:
        Process exit code: 0 on success, 1 on failure. (Previously some
        failure paths called ``sys.exit(1)`` directly; returning 1 is
        equivalent because ``__main__`` wraps this in ``sys.exit(main())``.)
    """
    from .arguments.common import add_all_standard_arguments

    parser = argparse.ArgumentParser(
        description="Convert RSS/Atom feed to AI-ready skill",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Examples:\n"
            " %(prog)s --feed-url https://example.com/feed.xml --name myblog\n"
            " %(prog)s --feed-path ./feed.xml --name myblog\n"
            " %(prog)s --feed-url https://example.com/rss --no-follow-links --name myblog\n"
            " %(prog)s --from-json myblog_extracted.json\n"
        ),
    )

    # Standard arguments (name, description, output, enhance-level, etc.)
    add_all_standard_arguments(parser)
    _override_enhance_level_default(parser)
    _add_rss_arguments(parser)

    args = parser.parse_args()

    # Map the standard --quiet/--verbose flags onto the root logger.
    if getattr(args, "quiet", False):
        logging.getLogger().setLevel(logging.WARNING)
    elif getattr(args, "verbose", False):
        logging.getLogger().setLevel(logging.DEBUG)

    if getattr(args, "dry_run", False):
        return _print_dry_run(args)

    # Exactly one input source is required.
    has_source = (
        getattr(args, "feed_url", None)
        or getattr(args, "feed_path", None)
        or getattr(args, "from_json", None)
    )
    if not has_source:
        parser.error("Must specify --feed-url, --feed-path, or --from-json")

    if getattr(args, "from_json", None):
        return _build_from_json(args)

    return _extract_and_build(args)
|
|
|
|
|
|
# Support direct invocation (`python3 -m skill_seekers.cli.rss_scraper ...`)
# in addition to the `skill-seekers rss` console entry point; the process
# exit code is whatever main() returns.
if __name__ == "__main__":
    sys.exit(main())
|