Files
skill-seekers-reference/src/skill_seekers/cli/rss_scraper.py
yusyus 53b911b697 feat: add 10 new skill source types (17 total) with full pipeline integration
Add Jupyter Notebook, Local HTML, OpenAPI/Swagger, AsciiDoc, PowerPoint,
RSS/Atom, Man Pages, Confluence, Notion, and Slack/Discord Chat as new
skill source types. Each type is fully integrated across:

- Standalone CLI commands (skill-seekers <type>)
- Auto-detection via 'skill-seekers create' (file extension + content sniffing)
- Unified multi-source configs (scraped_data, dispatch, config validation)
- Unified skill builder (generic merge + source-attributed synthesis)
- MCP server (scrape_generic tool with per-type flag mapping)
- pyproject.toml (entry points, optional deps, [all] group)

Also fixes: EPUB unified pipeline gap, missing word/video config validators,
OpenAPI yaml import guard, MCP flag mismatch for all 10 types, stale
docstrings, and adds 77 integration tests + complex-merge workflow.

50 files changed, +20,201 lines
2026-03-15 15:30:15 +03:00

1088 lines
44 KiB
Python

#!/usr/bin/env python3
"""
RSS/Atom Feed to Skill Converter
Converts RSS 2.0, RSS 1.0 (RDF), and Atom feeds into AI-ready skills.
Uses feedparser for feed parsing, optionally follows article links to scrape
full content using requests + BeautifulSoup.
Supports both remote feed URLs and local feed XML files. Extracts article
metadata (title, author, published date, categories), feed-level metadata
(title, description, link, language), and optionally the full article text
from linked pages.
Usage:
skill-seekers rss --feed-url https://example.com/feed.xml --name myblog
skill-seekers rss --feed-path ./feed.xml --name myblog
skill-seekers rss --feed-url https://example.com/rss --no-follow-links --name myblog
skill-seekers rss --from-json myblog_extracted.json
python3 -m skill_seekers.cli.rss_scraper --feed-url https://example.com/atom.xml --name myblog
"""
import argparse
import hashlib
import json
import logging
import os
import re
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Any
# Optional dependency guard — feedparser is not in core deps
try:
import feedparser # noqa: F401
FEEDPARSER_AVAILABLE = True
except ImportError:
FEEDPARSER_AVAILABLE = False
# BeautifulSoup is a core dependency (always available)
from bs4 import BeautifulSoup, Comment, Tag
# Module-level logger; main() adjusts the root logger level for --quiet/--verbose.
logger = logging.getLogger(__name__)
# Feed type constants — human-readable labels returned by
# RssToSkillConverter._detect_feed_type and stored under "feed_type"
# in the extracted data.
FEED_TYPE_RSS_20 = "RSS 2.0"
FEED_TYPE_RSS_10 = "RSS 1.0 (RDF)"
FEED_TYPE_ATOM = "Atom"
FEED_TYPE_UNKNOWN = "Unknown"
# Default request headers sent with every article-page request made by
# RssToSkillConverter._scrape_article_content
_DEFAULT_HEADERS = {
    "User-Agent": "SkillSeekers/RSS-Scraper (https://github.com/skill-seekers)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}
# Tags removed (together with their contents) from scraped article HTML
# before any text is extracted
_STRIP_TAGS = {"script", "style", "nav", "footer", "header", "aside", "noscript", "iframe"}
# Maximum length for a single article's scraped text (characters); longer
# text is cut at this length and a "[Content truncated]" marker is appended
_MAX_ARTICLE_TEXT_LENGTH = 50_000
# Delay between HTTP requests when following links (seconds)
_REQUEST_DELAY = 1.0
def _check_feedparser_deps() -> None:
    """Ensure the optional ``feedparser`` dependency is usable.

    Raises:
        RuntimeError: If the module-level import guard could not import
            feedparser. The message carries install instructions.
    """
    if FEEDPARSER_AVAILABLE:
        return
    install_hint = (
        "feedparser is required for RSS/Atom feed support.\n"
        'Install with: pip install "skill-seekers[rss]"\n'
        "Or: pip install feedparser"
    )
    raise RuntimeError(install_hint)
def infer_description_from_feed(
feed_meta: dict[str, Any] | None = None,
name: str = "",
) -> str:
"""Infer skill description from feed-level metadata.
Tries to build a meaningful "Use when..." description from the feed
title and subtitle/description fields.
Args:
feed_meta: Feed metadata dict with title, description, link, etc.
name: Skill name for fallback.
Returns:
Description string suitable for "Use when..." format.
"""
if feed_meta:
desc = feed_meta.get("description", "")
if desc and len(desc) > 20:
if len(desc) > 150:
desc = desc[:147] + "..."
return f"Use when referencing {desc.lower()}"
title = feed_meta.get("title", "")
if title and len(title) > 5:
return f"Use when referencing articles from {title}"
return (
f"Use when referencing {name} feed content"
if name
else "Use when referencing this feed content"
)
class RssToSkillConverter:
"""Convert RSS/Atom feeds to AI-ready skills.
Parses RSS 2.0, RSS 1.0 (RDF), and Atom feeds using feedparser.
Optionally follows article links to scrape full page content via
requests + BeautifulSoup.
"""
def __init__(self, config: dict[str, Any]) -> None:
"""Initialize the converter with configuration.
Args:
config: Dictionary with name (required), feed_url, feed_path,
follow_links (default True), max_articles (default 50),
and description (optional).
"""
self.config = config
self.name: str = config["name"]
self.feed_url: str = config.get("feed_url", "")
self.feed_path: str = config.get("feed_path", "")
self.follow_links: bool = config.get("follow_links", True)
self.max_articles: int = config.get("max_articles", 50)
self.description: str = config.get(
"description", f"Use when referencing {self.name} feed content"
)
# Output paths
self.skill_dir: str = f"output/{self.name}"
self.data_file: str = f"output/{self.name}_extracted.json"
# Internal state
self.extracted_data: dict[str, Any] | None = None
# ──────────────────────────────────────────────────────────────────────
# Public API
# ──────────────────────────────────────────────────────────────────────
def extract_feed(self) -> bool:
    """Parse the RSS/Atom feed and extract article data.

    Parses the feed, extracts feed metadata and articles, optionally
    follows each article link to scrape the full page text, and writes
    the result to ``self.data_file`` as JSON. The same data is kept in
    ``self.extracted_data``.

    Returns:
        True on success.

    Raises:
        RuntimeError: If feedparser is not installed, no feed source is
            configured, or the feed cannot be parsed.
        FileNotFoundError: If ``feed_path`` does not exist.
    """
    _check_feedparser_deps()
    source = self.feed_url or self.feed_path
    print(f"\n🔍 Extracting RSS/Atom feed: {source}")

    # Parse the feed
    parsed = self._parse_feed()

    # Detect feed type
    feed_type = self._detect_feed_type(parsed)
    print(f" Feed type: {feed_type}")

    # Extract feed-level metadata
    feed_meta = self._extract_feed_metadata(parsed)
    print(f" Title: {feed_meta.get('title', 'Unknown')}")
    print(f" Link: {feed_meta.get('link', 'N/A')}")
    print(f" Language: {feed_meta.get('language', 'N/A')}")

    # Infer the description from feed metadata unless the caller supplied
    # one. A key that is present but None/empty counts as "not supplied";
    # a plain membership test would skip inference in that case.
    if not self.config.get("description"):
        self.description = infer_description_from_feed(feed_meta, self.name)

    # Extract articles
    articles = self._extract_articles(parsed)
    print(f" Articles found: {len(articles)}")

    # Optionally scrape full article content
    if self.follow_links:
        print(f"\n🌐 Following article links (max {len(articles)})...")
        scraped_count = 0
        for i, article in enumerate(articles):
            link = article.get("link", "")
            if not link:
                continue
            print(f" [{i + 1}/{len(articles)}] {link[:80]}...")
            content = self._scrape_article_content(link)
            if content:
                article["full_text"] = content
                scraped_count += 1
            # Be polite — delay between requests (not after the last entry)
            if i < len(articles) - 1:
                time.sleep(_REQUEST_DELAY)
        print(f" Scraped full content for {scraped_count}/{len(articles)} articles")
    else:
        print(" Skipping link following (--no-follow-links)")

    # Collect the distinct categories/tags used across all articles
    all_categories = self._collect_all_categories(articles)

    # Build result data
    result_data: dict[str, Any] = {
        "source": source,
        "feed_type": feed_type,
        "feed_metadata": feed_meta,
        "total_articles": len(articles),
        "followed_links": self.follow_links,
        "all_categories": sorted(all_categories),
        "articles": articles,
    }

    # Persist extracted data
    os.makedirs(os.path.dirname(self.data_file) or ".", exist_ok=True)
    with open(self.data_file, "w", encoding="utf-8") as f:
        json.dump(result_data, f, indent=2, ensure_ascii=False, default=str)
    print(f"\n💾 Saved extracted data to: {self.data_file}")

    self.extracted_data = result_data
    print(
        f"✅ Extracted {len(articles)} articles ({len(all_categories)} unique categories/tags)"
    )
    return True
def load_extracted_data(self, json_path: str) -> bool:
"""Load previously extracted data from a JSON file."""
print(f"\n📂 Loading extracted data from: {json_path}")
if not os.path.exists(json_path):
raise FileNotFoundError(f"Extracted data file not found: {json_path}")
with open(json_path, encoding="utf-8") as f:
self.extracted_data = json.load(f)
total = self.extracted_data.get(
"total_articles", len(self.extracted_data.get("articles", []))
)
print(f"✅ Loaded {total} articles")
return True
def categorize_content(self) -> dict[str, dict[str, Any]]:
"""Categorize articles by their feed categories/tags."""
print("\n📋 Categorizing content by feed tags...")
if not self.extracted_data:
raise RuntimeError("No extracted data available. Call extract_feed() first.")
articles = self.extracted_data.get("articles", [])
categorized: dict[str, dict[str, Any]] = {}
for article in articles:
cats = article.get("categories", [])
if not cats:
cats = ["uncategorized"]
for cat in cats:
cat_key = self._sanitize_filename(cat)
if cat_key not in categorized:
categorized[cat_key] = {
"title": cat,
"articles": [],
}
# Avoid duplicates if an article has overlapping normalized keys
article_id = article.get("id", article.get("link", ""))
existing_ids = {
a.get("id", a.get("link", "")) for a in categorized[cat_key]["articles"]
}
if article_id not in existing_ids:
categorized[cat_key]["articles"].append(article)
# If no categories at all, put everything in one group
if not categorized:
categorized["all_articles"] = {
"title": "All Articles",
"articles": articles,
}
print(f"✅ Created {len(categorized)} categories")
for cat_key, cat_data in categorized.items():
print(f" - {cat_data['title']}: {len(cat_data['articles'])} articles")
return categorized
def build_skill(self) -> None:
    """Write the full skill directory from the extracted data.

    Creates the ``references``, ``scripts`` and ``assets`` directories
    under ``self.skill_dir``, then writes one reference file per category,
    the reference index, and SKILL.md.

    Raises:
        RuntimeError: If no extracted data has been loaded yet.
    """
    print(f"\n🏗️ Building skill: {self.name}")
    if not self.extracted_data:
        raise RuntimeError("No extracted data available. Call extract_feed() first.")

    # Directory skeleton of the skill
    for subdir in ("references", "scripts", "assets"):
        os.makedirs(f"{self.skill_dir}/{subdir}", exist_ok=True)

    groups = self.categorize_content()

    # One markdown reference file per category
    print("\n📝 Generating reference files...")
    for group_key, group in groups.items():
        self._generate_reference_file(group_key, group)

    # Index first, then the top-level SKILL.md
    self._generate_index(groups)
    self._generate_skill_md(groups)

    print(f"\n✅ Skill built successfully: {self.skill_dir}/")
    print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/")
# ──────────────────────────────────────────────────────────────────────
# Feed parsing internals
# ──────────────────────────────────────────────────────────────────────
def _parse_feed(self) -> "feedparser.FeedParserDict":
    """Run feedparser on the configured source.

    A local ``feed_path`` takes precedence over ``feed_url``.

    Returns:
        The feedparser result object.

    Raises:
        FileNotFoundError: If ``feed_path`` is set but does not exist.
        RuntimeError: If neither source is configured, or feedparser
            flagged the feed as malformed and produced no entries.
    """
    import feedparser as fp

    if not self.feed_path and not self.feed_url:
        raise RuntimeError(
            "No feed source provided. Use feed_url (remote URL) or feed_path (local file)."
        )

    if self.feed_path:
        if not os.path.exists(self.feed_path):
            raise FileNotFoundError(f"Feed file not found: {self.feed_path}")
        logger.info("Parsing feed from local file: %s", self.feed_path)
        result = fp.parse(self.feed_path)
    else:
        logger.info("Fetching feed from URL: %s", self.feed_url)
        result = fp.parse(self.feed_url, agent="SkillSeekers/RSS-Scraper")

    # A "bozo" (malformed) feed is only fatal when nothing could be read
    # from it; partially readable feeds are still used.
    if result.bozo and not result.entries:
        reason = result.get("bozo_exception", "Unknown parse error")
        raise RuntimeError(f"Failed to parse feed: {reason}")
    return result
def _detect_feed_type(self, parsed: "feedparser.FeedParserDict") -> str:
    """Map the parsed feed's version string to one of the FEED_TYPE_* labels.

    Args:
        parsed: feedparser result; its ``version`` attribute and ``feed``
            dict are inspected.

    Returns:
        FEED_TYPE_ATOM, FEED_TYPE_RSS_20, FEED_TYPE_RSS_10, or
        FEED_TYPE_UNKNOWN when nothing matches.
    """
    tag = (getattr(parsed, "version", "") or "").lower()

    if "atom" in tag:
        return FEED_TYPE_ATOM
    if "rss20" in tag:
        return FEED_TYPE_RSS_20
    if "rss10" in tag or "rdf" in tag:
        return FEED_TYPE_RSS_10
    # Any remaining "rss..." version string is reported as RSS 2.0.
    if tag.startswith("rss"):
        return FEED_TYPE_RSS_20

    # Fallback heuristic: check feed dict for version clues
    channel = parsed.get("feed", {})
    if channel.get("xmlns", "").startswith("http://www.w3.org/2005/Atom"):
        return FEED_TYPE_ATOM
    return FEED_TYPE_RSS_20 if channel.get("rss_version") else FEED_TYPE_UNKNOWN
def _extract_feed_metadata(self, parsed: "feedparser.FeedParserDict") -> dict[str, Any]:
"""Extract feed-level metadata (title, description, link, language, etc.)."""
feed = parsed.get("feed", {})
# feedparser normalizes subtitle (Atom) and description (RSS)
description = feed.get("subtitle", "") or feed.get("description", "")
# Published / updated dates
published = feed.get("published", "") or feed.get("updated", "")
# Feed image (RSS <image>, Atom <icon>/<logo>)
image_url = ""
image_data = feed.get("image", {})
if isinstance(image_data, dict):
image_url = image_data.get("href", "") or image_data.get("url", "")
elif isinstance(image_data, str):
image_url = image_data
return {
"title": feed.get("title", "Untitled Feed"),
"description": description,
"link": feed.get("link", ""),
"language": feed.get("language", ""),
"author": feed.get("author", ""),
"published": published,
"generator": feed.get("generator", ""),
"image_url": image_url,
"rights": feed.get("rights", ""),
}
def _extract_articles(self, parsed: "feedparser.FeedParserDict") -> list[dict[str, Any]]:
"""Extract article entries (title, link, summary, date, author, categories)."""
articles: list[dict[str, Any]] = []
for entry in parsed.entries[: self.max_articles]:
# Unique identifier (Atom id, RSS guid, or link hash)
entry_id = entry.get("id", "") or entry.get("link", "")
if not entry_id:
entry_id = hashlib.sha256(entry.get("title", "").encode("utf-8")).hexdigest()[:16]
# Published date normalization
published = entry.get("published", "") or entry.get("updated", "")
published_parsed = entry.get("published_parsed") or entry.get("updated_parsed")
published_iso = ""
if published_parsed:
try:
dt = datetime(*published_parsed[:6])
published_iso = dt.isoformat()
except (TypeError, ValueError):
published_iso = published
# Categories / tags
categories: list[str] = []
for tag_data in entry.get("tags", []):
term = tag_data.get("term", "")
if term:
categories.append(term)
# Summary — feedparser may provide HTML; clean it
summary_raw = entry.get("summary", "") or entry.get("description", "")
summary_text = self._html_to_text(summary_raw) if summary_raw else ""
# Content — some feeds include full content inline
content_text = ""
content_list = entry.get("content", [])
if content_list and isinstance(content_list, list):
for content_block in content_list:
value = content_block.get("value", "")
if value:
content_text += self._html_to_text(value) + "\n\n"
content_text = content_text.strip()
# Author(s)
author = entry.get("author", "")
if not author:
authors_detail = entry.get("authors", [])
if authors_detail:
author = ", ".join(a.get("name", "") for a in authors_detail if a.get("name"))
article: dict[str, Any] = {
"id": entry_id,
"title": entry.get("title", "Untitled"),
"link": entry.get("link", ""),
"summary": summary_text,
"content": content_text,
"published": published,
"published_iso": published_iso,
"author": author,
"categories": categories,
}
articles.append(article)
return articles
def _scrape_article_content(self, url: str) -> str:
    """Download an article page and return its extracted text.

    This is best-effort: every failure is logged and reported as an
    empty string rather than raised.

    Args:
        url: Article URL taken from the feed entry.

    Returns:
        Extracted article text, or "" when ``requests`` is missing, the
        request fails, or the response is neither HTML nor XML.
    """
    try:
        import requests
    except ImportError:
        logger.warning(
            "requests library not available — cannot follow article links. "
            "Install with: pip install requests"
        )
        return ""

    try:
        page = requests.get(url, headers=_DEFAULT_HEADERS, timeout=15, allow_redirects=True)
        page.raise_for_status()
    except Exception as e:
        logger.debug("Failed to fetch %s: %s", url, e)
        return ""

    content_type = page.headers.get("Content-Type", "")
    kind = content_type.lower()
    if "html" in kind or "xml" in kind:
        return self._extract_article_text(page.text)

    logger.debug("Skipping non-HTML content at %s (type: %s)", url, content_type)
    return ""
def _extract_article_text(self, html: str) -> str:
    """Clean article HTML into markdown-flavoured text.

    Removes the tags listed in ``_STRIP_TAGS`` and HTML comments, picks
    the most likely article container (<article>, <main>, role="main",
    then an id/class containing content/article/post/entry, else <body>),
    and renders headings, paragraphs, code, list items and blockquotes.

    Each piece of content is rendered once: an element nested inside an
    element that is already rendered (for example <code> inside <pre> or
    <p>, or <p> inside <li>/<blockquote>) is skipped, because its text is
    already part of the enclosing element's output. <pre> blocks are
    always rendered as fences so code formatting is preserved.

    Args:
        html: Raw HTML of the article page.

    Returns:
        Extracted text, truncated to ``_MAX_ARTICLE_TEXT_LENGTH``
        characters with a "[Content truncated]" marker when longer.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Remove unwanted elements
    for tag_name in _STRIP_TAGS:
        for element in soup.find_all(tag_name):
            element.decompose()
    for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
        comment.extract()

    # Try to find the main article container
    container_hint = re.compile(r"(content|article|post|entry)", re.I)
    main_content = (
        soup.find("article")
        or soup.find("main")
        or soup.find(attrs={"role": "main"})
        or soup.find(attrs={"id": container_hint})
        or soup.find(attrs={"class": container_hint})
    )
    if not main_content:
        main_content = soup.find("body") or soup

    heading_tags = ("h1", "h2", "h3", "h4", "h5", "h6")
    rendered_tags = set(heading_tags) | {"p", "pre", "code", "li", "blockquote"}

    def nested_in_rendered_block(node: Tag) -> bool:
        """Return True if an ancestor below the container is itself rendered."""
        for ancestor in node.parents:
            if ancestor is main_content:
                return False
            if ancestor.name in rendered_tags:
                return True
        return False

    def fence_language(node: Tag) -> str:
        """Return the language named by a language-*/lang-* class, or ""."""
        for cls in node.get("class", []):
            if isinstance(cls, str) and cls.startswith(("language-", "lang-")):
                return cls.split("-", 1)[1]
        return ""

    # Convert to text with basic structure preservation
    text_parts: list[str] = []
    for element in main_content.descendants:
        if not isinstance(element, Tag) or element.name not in rendered_tags:
            continue
        # Walking all descendants would otherwise emit nested content twice.
        if element.name != "pre" and nested_in_rendered_block(element):
            continue

        if element.name in heading_tags:
            level = int(element.name[1])
            heading_text = element.get_text(strip=True)
            if heading_text:
                text_parts.append(f"\n{'#' * level} {heading_text}\n")
        elif element.name == "p":
            para_text = element.get_text(separator=" ", strip=True)
            if para_text:
                text_parts.append(f"\n{para_text}\n")
        elif element.name in ("pre", "code"):
            code_text = element.get_text()
            if code_text and code_text.strip():
                lang = fence_language(element)
                if not lang and element.name == "pre":
                    # The language class usually sits on the inner <code>.
                    inner_code = element.find("code")
                    if inner_code is not None:
                        lang = fence_language(inner_code)
                text_parts.append(f"\n```{lang}\n{code_text.strip()}\n```\n")
        elif element.name == "li":
            li_text = element.get_text(separator=" ", strip=True)
            if li_text:
                text_parts.append(f"- {li_text}")
        elif element.name == "blockquote":
            bq_text = element.get_text(separator=" ", strip=True)
            if bq_text:
                text_parts.append(f"\n> {bq_text}\n")

    text = "\n".join(text_parts).strip()
    # Collapse excessive whitespace
    text = re.sub(r"\n{4,}", "\n\n\n", text)
    # Truncate if too long
    if len(text) > _MAX_ARTICLE_TEXT_LENGTH:
        text = text[:_MAX_ARTICLE_TEXT_LENGTH] + "\n\n[Content truncated]"
    return text
# ──────────────────────────────────────────────────────────────────────
# Categorization helpers
# ──────────────────────────────────────────────────────────────────────
def _collect_all_categories(self, articles: list[dict[str, Any]]) -> set[str]:
"""Collect all unique category/tag strings across articles."""
categories: set[str] = set()
for article in articles:
for cat in article.get("categories", []):
if cat:
categories.add(cat)
return categories
def _html_to_text(self, html_fragment: str) -> str:
"""Convert an HTML fragment to plain text, stripping all tags."""
if not html_fragment:
return ""
soup = BeautifulSoup(html_fragment, "html.parser")
text = soup.get_text(separator=" ", strip=True)
# Collapse multiple spaces
text = re.sub(r"\s+", " ", text).strip()
return text
# ──────────────────────────────────────────────────────────────────────
# Skill generation — reference files
# ──────────────────────────────────────────────────────────────────────
def _generate_reference_file(self, cat_key: str, cat_data: dict[str, Any]) -> None:
"""Generate a reference markdown file for a category of articles."""
safe_name = self._sanitize_filename(cat_data["title"])
filepath = f"{self.skill_dir}/references/{safe_name}.md"
articles = cat_data["articles"]
with open(filepath, "w", encoding="utf-8") as f:
f.write(f"# {cat_data['title']}\n\n")
f.write(f"**Articles:** {len(articles)}\n\n")
f.write("---\n\n")
for article in articles:
f.write(f"## {article.get('title', 'Untitled')}\n\n")
# Metadata block
if article.get("author"):
f.write(f"**Author:** {article['author']}\n\n")
if article.get("published"):
f.write(f"**Published:** {article['published']}\n\n")
if article.get("link"):
f.write(f"**Link:** {article['link']}\n\n")
if article.get("categories"):
tags = ", ".join(article["categories"])
f.write(f"**Tags:** {tags}\n\n")
# Summary
summary = article.get("summary", "")
if summary:
f.write("### Summary\n\n")
f.write(f"{summary}\n\n")
# Inline content from feed (if present)
inline_content = article.get("content", "")
if inline_content and inline_content != summary:
f.write("### Content\n\n")
f.write(f"{inline_content}\n\n")
# Full scraped text
full_text = article.get("full_text", "")
if full_text:
f.write("### Full Article\n\n")
f.write(f"{full_text}\n\n")
f.write("---\n\n")
print(f" Generated: {filepath}")
def _generate_index(self, categorized: dict[str, dict[str, Any]]) -> None:
    """Write ``references/index.md``: category links, statistics, authors.

    Args:
        categorized: Mapping of category key to
            ``{"title": str, "articles": list}`` as produced by
            ``categorize_content``.
    """
    filepath = f"{self.skill_dir}/references/index.md"
    data = self.extracted_data

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(f"# {self.name.title()} Feed Reference Index\n\n")

        feed_meta = data.get("feed_metadata", {})
        if feed_meta.get("title"):
            f.write(f"**Feed:** {feed_meta['title']}\n\n")
        if feed_meta.get("link"):
            f.write(f"**Source:** {feed_meta['link']}\n\n")

        # Category list, sorted by key, linking to each reference file
        f.write("## Categories\n\n")
        listed_total = 0
        for _key, group in sorted(categorized.items()):
            target = self._sanitize_filename(group["title"])
            size = len(group["articles"])
            listed_total += size
            f.write(f"- [{group['title']}]({target}.md) ({size} articles)\n")
        f.write(f"\n**Total articles:** {listed_total}\n\n")

        # Statistics
        f.write("## Statistics\n\n")
        f.write(f"- Total articles: {data.get('total_articles', 0)}\n")
        f.write(f"- Feed type: {data.get('feed_type', FEED_TYPE_UNKNOWN)}\n")
        links_followed = "Yes" if data.get("followed_links") else "No"
        f.write(f"- Links followed: {links_followed}\n")
        tag_list = data.get("all_categories", [])
        if tag_list:
            f.write(f"- Unique tags: {len(tag_list)}\n")

        # Author summary: the 20 most frequent authors
        tally = self._count_authors()
        if tally:
            f.write(f"\n## Authors ({len(tally)})\n\n")
            ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
            for who, written in ranked[:20]:
                f.write(f"- {who}: {written} articles\n")

    print(f" Generated: {filepath}")
def _generate_skill_md(self, categorized: dict[str, dict[str, Any]]) -> None:
    """Generate the main SKILL.md file with feed overview and navigation.

    The document is assembled in memory and written in one go, so a
    failure while rendering does not leave a half-written SKILL.md.

    Args:
        categorized: Mapping of category key to
            ``{"title": str, "articles": list}`` as produced by
            ``categorize_content``.
    """
    filepath = f"{self.skill_dir}/SKILL.md"
    feed_meta = self.extracted_data.get("feed_metadata", {})
    feed_title = feed_meta.get("title", self.name.title())
    feed_type = self.extracted_data.get("feed_type", FEED_TYPE_UNKNOWN)

    # Skill name for frontmatter (lowercase, hyphens, max 64 chars)
    skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]

    # The description may be None when a caller passed an unset option.
    description = self.description or f"Use when referencing {self.name} feed content"
    # Frontmatter holds the description on a single "key: value" line, so
    # collapse newlines/whitespace runs (feed-derived text may contain
    # them) before truncating to 1024 characters.
    desc = " ".join(description.split())[:1024]

    out: list[str] = []
    add = out.append

    # YAML frontmatter
    add("---\n")
    add(f"name: {skill_name}\n")
    add(f"description: {desc}\n")
    add("---\n\n")

    # Header
    add(f"# {feed_title} Feed Skill\n\n")
    add(f"{description}\n\n")

    # Feed Information
    add("## 📡 Feed Information\n\n")
    add(f"**Feed Title:** {feed_title}\n\n")
    add(f"**Feed Type:** {feed_type}\n\n")
    if feed_meta.get("link"):
        add(f"**Website:** {feed_meta['link']}\n\n")
    if feed_meta.get("language"):
        add(f"**Language:** {feed_meta['language']}\n\n")
    if feed_meta.get("description"):
        feed_desc = feed_meta["description"]
        if len(feed_desc) > 300:
            feed_desc = feed_desc[:297] + "..."
        add(f"**Description:** {feed_desc}\n\n")
    if feed_meta.get("generator"):
        add(f"**Generator:** {feed_meta['generator']}\n\n")
    if feed_meta.get("rights"):
        add(f"**Rights:** {feed_meta['rights']}\n\n")

    # When to Use
    add("## 💡 When to Use This Skill\n\n")
    add("Use this skill when you need to:\n")
    add(f"- Reference articles and content from {feed_title}\n")
    add("- Look up specific topics covered in the feed\n")
    add("- Find author perspectives and expert analysis\n")
    add("- Review recent posts and updates on the subject\n")
    add("- Explore categorized content by tags or topics\n\n")

    # Article Overview
    total_articles = self.extracted_data.get("total_articles", 0)
    add("## 📖 Article Overview\n\n")
    add(f"**Total Articles:** {total_articles}\n\n")

    # Category breakdown
    add("**Content by Category:**\n\n")
    for _cat_key, cat_data in sorted(categorized.items()):
        count = len(cat_data["articles"])
        add(f"- **{cat_data['title']}**: {count} articles\n")
    add("\n")

    # Recent articles (first 10 in feed order)
    articles = self.extracted_data.get("articles", [])
    recent = articles[:10]
    if recent:
        add("## 📰 Recent Articles\n\n")
        for article in recent:
            title = article.get("title", "Untitled")
            published = article.get("published", "")
            author = article.get("author", "")
            link = article.get("link", "")
            add(f"### {title}\n\n")
            meta_parts: list[str] = []
            if published:
                meta_parts.append(f"**Published:** {published}")
            if author:
                meta_parts.append(f"**Author:** {author}")
            if meta_parts:
                add(" | ".join(meta_parts) + "\n\n")
            summary = article.get("summary", "")
            if summary:
                # Show first 200 chars of summary
                short = summary[:200] + "..." if len(summary) > 200 else summary
                add(f"{short}\n\n")
            if link:
                add(f"[Read more]({link})\n\n")

    # Authors (15 most frequent)
    author_counts = self._count_authors()
    if author_counts:
        add(f"## ✍️ Authors ({len(author_counts)})\n\n")
        for author, count in sorted(
            author_counts.items(), key=lambda x: x[1], reverse=True
        )[:15]:
            add(f"- **{author}**: {count} articles\n")
        add("\n")

    # All categories/tags (first 50 listed)
    all_cats = self.extracted_data.get("all_categories", [])
    if all_cats:
        add(f"## 🏷️ Tags ({len(all_cats)})\n\n")
        add(", ".join(f"`{cat}`" for cat in all_cats[:50]))
        if len(all_cats) > 50:
            add(f" ... and {len(all_cats) - 50} more")
        add("\n\n")

    # Statistics
    add("## 📊 Feed Statistics\n\n")
    add(f"- **Total Articles**: {total_articles}\n")
    add(f"- **Feed Type**: {feed_type}\n")
    add(f"- **Categories/Tags**: {len(all_cats)}\n")
    add(f"- **Authors**: {len(author_counts)}\n")
    followed = self.extracted_data.get("followed_links", False)
    add(f"- **Full Content Scraped**: {'Yes' if followed else 'No'}\n\n")

    # Date range
    date_range = self._get_date_range()
    if date_range:
        add(f"- **Date Range**: {date_range[0]} to {date_range[1]}\n\n")

    # Navigation
    add("## 🗺️ Navigation\n\n")
    add("**Reference Files:**\n\n")
    for _cat_key, cat_data in sorted(categorized.items()):
        safe_name = self._sanitize_filename(cat_data["title"])
        add(
            f"- `references/{safe_name}.md` - {cat_data['title']}"
            f" ({len(cat_data['articles'])} articles)\n"
        )
    add("\n")
    add("See `references/index.md` for complete feed structure.\n\n")

    # Footer
    add("---\n\n")
    add("**Generated by Skill Seeker** | RSS/Atom Feed Scraper\n")

    content = "".join(out)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)

    # Same count as reading the file back and splitting on newlines
    line_count = len(content.split("\n"))
    print(f" Generated: {filepath} ({line_count} lines)")
# ──────────────────────────────────────────────────────────────────────
# Utility helpers
# ──────────────────────────────────────────────────────────────────────
def _count_authors(self) -> dict[str, int]:
"""Count articles per author."""
if not self.extracted_data:
return {}
counts: dict[str, int] = {}
for article in self.extracted_data.get("articles", []):
author = article.get("author", "").strip()
if author:
counts[author] = counts.get(author, 0) + 1
return counts
def _get_date_range(self) -> tuple[str, str] | None:
"""Get the date range (earliest, latest) of articles, or None."""
if not self.extracted_data:
return None
dates: list[str] = []
for article in self.extracted_data.get("articles", []):
iso = article.get("published_iso", "")
if iso:
dates.append(iso)
if not dates:
return None
dates.sort()
return (dates[0][:10], dates[-1][:10])
def _sanitize_filename(self, name: str) -> str:
"""Convert a string to a safe filename."""
safe = re.sub(r"[^\w\s-]", "", name.lower())
safe = re.sub(r"[-\s]+", "_", safe)
return safe or "unnamed"
# ──────────────────────────────────────────────────────────────────────────
# CLI entry point
# ──────────────────────────────────────────────────────────────────────────
def main() -> int:
    """CLI entry point for the RSS/Atom feed scraper.

    Supports three modes: ``--dry-run`` (print the resolved settings and
    stop), ``--from-json`` (build a skill from previously extracted data),
    and feed extraction from ``--feed-url`` / ``--feed-path`` followed by
    the skill build, workflows, and optional AI enhancement.

    Returns:
        0 on success. Failures print an error to stderr and terminate
        the process with exit status 1 via ``sys.exit``.
    """
    from .arguments.common import add_all_standard_arguments

    parser = argparse.ArgumentParser(
        description="Convert RSS/Atom feed to AI-ready skill",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Examples:\n"
            " %(prog)s --feed-url https://example.com/feed.xml --name myblog\n"
            " %(prog)s --feed-path ./feed.xml --name myblog\n"
            " %(prog)s --feed-url https://example.com/rss --no-follow-links --name myblog\n"
            " %(prog)s --from-json myblog_extracted.json\n"
        ),
    )

    # Standard arguments (name, description, output, enhance-level, etc.)
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for RSS
    for action in parser._actions:
        if hasattr(action, "dest") and action.dest == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for RSS), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )

    # RSS-specific arguments
    parser.add_argument(
        "--feed-url",
        type=str,
        help="URL of the RSS/Atom feed to scrape",
        metavar="URL",
    )
    parser.add_argument(
        "--feed-path",
        type=str,
        help="Local file path to an RSS/Atom XML file",
        metavar="PATH",
    )
    parser.add_argument(
        "--follow-links",
        action="store_true",
        default=True,
        dest="follow_links",
        help="Follow article links to scrape full content (default: enabled)",
    )
    parser.add_argument(
        "--no-follow-links",
        action="store_false",
        dest="follow_links",
        help="Do not follow article links — use feed content only",
    )
    parser.add_argument(
        "--max-articles",
        type=int,
        default=50,
        metavar="N",
        help="Maximum number of articles to process (default: 50)",
    )
    parser.add_argument(
        "--from-json",
        type=str,
        help="Build skill from previously extracted JSON file",
        metavar="FILE",
    )

    args = parser.parse_args()

    # Set logging level
    if getattr(args, "quiet", False):
        logging.getLogger().setLevel(logging.WARNING)
    elif getattr(args, "verbose", False):
        logging.getLogger().setLevel(logging.DEBUG)

    # Handle --dry-run (runs before input validation, so a missing source
    # is reported as "(none)" instead of being an error)
    if getattr(args, "dry_run", False):
        source = (
            getattr(args, "feed_url", None)
            or getattr(args, "feed_path", None)
            or getattr(args, "from_json", None)
            or "(none)"
        )
        print(f"\n{'=' * 60}")
        print("DRY RUN: RSS/Atom Feed Extraction")
        print(f"{'=' * 60}")
        print(f"Source: {source}")
        print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}")
        print(f"Follow links: {getattr(args, 'follow_links', True)}")
        print(f"Max articles: {getattr(args, 'max_articles', 50)}")
        print(f"Enhance level: {getattr(args, 'enhance_level', 0)}")
        print("\n✅ Dry run complete")
        return 0

    # Validate inputs
    has_source = (
        getattr(args, "feed_url", None)
        or getattr(args, "feed_path", None)
        or getattr(args, "from_json", None)
    )
    if not has_source:
        parser.error("Must specify --feed-url, --feed-path, or --from-json")

    # Build from JSON workflow
    if getattr(args, "from_json", None):
        name = Path(args.from_json).stem.replace("_extracted", "")
        config: dict[str, Any] = {
            "name": getattr(args, "name", None) or name,
            "description": getattr(args, "description", None)
            or f"Use when referencing {name} feed content",
        }
        try:
            converter = RssToSkillConverter(config)
            converter.load_extracted_data(args.from_json)
            converter.build_skill()
        except Exception as e:
            print(f"\n❌ Error: {e}", file=sys.stderr)
            sys.exit(1)
        return 0

    # Feed extraction workflow
    if not getattr(args, "name", None):
        # Auto-detect name from URL or file path
        if getattr(args, "feed_url", None):
            from urllib.parse import urlparse

            parsed_url = urlparse(args.feed_url)
            args.name = parsed_url.hostname.replace(".", "-") if parsed_url.hostname else "feed"
        elif getattr(args, "feed_path", None):
            args.name = Path(args.feed_path).stem

    config = {
        "name": args.name,
        "feed_url": getattr(args, "feed_url", "") or "",
        "feed_path": getattr(args, "feed_path", "") or "",
        "follow_links": getattr(args, "follow_links", True),
        "max_articles": getattr(args, "max_articles", 50),
    }
    # Only pass a description the user actually supplied. A "description"
    # key holding None would stop the converter from inferring one from
    # the feed metadata and leave it without a usable description.
    description = getattr(args, "description", None)
    if description:
        config["description"] = description

    try:
        converter = RssToSkillConverter(config)

        # Extract feed
        if not converter.extract_feed():
            print("\n❌ Feed extraction failed — see error above", file=sys.stderr)
            sys.exit(1)

        # Build skill
        converter.build_skill()

        # Enhancement Workflow Integration
        from skill_seekers.cli.workflow_runner import run_workflows

        workflow_executed, workflow_names = run_workflows(args)
        workflow_name = ", ".join(workflow_names) if workflow_names else None

        # Traditional enhancement (complements workflow system)
        if getattr(args, "enhance_level", 0) > 0:
            api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
            mode = "API" if api_key else "LOCAL"
            print("\n" + "=" * 80)
            print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
            print("=" * 80)
            if workflow_executed:
                print(f" Running after workflow: {workflow_name}")
                print(
                    " (Workflow provides specialized analysis, "
                    "enhancement provides general improvements)"
                )
            print("")

            skill_dir = converter.skill_dir
            if api_key:
                try:
                    from skill_seekers.cli.enhance_skill import enhance_skill_md

                    enhance_skill_md(skill_dir, api_key)
                    print("✅ API enhancement complete!")
                except ImportError:
                    print("❌ API enhancement not available. Falling back to LOCAL mode...")
                    from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer

                    enhancer = LocalSkillEnhancer(Path(skill_dir))
                    enhancer.run(headless=True)
                    print("✅ Local enhancement complete!")
            else:
                from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer

                enhancer = LocalSkillEnhancer(Path(skill_dir))
                enhancer.run(headless=True)
                print("✅ Local enhancement complete!")
    except RuntimeError as e:
        print(f"\n❌ Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Unexpected error during feed processing: {e}", file=sys.stderr)
        import traceback

        traceback.print_exc()
        sys.exit(1)
    return 0


if __name__ == "__main__":
    sys.exit(main())