#!/usr/bin/env python3
"""
RSS/Atom Feed to Skill Converter

Converts RSS 2.0, RSS 1.0 (RDF), and Atom feeds into AI-ready skills.
Uses feedparser for feed parsing, optionally follows article links to scrape
full content using requests + BeautifulSoup.

Supports both remote feed URLs and local feed XML files. Extracts article
metadata (title, author, published date, categories), feed-level metadata
(title, description, link, language), and optionally the full article text
from linked pages.

Usage:
    skill-seekers rss --feed-url https://example.com/feed.xml --name myblog
    skill-seekers rss --feed-path ./feed.xml --name myblog
    skill-seekers rss --feed-url https://example.com/rss --no-follow-links --name myblog
    skill-seekers rss --from-json myblog_extracted.json
    python3 -m skill_seekers.cli.rss_scraper --feed-url https://example.com/atom.xml --name myblog
"""

import argparse
import hashlib
import json
import logging
import os
import re
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Any

# Optional dependency guard — feedparser is not in core deps
try:
    import feedparser  # noqa: F401

    FEEDPARSER_AVAILABLE = True
except ImportError:
    FEEDPARSER_AVAILABLE = False

# BeautifulSoup is a core dependency (always available)
from bs4 import BeautifulSoup, Comment, Tag

logger = logging.getLogger(__name__)

# Feed type constants
FEED_TYPE_RSS_20 = "RSS 2.0"
FEED_TYPE_RSS_10 = "RSS 1.0 (RDF)"
FEED_TYPE_ATOM = "Atom"
FEED_TYPE_UNKNOWN = "Unknown"

# Default request headers for scraping article pages
_DEFAULT_HEADERS = {
    "User-Agent": "SkillSeekers/RSS-Scraper (https://github.com/skill-seekers)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}

# Tags to strip from scraped article HTML
_STRIP_TAGS = {"script", "style", "nav", "footer", "header", "aside", "noscript", "iframe"}

# Tags that _extract_article_text renders into the markdown-like output
_HEADING_TAGS = frozenset({"h1", "h2", "h3", "h4", "h5", "h6"})
_RENDERED_TAGS = _HEADING_TAGS | {"p", "pre", "code", "li", "blockquote"}

# Maximum length for a single article's scraped text (characters)
_MAX_ARTICLE_TEXT_LENGTH = 50_000

# Delay between HTTP requests when following links (seconds)
_REQUEST_DELAY = 1.0


def _check_feedparser_deps() -> None:
    """Raise RuntimeError if feedparser is not installed."""
    if not FEEDPARSER_AVAILABLE:
        raise RuntimeError(
            "feedparser is required for RSS/Atom feed support.\n"
            'Install with: pip install "skill-seekers[rss]"\n'
            "Or: pip install feedparser"
        )


def infer_description_from_feed(
    feed_meta: dict[str, Any] | None = None,
    name: str = "",
) -> str:
    """Infer skill description from feed-level metadata.

    Tries to build a meaningful "Use when..." description from the feed
    description first, then the feed title, then the skill name.

    Args:
        feed_meta: Feed metadata dict with title, description, link, etc.
        name: Skill name for fallback.

    Returns:
        Description string suitable for "Use when..." format.
    """
    if feed_meta:
        desc = feed_meta.get("description", "")
        if desc and len(desc) > 20:
            if len(desc) > 150:
                desc = desc[:147] + "..."
            return f"Use when referencing {desc.lower()}"

        title = feed_meta.get("title", "")
        if title and len(title) > 5:
            return f"Use when referencing articles from {title}"

    return (
        f"Use when referencing {name} feed content"
        if name
        else "Use when referencing this feed content"
    )


class RssToSkillConverter:
    """Convert RSS/Atom feeds to AI-ready skills.

    Parses RSS 2.0, RSS 1.0 (RDF), and Atom feeds using feedparser.
    Optionally follows article links to scrape full page content
    via requests + BeautifulSoup.
    """

    def __init__(self, config: dict[str, Any]) -> None:
        """Initialize the converter with configuration.

        Args:
            config: Dictionary with name (required), feed_url, feed_path,
                follow_links (default True), max_articles (default 50),
                and description (optional; a missing, None or empty value
                selects the default / feed-inferred description).
        """
        self.config = config
        self.name: str = config["name"]
        self.feed_url: str = config.get("feed_url", "")
        self.feed_path: str = config.get("feed_path", "")
        self.follow_links: bool = config.get("follow_links", True)
        self.max_articles: int = config.get("max_articles", 50)
        # `or` (not a .get default) so an explicit None/"" — which the CLI
        # produces when --description is omitted — still yields a usable string.
        self.description: str = (
            config.get("description") or f"Use when referencing {self.name} feed content"
        )

        # Output paths
        self.skill_dir: str = f"output/{self.name}"
        self.data_file: str = f"output/{self.name}_extracted.json"

        # Internal state
        self.extracted_data: dict[str, Any] | None = None

    # ──────────────────────────────────────────────────────────────────────
    # Public API
    # ──────────────────────────────────────────────────────────────────────

    def extract_feed(self) -> bool:
        """Parse the RSS/Atom feed and extract article data.

        Parses the feed, extracts metadata and articles, optionally follows
        links to scrape full content, and saves the result as intermediate
        JSON at ``self.data_file``.

        Returns:
            True on success.

        Raises:
            RuntimeError: If feedparser is missing, no source is configured,
                or the feed cannot be parsed.
            FileNotFoundError: If ``feed_path`` does not exist.
        """
        _check_feedparser_deps()

        source = self.feed_url or self.feed_path
        print(f"\n🔍 Extracting RSS/Atom feed: {source}")

        # Parse the feed
        parsed = self._parse_feed()

        # Detect feed type
        feed_type = self._detect_feed_type(parsed)
        print(f" Feed type: {feed_type}")

        # Extract feed-level metadata
        feed_meta = self._extract_feed_metadata(parsed)
        print(f" Title: {feed_meta.get('title', 'Unknown')}")
        print(f" Link: {feed_meta.get('link', 'N/A')}")
        print(f" Language: {feed_meta.get('language', 'N/A')}")

        # Update description from feed metadata if not explicitly set.
        # A key that is present but None/empty counts as "not set".
        if not self.config.get("description"):
            self.description = infer_description_from_feed(feed_meta, self.name)

        # Extract articles
        articles = self._extract_articles(parsed)
        print(f" Articles found: {len(articles)}")

        # Optionally scrape full article content
        if self.follow_links:
            print(f"\n🌐 Following article links (max {len(articles)})...")
            scraped_count = 0
            for i, article in enumerate(articles):
                link = article.get("link", "")
                if not link:
                    continue
                print(f" [{i + 1}/{len(articles)}] {link[:80]}...")
                content = self._scrape_article_content(link)
                if content:
                    article["full_text"] = content
                    scraped_count += 1
                # Be polite — delay between requests
                if i < len(articles) - 1:
                    time.sleep(_REQUEST_DELAY)
            print(f" Scraped full content for {scraped_count}/{len(articles)} articles")
        else:
            print(" Skipping link following (--no-follow-links)")

        # Categorize articles by feed categories/tags
        all_categories = self._collect_all_categories(articles)

        # Build result data
        result_data: dict[str, Any] = {
            "source": source,
            "feed_type": feed_type,
            "feed_metadata": feed_meta,
            "total_articles": len(articles),
            "followed_links": self.follow_links,
            "all_categories": sorted(all_categories),
            "articles": articles,
        }

        # Persist extracted data
        os.makedirs(os.path.dirname(self.data_file) or ".", exist_ok=True)
        with open(self.data_file, "w", encoding="utf-8") as f:
            json.dump(result_data, f, indent=2, ensure_ascii=False, default=str)
        print(f"\n💾 Saved extracted data to: {self.data_file}")

        self.extracted_data = result_data
        print(
            f"✅ Extracted {len(articles)} articles ({len(all_categories)} unique categories/tags)"
        )
        return True

    def load_extracted_data(self, json_path: str) -> bool:
        """Load previously extracted data from a JSON file.

        Args:
            json_path: Path to a JSON file as written by ``extract_feed``.

        Returns:
            True on success.

        Raises:
            FileNotFoundError: If ``json_path`` does not exist.
            ValueError: If the file's top-level JSON value is not an object.
        """
        print(f"\n📂 Loading extracted data from: {json_path}")

        if not os.path.exists(json_path):
            raise FileNotFoundError(f"Extracted data file not found: {json_path}")

        with open(json_path, encoding="utf-8") as f:
            data = json.load(f)

        # Everything downstream uses dict access; fail here with a clear message
        # instead of an AttributeError deep inside skill generation.
        if not isinstance(data, dict):
            raise ValueError(f"Extracted data must be a JSON object: {json_path}")
        self.extracted_data = data

        total = data.get("total_articles", len(data.get("articles", [])))
        print(f"✅ Loaded {total} articles")
        return True

    def categorize_content(self) -> dict[str, dict[str, Any]]:
        """Categorize articles by their feed categories/tags.

        Articles without categories go to "uncategorized". Category names are
        keyed by their sanitized filename form, so names that sanitize to the
        same key share one group (titled after the first name seen).

        Returns:
            Mapping of category key to ``{"title": str, "articles": list}``.

        Raises:
            RuntimeError: If no extracted data has been loaded.
        """
        print("\n📋 Categorizing content by feed tags...")

        if not self.extracted_data:
            raise RuntimeError("No extracted data available. Call extract_feed() first.")

        articles = self.extracted_data.get("articles", [])
        categorized: dict[str, dict[str, Any]] = {}
        # Per-category ids already added, so the duplicate check is O(1)
        # instead of rebuilding a set for every article.
        seen_ids: dict[str, set[Any]] = {}

        for article in articles:
            cats = article.get("categories", [])
            if not cats:
                cats = ["uncategorized"]

            article_id = article.get("id", article.get("link", ""))
            for cat in cats:
                cat_key = self._sanitize_filename(cat)
                if cat_key not in categorized:
                    categorized[cat_key] = {
                        "title": cat,
                        "articles": [],
                    }
                    seen_ids[cat_key] = set()
                # Avoid duplicates if an article has overlapping normalized keys
                if article_id not in seen_ids[cat_key]:
                    seen_ids[cat_key].add(article_id)
                    categorized[cat_key]["articles"].append(article)

        # If no categories at all (i.e. no articles), put everything in one group
        if not categorized:
            categorized["all_articles"] = {
                "title": "All Articles",
                "articles": articles,
            }

        print(f"✅ Created {len(categorized)} categories")
        for cat_data in categorized.values():
            print(f" - {cat_data['title']}: {len(cat_data['articles'])} articles")

        return categorized

    def build_skill(self) -> None:
        """Build complete skill structure from extracted data.

        Creates the skill directory tree, one reference file per category,
        the reference index, and SKILL.md.

        Raises:
            RuntimeError: If no extracted data has been loaded.
        """
        print(f"\n🏗️ Building skill: {self.name}")

        if not self.extracted_data:
            raise RuntimeError("No extracted data available. Call extract_feed() first.")

        # Create directories
        os.makedirs(f"{self.skill_dir}/references", exist_ok=True)
        os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True)
        os.makedirs(f"{self.skill_dir}/assets", exist_ok=True)

        # Categorize content
        categorized = self.categorize_content()

        # Generate reference files
        print("\n📝 Generating reference files...")
        for cat_key, cat_data in categorized.items():
            self._generate_reference_file(cat_key, cat_data)

        # Generate index
        self._generate_index(categorized)

        # Generate SKILL.md
        self._generate_skill_md(categorized)

        print(f"\n✅ Skill built successfully: {self.skill_dir}/")
        print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/")

    # ──────────────────────────────────────────────────────────────────────
    # Feed parsing internals
    # ──────────────────────────────────────────────────────────────────────

    def _parse_feed(self) -> "feedparser.FeedParserDict":
        """Parse feed from URL or local file using feedparser.

        A local ``feed_path`` takes precedence over ``feed_url``.

        Raises:
            FileNotFoundError: If ``feed_path`` is set but does not exist.
            RuntimeError: If no source is configured, or feedparser flags the
                feed as malformed and returned no entries.
        """
        import feedparser as fp

        if self.feed_path:
            if not os.path.exists(self.feed_path):
                raise FileNotFoundError(f"Feed file not found: {self.feed_path}")
            logger.info("Parsing feed from local file: %s", self.feed_path)
            parsed = fp.parse(self.feed_path)
        elif self.feed_url:
            logger.info("Fetching feed from URL: %s", self.feed_url)
            parsed = fp.parse(
                self.feed_url,
                agent="SkillSeekers/RSS-Scraper",
            )
        else:
            raise RuntimeError(
                "No feed source provided. Use feed_url (remote URL) or feed_path (local file)."
            )

        # Check for parsing errors. A malformed feed that still yielded
        # entries is accepted as-is.
        if parsed.bozo and not parsed.entries:
            exc = parsed.get("bozo_exception", "Unknown parse error")
            raise RuntimeError(f"Failed to parse feed: {exc}")

        return parsed

    def _detect_feed_type(self, parsed: "feedparser.FeedParserDict") -> str:
        """Detect RSS 2.0, RSS 1.0, or Atom from feedparser's version field."""
        version = getattr(parsed, "version", "") or ""
        version_lower = version.lower()

        if "atom" in version_lower:
            return FEED_TYPE_ATOM
        if "rss20" in version_lower:
            return FEED_TYPE_RSS_20
        if "rss10" in version_lower or "rdf" in version_lower:
            return FEED_TYPE_RSS_10
        # Any other "rss*" version string is reported as RSS 2.0
        if version_lower.startswith("rss"):
            return FEED_TYPE_RSS_20

        # Fallback heuristic: check feed dict for version clues
        feed = parsed.get("feed", {})
        if feed.get("xmlns", "").startswith("http://www.w3.org/2005/Atom"):
            return FEED_TYPE_ATOM
        if feed.get("rss_version"):
            return FEED_TYPE_RSS_20

        return FEED_TYPE_UNKNOWN

    def _extract_feed_metadata(self, parsed: "feedparser.FeedParserDict") -> dict[str, Any]:
        """Extract feed-level metadata (title, description, link, language, etc.)."""
        feed = parsed.get("feed", {})

        # Prefer "subtitle", fall back to "description"
        description = feed.get("subtitle", "") or feed.get("description", "")

        # Published / updated dates
        published = feed.get("published", "") or feed.get("updated", "")

        # Feed image: accepted either as a dict (href/url keys) or a plain string
        image_url = ""
        image_data = feed.get("image", {})
        if isinstance(image_data, dict):
            image_url = image_data.get("href", "") or image_data.get("url", "")
        elif isinstance(image_data, str):
            image_url = image_data

        return {
            "title": feed.get("title", "Untitled Feed"),
            "description": description,
            "link": feed.get("link", ""),
            "language": feed.get("language", ""),
            "author": feed.get("author", ""),
            "published": published,
            "generator": feed.get("generator", ""),
            "image_url": image_url,
            "rights": feed.get("rights", ""),
        }

    def _extract_articles(self, parsed: "feedparser.FeedParserDict") -> list[dict[str, Any]]:
        """Extract article entries (title, link, summary, date, author, categories).

        At most ``self.max_articles`` entries are read, in feed order.
        """
        articles: list[dict[str, Any]] = []

        # Clamp at 0: a negative limit would otherwise slice from the end
        # and silently drop the *last* N entries.
        limit = max(self.max_articles, 0)

        for entry in parsed.entries[:limit]:
            # Unique identifier (entry id, else link, else hash of the title)
            entry_id = entry.get("id", "") or entry.get("link", "")
            if not entry_id:
                entry_id = hashlib.sha256(entry.get("title", "").encode("utf-8")).hexdigest()[:16]

            # Published date normalization
            published = entry.get("published", "") or entry.get("updated", "")
            published_parsed = entry.get("published_parsed") or entry.get("updated_parsed")
            published_iso = ""
            if published_parsed:
                try:
                    dt = datetime(*published_parsed[:6])
                    published_iso = dt.isoformat()
                except (TypeError, ValueError):
                    # Keep the raw date string rather than losing the date
                    published_iso = published

            # Categories / tags
            categories: list[str] = [
                tag_data.get("term", "")
                for tag_data in entry.get("tags", [])
                if tag_data.get("term", "")
            ]

            # Summary — feedparser may provide HTML; clean it
            summary_raw = entry.get("summary", "") or entry.get("description", "")
            summary_text = self._html_to_text(summary_raw) if summary_raw else ""

            # Content — some feeds include full content inline
            content_text = ""
            content_list = entry.get("content", [])
            if content_list and isinstance(content_list, list):
                blocks = [
                    self._html_to_text(content_block.get("value", ""))
                    for content_block in content_list
                    if content_block.get("value", "")
                ]
                content_text = "\n\n".join(blocks).strip()

            # Author(s)
            author = entry.get("author", "")
            if not author:
                authors_detail = entry.get("authors", [])
                if authors_detail:
                    author = ", ".join(a.get("name", "") for a in authors_detail if a.get("name"))

            article: dict[str, Any] = {
                "id": entry_id,
                "title": entry.get("title", "Untitled"),
                "link": entry.get("link", ""),
                "summary": summary_text,
                "content": content_text,
                "published": published,
                "published_iso": published_iso,
                "author": author,
                "categories": categories,
            }
            articles.append(article)

        return articles

    def _scrape_article_content(self, url: str) -> str:
        """Follow article URL, extract full page content using requests + BeautifulSoup.

        Best-effort: returns an empty string when requests is unavailable,
        the fetch fails, or the response is not HTML/XML.
        """
        try:
            import requests
        except ImportError:
            logger.warning(
                "requests library not available — cannot follow article links. "
                "Install with: pip install requests"
            )
            return ""

        try:
            response = requests.get(
                url,
                headers=_DEFAULT_HEADERS,
                timeout=15,
                allow_redirects=True,
            )
            response.raise_for_status()
        except Exception as e:
            # Deliberately broad: one bad link must not abort the whole run.
            logger.debug("Failed to fetch %s: %s", url, e)
            return ""

        content_type = response.headers.get("Content-Type", "")
        if "html" not in content_type.lower() and "xml" not in content_type.lower():
            logger.debug("Skipping non-HTML content at %s (type: %s)", url, content_type)
            return ""

        return self._extract_article_text(response.text)

    def _extract_article_text(self, html: str) -> str:
        """Clean article HTML into markdown-like text.

        Strips the tags in ``_STRIP_TAGS`` and HTML comments, picks the main
        container (<article>, <main>, role="main", or an id/class matching
        content|article|post|entry; else <body>), then renders headings,
        paragraphs, code, list items and blockquotes.

        Elements nested inside an already-rendered element are skipped so
        their text is not emitted twice (e.g. <pre><code>, <li><p>). <pre> is
        the exception: it is always rendered so code keeps its formatting.

        Returns:
            The rendered text, truncated to ``_MAX_ARTICLE_TEXT_LENGTH``.
        """
        soup = BeautifulSoup(html, "html.parser")

        # Remove unwanted elements
        for tag_name in _STRIP_TAGS:
            for element in soup.find_all(tag_name):
                element.decompose()
        for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
            comment.extract()

        # Try to find the main article container
        main_content = (
            soup.find("article")
            or soup.find("main")
            or soup.find(attrs={"role": "main"})
            or soup.find(attrs={"id": re.compile(r"(content|article|post|entry)", re.I)})
            or soup.find(attrs={"class": re.compile(r"(content|article|post|entry)", re.I)})
        )
        if not main_content:
            main_content = soup.find("body") or soup

        # Convert to text with basic structure preservation
        text_parts: list[str] = []
        for element in main_content.descendants:
            if not isinstance(element, Tag) or element.name not in _RENDERED_TAGS:
                continue
            if element.name != "pre" and self._has_rendered_ancestor(element, main_content):
                continue
            rendered = self._render_element(element)
            if rendered:
                text_parts.append(rendered)

        text = "\n".join(text_parts).strip()

        # Collapse excessive whitespace
        text = re.sub(r"\n{4,}", "\n\n\n", text)

        # Truncate if too long
        if len(text) > _MAX_ARTICLE_TEXT_LENGTH:
            text = text[:_MAX_ARTICLE_TEXT_LENGTH] + "\n\n[Content truncated]"

        return text

    @staticmethod
    def _has_rendered_ancestor(element: Tag, root: Any) -> bool:
        """Return True if an ancestor of *element* below *root* is itself rendered."""
        for parent in element.parents:
            if parent is root:
                return False
            if parent.name in _RENDERED_TAGS:
                return True
        return False

    @staticmethod
    def _detect_code_language(element: Tag) -> str:
        """Return the language from a ``language-*``/``lang-*`` class, or ""."""
        candidates = [element]
        if element.name == "pre":
            # The language class usually sits on the <code> inside the <pre>
            nested_code = element.find("code")
            if nested_code is not None:
                candidates.append(nested_code)
        for candidate in candidates:
            for cls in candidate.get("class", []) or []:
                if isinstance(cls, str) and cls.startswith(("language-", "lang-")):
                    return cls.split("-", 1)[1]
        return ""

    def _render_element(self, element: Tag) -> str:
        """Render one supported element as a markdown-like snippet ("" if empty)."""
        if element.name in _HEADING_TAGS:
            level = int(element.name[1])
            heading_text = element.get_text(strip=True)
            return f"\n{'#' * level} {heading_text}\n" if heading_text else ""

        if element.name in ("pre", "code"):
            code_text = element.get_text()
            if not code_text or not code_text.strip():
                return ""
            lang = self._detect_code_language(element)
            return f"\n```{lang}\n{code_text.strip()}\n```\n"

        flat_text = element.get_text(separator=" ", strip=True)
        if not flat_text:
            return ""
        if element.name == "p":
            return f"\n{flat_text}\n"
        if element.name == "li":
            return f"- {flat_text}"
        if element.name == "blockquote":
            return f"\n> {flat_text}\n"
        return ""

    # ──────────────────────────────────────────────────────────────────────
    # Categorization helpers
    # ──────────────────────────────────────────────────────────────────────

    def _collect_all_categories(self, articles: list[dict[str, Any]]) -> set[str]:
        """Collect all unique category/tag strings across articles."""
        return {cat for article in articles for cat in article.get("categories", []) if cat}

    def _html_to_text(self, html_fragment: str) -> str:
        """Convert an HTML fragment to plain text, stripping all tags."""
        if not html_fragment:
            return ""
        soup = BeautifulSoup(html_fragment, "html.parser")
        text = soup.get_text(separator=" ", strip=True)
        # Collapse all whitespace runs (including newlines) to single spaces
        text = re.sub(r"\s+", " ", text).strip()
        return text

    # ──────────────────────────────────────────────────────────────────────
    # Skill generation — reference files
    # ──────────────────────────────────────────────────────────────────────

    def _generate_reference_file(self, cat_key: str, cat_data: dict[str, Any]) -> None:
        """Generate a reference markdown file for a category of articles.

        The file name is derived from the category title, matching the links
        written by the index and SKILL.md; ``cat_key`` is accepted for
        interface symmetry but not used for the path.
        """
        safe_name = self._sanitize_filename(cat_data["title"])
        filepath = f"{self.skill_dir}/references/{safe_name}.md"

        articles = cat_data["articles"]

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"# {cat_data['title']}\n\n")
            f.write(f"**Articles:** {len(articles)}\n\n")
            f.write("---\n\n")

            for article in articles:
                f.write(f"## {article.get('title', 'Untitled')}\n\n")

                # Metadata block
                if article.get("author"):
                    f.write(f"**Author:** {article['author']}\n\n")
                if article.get("published"):
                    f.write(f"**Published:** {article['published']}\n\n")
                if article.get("link"):
                    f.write(f"**Link:** {article['link']}\n\n")
                if article.get("categories"):
                    tags = ", ".join(article["categories"])
                    f.write(f"**Tags:** {tags}\n\n")

                # Summary
                summary = article.get("summary", "")
                if summary:
                    f.write("### Summary\n\n")
                    f.write(f"{summary}\n\n")

                # Inline content from feed (if present and not just the summary)
                inline_content = article.get("content", "")
                if inline_content and inline_content != summary:
                    f.write("### Content\n\n")
                    f.write(f"{inline_content}\n\n")

                # Full scraped text
                full_text = article.get("full_text", "")
                if full_text:
                    f.write("### Full Article\n\n")
                    f.write(f"{full_text}\n\n")

                f.write("---\n\n")

        print(f" Generated: {filepath}")

    def _generate_index(self, categorized: dict[str, dict[str, Any]]) -> None:
        """Generate the reference index file with category links and statistics."""
        filepath = f"{self.skill_dir}/references/index.md"

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"# {self.name.title()} Feed Reference Index\n\n")

            feed_meta = self.extracted_data.get("feed_metadata", {})
            if feed_meta.get("title"):
                f.write(f"**Feed:** {feed_meta['title']}\n\n")
            if feed_meta.get("link"):
                f.write(f"**Source:** {feed_meta['link']}\n\n")

            f.write("## Categories\n\n")

            total_articles = 0
            for cat_key in sorted(categorized):
                cat_data = categorized[cat_key]
                safe_name = self._sanitize_filename(cat_data["title"])
                count = len(cat_data["articles"])
                total_articles += count
                f.write(f"- [{cat_data['title']}]({safe_name}.md) ({count} articles)\n")

            f.write(f"\n**Total articles:** {total_articles}\n\n")

            # Statistics
            f.write("## Statistics\n\n")
            f.write(f"- Total articles: {self.extracted_data.get('total_articles', 0)}\n")
            f.write(f"- Feed type: {self.extracted_data.get('feed_type', FEED_TYPE_UNKNOWN)}\n")
            f.write(
                f"- Links followed: "
                f"{'Yes' if self.extracted_data.get('followed_links') else 'No'}\n"
            )

            all_cats = self.extracted_data.get("all_categories", [])
            if all_cats:
                f.write(f"- Unique tags: {len(all_cats)}\n")

            # Author summary (top 20 by article count)
            author_counts = self._count_authors()
            if author_counts:
                f.write(f"\n## Authors ({len(author_counts)})\n\n")
                for author, count in sorted(
                    author_counts.items(), key=lambda x: x[1], reverse=True
                )[:20]:
                    f.write(f"- {author}: {count} articles\n")

        print(f" Generated: {filepath}")

    def _generate_skill_md(self, categorized: dict[str, dict[str, Any]]) -> None:
        """Generate the main SKILL.md file with feed overview and navigation."""
        filepath = f"{self.skill_dir}/SKILL.md"

        feed_meta = self.extracted_data.get("feed_metadata", {})
        feed_title = feed_meta.get("title", self.name.title())
        feed_type = self.extracted_data.get("feed_type", FEED_TYPE_UNKNOWN)

        # Skill name for frontmatter (lowercase, hyphens, max 64 chars)
        skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]

        # Frontmatter description: collapse whitespace so an embedded newline
        # cannot break the single-line YAML value, then truncate.
        desc = " ".join(self.description.split())[:1024]

        with open(filepath, "w", encoding="utf-8") as f:
            # YAML frontmatter
            f.write("---\n")
            f.write(f"name: {skill_name}\n")
            f.write(f"description: {desc}\n")
            f.write("---\n\n")

            # Header
            f.write(f"# {feed_title} Feed Skill\n\n")
            f.write(f"{self.description}\n\n")

            # Feed Information
            f.write("## 📡 Feed Information\n\n")
            f.write(f"**Feed Title:** {feed_title}\n\n")
            f.write(f"**Feed Type:** {feed_type}\n\n")
            if feed_meta.get("link"):
                f.write(f"**Website:** {feed_meta['link']}\n\n")
            if feed_meta.get("language"):
                f.write(f"**Language:** {feed_meta['language']}\n\n")
            if feed_meta.get("description"):
                feed_desc = feed_meta["description"]
                if len(feed_desc) > 300:
                    feed_desc = feed_desc[:297] + "..."
                f.write(f"**Description:** {feed_desc}\n\n")
            if feed_meta.get("generator"):
                f.write(f"**Generator:** {feed_meta['generator']}\n\n")
            if feed_meta.get("rights"):
                f.write(f"**Rights:** {feed_meta['rights']}\n\n")

            # When to Use
            f.write("## 💡 When to Use This Skill\n\n")
            f.write("Use this skill when you need to:\n")
            f.write(f"- Reference articles and content from {feed_title}\n")
            f.write("- Look up specific topics covered in the feed\n")
            f.write("- Find author perspectives and expert analysis\n")
            f.write("- Review recent posts and updates on the subject\n")
            f.write("- Explore categorized content by tags or topics\n\n")

            # Article Overview
            total_articles = self.extracted_data.get("total_articles", 0)
            f.write("## 📖 Article Overview\n\n")
            f.write(f"**Total Articles:** {total_articles}\n\n")

            # Category breakdown
            f.write("**Content by Category:**\n\n")
            for cat_key in sorted(categorized):
                cat_data = categorized[cat_key]
                count = len(cat_data["articles"])
                f.write(f"- **{cat_data['title']}**: {count} articles\n")
            f.write("\n")

            # Recent articles (first 10 in feed order)
            articles = self.extracted_data.get("articles", [])
            recent = articles[:10]
            if recent:
                f.write("## 📰 Recent Articles\n\n")
                for article in recent:
                    title = article.get("title", "Untitled")
                    published = article.get("published", "")
                    author = article.get("author", "")
                    link = article.get("link", "")

                    f.write(f"### {title}\n\n")
                    meta_parts: list[str] = []
                    if published:
                        meta_parts.append(f"**Published:** {published}")
                    if author:
                        meta_parts.append(f"**Author:** {author}")
                    if meta_parts:
                        f.write(" | ".join(meta_parts) + "\n\n")

                    summary = article.get("summary", "")
                    if summary:
                        # Show first 200 chars of summary
                        short = summary[:200] + "..." if len(summary) > 200 else summary
                        f.write(f"{short}\n\n")

                    if link:
                        f.write(f"[Read more]({link})\n\n")

            # Authors (top 15 by article count)
            author_counts = self._count_authors()
            if author_counts:
                f.write(f"## ✍️ Authors ({len(author_counts)})\n\n")
                for author, count in sorted(
                    author_counts.items(), key=lambda x: x[1], reverse=True
                )[:15]:
                    f.write(f"- **{author}**: {count} articles\n")
                f.write("\n")

            # All categories/tags (first 50 listed)
            all_cats = self.extracted_data.get("all_categories", [])
            if all_cats:
                f.write(f"## 🏷️ Tags ({len(all_cats)})\n\n")
                f.write(", ".join(f"`{cat}`" for cat in all_cats[:50]))
                if len(all_cats) > 50:
                    f.write(f" ... and {len(all_cats) - 50} more")
                f.write("\n\n")

            # Statistics
            f.write("## 📊 Feed Statistics\n\n")
            f.write(f"- **Total Articles**: {total_articles}\n")
            f.write(f"- **Feed Type**: {feed_type}\n")
            f.write(f"- **Categories/Tags**: {len(all_cats)}\n")
            f.write(f"- **Authors**: {len(author_counts)}\n")

            followed = self.extracted_data.get("followed_links", False)
            f.write(f"- **Full Content Scraped**: {'Yes' if followed else 'No'}\n\n")

            # Date range
            date_range = self._get_date_range()
            if date_range:
                f.write(f"- **Date Range**: {date_range[0]} to {date_range[1]}\n\n")

            # Navigation
            f.write("## 🗺️ Navigation\n\n")
            f.write("**Reference Files:**\n\n")
            for cat_key in sorted(categorized):
                cat_data = categorized[cat_key]
                safe_name = self._sanitize_filename(cat_data["title"])
                f.write(
                    f"- `references/{safe_name}.md` - {cat_data['title']}"
                    f" ({len(cat_data['articles'])} articles)\n"
                )
            f.write("\n")
            f.write("See `references/index.md` for complete feed structure.\n\n")

            # Footer
            f.write("---\n\n")
            f.write("**Generated by Skill Seeker** | RSS/Atom Feed Scraper\n")

        with open(filepath, encoding="utf-8") as f:
            line_count = len(f.read().split("\n"))
        print(f" Generated: {filepath} ({line_count} lines)")

    # ──────────────────────────────────────────────────────────────────────
    # Utility helpers
    # ──────────────────────────────────────────────────────────────────────

    def _count_authors(self) -> dict[str, int]:
        """Count articles per author (blank authors are ignored)."""
        if not self.extracted_data:
            return {}

        counts: dict[str, int] = {}
        for article in self.extracted_data.get("articles", []):
            author = article.get("author", "").strip()
            if author:
                counts[author] = counts.get(author, 0) + 1
        return counts

    def _get_date_range(self) -> tuple[str, str] | None:
        """Get the date range (earliest, latest) of articles, or None.

        Uses the first 10 characters of each ``published_iso`` value and
        orders them as strings.
        """
        if not self.extracted_data:
            return None

        dates = sorted(
            article.get("published_iso", "")
            for article in self.extracted_data.get("articles", [])
            if article.get("published_iso", "")
        )
        if not dates:
            return None

        return (dates[0][:10], dates[-1][:10])

    def _sanitize_filename(self, name: str) -> str:
        """Convert a string to a safe filename.

        Lowercases, drops everything except word characters, whitespace and
        hyphens, then turns runs of hyphens/whitespace into one underscore.
        Returns "unnamed" if nothing is left.
        """
        safe = re.sub(r"[^\w\s-]", "", name.lower())
        safe = re.sub(r"[-\s]+", "_", safe)
        return safe or "unnamed"


# ──────────────────────────────────────────────────────────────────────────
# CLI entry point
# ──────────────────────────────────────────────────────────────────────────


def main() -> int:
    """CLI entry point for the RSS/Atom feed scraper.

    Returns:
        0 on success. Failures terminate the process via ``sys.exit(1)``.
    """
    from .arguments.common import add_all_standard_arguments

    parser = argparse.ArgumentParser(
        description="Convert RSS/Atom feed to AI-ready skill",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Examples:\n"
            " %(prog)s --feed-url https://example.com/feed.xml --name myblog\n"
            " %(prog)s --feed-path ./feed.xml --name myblog\n"
            " %(prog)s --feed-url https://example.com/rss --no-follow-links --name myblog\n"
            " %(prog)s --from-json myblog_extracted.json\n"
        ),
    )

    # Standard arguments (name, description, output, enhance-level, etc.)
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for RSS
    for action in parser._actions:
        if hasattr(action, "dest") and action.dest == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for RSS), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )

    # RSS-specific arguments
    parser.add_argument(
        "--feed-url",
        type=str,
        help="URL of the RSS/Atom feed to scrape",
        metavar="URL",
    )
    parser.add_argument(
        "--feed-path",
        type=str,
        help="Local file path to an RSS/Atom XML file",
        metavar="PATH",
    )
    parser.add_argument(
        "--follow-links",
        action="store_true",
        default=True,
        dest="follow_links",
        help="Follow article links to scrape full content (default: enabled)",
    )
    parser.add_argument(
        "--no-follow-links",
        action="store_false",
        dest="follow_links",
        help="Do not follow article links — use feed content only",
    )
    parser.add_argument(
        "--max-articles",
        type=int,
        default=50,
        metavar="N",
        help="Maximum number of articles to process (default: 50)",
    )
    parser.add_argument(
        "--from-json",
        type=str,
        help="Build skill from previously extracted JSON file",
        metavar="FILE",
    )

    args = parser.parse_args()

    # Set logging level
    if getattr(args, "quiet", False):
        logging.getLogger().setLevel(logging.WARNING)
    elif getattr(args, "verbose", False):
        logging.getLogger().setLevel(logging.DEBUG)

    # Handle --dry-run
    if getattr(args, "dry_run", False):
        source = (
            getattr(args, "feed_url", None)
            or getattr(args, "feed_path", None)
            or getattr(args, "from_json", None)
            or "(none)"
        )
        print(f"\n{'=' * 60}")
        print("DRY RUN: RSS/Atom Feed Extraction")
        print(f"{'=' * 60}")
        print(f"Source: {source}")
        print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}")
        print(f"Follow links: {getattr(args, 'follow_links', True)}")
        print(f"Max articles: {getattr(args, 'max_articles', 50)}")
        print(f"Enhance level: {getattr(args, 'enhance_level', 0)}")
        print("\n✅ Dry run complete")
        return 0

    # Validate inputs
    has_source = (
        getattr(args, "feed_url", None)
        or getattr(args, "feed_path", None)
        or getattr(args, "from_json", None)
    )
    if not has_source:
        parser.error("Must specify --feed-url, --feed-path, or --from-json")

    # Build from JSON workflow
    if getattr(args, "from_json", None):
        name = Path(args.from_json).stem.replace("_extracted", "")
        config: dict[str, Any] = {
            "name": getattr(args, "name", None) or name,
            "description": getattr(args, "description", None)
            or f"Use when referencing {name} feed content",
        }
        try:
            converter = RssToSkillConverter(config)
            converter.load_extracted_data(args.from_json)
            converter.build_skill()
        except Exception as e:
            print(f"\n❌ Error: {e}", file=sys.stderr)
            sys.exit(1)
        return 0

    # Feed extraction workflow
    if not getattr(args, "name", None):
        # Auto-detect name from URL or file path
        if getattr(args, "feed_url", None):
            from urllib.parse import urlparse

            parsed_url = urlparse(args.feed_url)
            args.name = parsed_url.hostname.replace(".", "-") if parsed_url.hostname else "feed"
        elif getattr(args, "feed_path", None):
            args.name = Path(args.feed_path).stem

    config = {
        "name": args.name,
        "feed_url": getattr(args, "feed_url", "") or "",
        "feed_path": getattr(args, "feed_path", "") or "",
        "follow_links": getattr(args, "follow_links", True),
        "max_articles": getattr(args, "max_articles", 50),
    }
    # Only pass a description the user actually supplied; otherwise leave the
    # key out so the converter can infer one from the feed metadata.
    user_description = getattr(args, "description", None)
    if user_description:
        config["description"] = user_description

    try:
        converter = RssToSkillConverter(config)

        # Extract feed
        if not converter.extract_feed():
            print("\n❌ Feed extraction failed — see error above", file=sys.stderr)
            sys.exit(1)

        # Build skill
        converter.build_skill()

        # Enhancement Workflow Integration
        from skill_seekers.cli.workflow_runner import run_workflows

        workflow_executed, workflow_names = run_workflows(args)
        workflow_name = ", ".join(workflow_names) if workflow_names else None

        # Traditional enhancement (complements workflow system)
        if getattr(args, "enhance_level", 0) > 0:
            api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
            mode = "API" if api_key else "LOCAL"

            print("\n" + "=" * 80)
            print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
            print("=" * 80)
            if workflow_executed:
                print(f" Running after workflow: {workflow_name}")
                print(
                    " (Workflow provides specialized analysis, "
                    "enhancement provides general improvements)"
                )
            print("")

            skill_dir = converter.skill_dir
            if api_key:
                try:
                    from skill_seekers.cli.enhance_skill import enhance_skill_md

                    enhance_skill_md(skill_dir, api_key)
                    print("✅ API enhancement complete!")
                except ImportError:
                    print("❌ API enhancement not available. Falling back to LOCAL mode...")
                    from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer

                    enhancer = LocalSkillEnhancer(Path(skill_dir))
                    enhancer.run(headless=True)
                    print("✅ Local enhancement complete!")
            else:
                from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer

                enhancer = LocalSkillEnhancer(Path(skill_dir))
                enhancer.run(headless=True)
                print("✅ Local enhancement complete!")

    except RuntimeError as e:
        print(f"\n❌ Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Unexpected error during feed processing: {e}", file=sys.stderr)
        import traceback

        traceback.print_exc()
        sys.exit(1)

    return 0


if __name__ == "__main__":
    sys.exit(main())