#!/usr/bin/env python3
"""
Fetch Twitter/X Article with images using twitter-cli.

Usage:
    python fetch_article.py <url> [output_dir]

Example:
    python fetch_article.py https://x.com/HiTw93/status/2040047268221608281 ./Clippings

Features:
- Fetches structured data via twitter-cli
- Downloads all images to attachments folder
- Generates Markdown with embedded image references
"""

import sys
import os
import re
import subprocess
import argparse
from pathlib import Path
from datetime import datetime

# Matches pbs.twimg.com media URLs (note: twimg, not twitter). A URL ends at
# whitespace, a closing parenthesis, or a quote character.
_TWIMG_URL_RE = re.compile(r'https://pbs\.twimg\.com/media/[^\s\)"\']+')


def run_twitter_cli(url: str) -> dict:
    """Fetch article data using twitter-cli via ``uv run``.

    Args:
        url: The article URL passed to ``twitter article``.

    Returns:
        The parsed article data (see ``parse_yaml_output``).

    Exits the process with status 1 if ``uv`` is missing or the command fails.
    """
    cmd = ["uv", "run", "--with", "twitter-cli", "twitter", "article", url]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except FileNotFoundError:
        # Without this, a missing `uv` surfaces as a raw traceback.
        print("Error fetching article: 'uv' executable not found", file=sys.stderr)
        sys.exit(1)
    if result.returncode != 0:
        print(f"Error fetching article: {result.stderr}", file=sys.stderr)
        sys.exit(1)
    return parse_yaml_output(result.stdout)


def run_jina_api(url: str) -> str:
    """Fetch article text with images using the Jina reader API.

    Args:
        url: The article URL; it is appended to ``https://r.jina.ai/``.

    Returns:
        The response body, or ``""`` if the request failed (best effort:
        the caller falls back to the twitter-cli text).
    """
    api_key = os.getenv("JINA_API_KEY", "")
    jina_url = f"https://r.jina.ai/{url}"
    # -f makes curl exit non-zero on HTTP errors so an error page is never
    # mistaken for article content; -S keeps the error message despite -s.
    cmd = ["curl", "-fsS", jina_url]
    if api_key:
        # NOTE(review): the key is passed on the command line and is therefore
        # visible in process listings while curl runs.
        cmd.extend(["-H", f"Authorization: Bearer {api_key}"])
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except FileNotFoundError:
        print("Warning: Jina API failed: 'curl' executable not found", file=sys.stderr)
        return ""
    if result.returncode != 0:
        print(f"Warning: Jina API failed: {result.stderr}", file=sys.stderr)
        return ""
    return result.stdout


def parse_yaml_output(output: str) -> dict:
    """Parse twitter-cli YAML output into a dict.

    If the document has a truthy ``ok`` key and a mapping under ``data``,
    that inner mapping is returned; otherwise the whole document is returned.

    Exits the process with status 1 if PyYAML is not installed, the text is
    not valid YAML, or the top-level value is not a mapping.
    """
    try:
        import yaml
    except ImportError:
        print("Error: PyYAML required. Install with: uv pip install pyyaml", file=sys.stderr)
        sys.exit(1)

    try:
        data = yaml.safe_load(output)
    except yaml.YAMLError as e:
        print(f"Error parsing YAML: {e}", file=sys.stderr)
        sys.exit(1)

    # Empty output parses to None and a YAML list parses to a list; neither
    # supports the .get() lookups used below and in the callers.
    if not isinstance(data, dict):
        print("Error parsing YAML: expected a mapping at the top level", file=sys.stderr)
        sys.exit(1)

    if data.get("ok") and isinstance(data.get("data"), dict):
        return data["data"]
    return data


def extract_image_urls(text: str) -> list:
    """Extract image URLs from markdown text.

    Finds every ``https://pbs.twimg.com/media/...`` URL, deduplicates on the
    part before ``?`` (first occurrence wins, order preserved), and returns
    each one with the query string ``?format=jpg&name=large``.
    """
    seen = set()
    urls = []
    for url in _TWIMG_URL_RE.findall(text):
        base_url = url.split('?')[0]
        if base_url not in seen:
            seen.add(base_url)
            urls.append(f"{base_url}?format=jpg&name=large")
    return urls


def _download_image_pairs(image_urls: list, attachments_dir: Path) -> list:
    """Download each URL with curl; return ``(url, local_path)`` for successes only.

    Files are named ``NN-image.jpg`` by 1-based position in ``image_urls``.
    Keeping the URL next to its path lets callers rewrite exactly the images
    that were actually saved.
    """
    attachments_dir.mkdir(parents=True, exist_ok=True)
    pairs = []

    for i, url in enumerate(image_urls, 1):
        filename = f"{i:02d}-image.jpg"
        filepath = attachments_dir / filename

        # -f: treat HTTP errors as failures instead of saving the error page
        # as a (non-empty) .jpg file.
        cmd = ["curl", "-fsSL", url, "-o", str(filepath)]
        try:
            result = subprocess.run(cmd, capture_output=True)
            succeeded = result.returncode == 0
        except FileNotFoundError:
            succeeded = False

        if succeeded and filepath.exists() and filepath.stat().st_size > 0:
            pairs.append((url, f"attachments/{attachments_dir.name}/{filename}"))
            print(f"  ✓ {filename}")
        else:
            # Do not leave an empty or partial file behind.
            if filepath.exists():
                filepath.unlink()
            print(f"  ✗ Failed: {filename}")

    return pairs


def download_images(image_urls: list, attachments_dir: Path) -> list:
    """Download images and return the list of local paths.

    Args:
        image_urls: Remote image URLs.
        attachments_dir: Directory to save into (created if missing).

    Returns:
        Paths of the form ``attachments/<dir name>/NN-image.jpg`` for the
        downloads that succeeded. Failed downloads are omitted, so the result
        may be shorter than ``image_urls``.
    """
    return [local_path for _url, local_path in _download_image_pairs(image_urls, attachments_dir)]


def replace_image_urls(text: str, image_urls: list, local_paths: list) -> str:
    """Replace remote image URLs with local paths in markdown text.

    ``image_urls`` and ``local_paths`` are paired positionally. Every
    occurrence of a URL is replaced regardless of its query string.
    """
    for remote_url, local_path in zip(image_urls, local_paths):
        base_url = remote_url.split('?')[0]
        # The optional query string stops at whitespace, ')' or a quote so
        # text following a bare URL is left alone. The lookahead requires the
        # URL to end there, so one media id cannot match as a prefix of a
        # longer one.
        pattern = re.escape(base_url) + r'(?:\?[^\s\)"\']*)?(?![^\s\)"\'])'
        # A callable replacement inserts the path literally (no backslash or
        # group-reference interpretation).
        text = re.sub(pattern, lambda _m, path=local_path: path, text)
    return text


def sanitize_filename(name: str) -> str:
    """Sanitize a string for use in a filename.

    Drops every character that is not a word character, whitespace, hyphen,
    or CJK ideograph, collapses whitespace runs into single hyphens, and
    truncates to 60 characters. May return an empty string.
    """
    name = re.sub(r'[^\w\s\-\u4e00-\u9fff]', '', name)
    name = re.sub(r'\s+', '-', name.strip())
    return name[:60]  # Limit length


def _article_date(data: dict) -> str:
    """Return the first 10 characters of ``createdAtLocal``, or today's date."""
    created = data.get("createdAtLocal")
    if created:
        # PyYAML may resolve an unquoted timestamp to a datetime object,
        # which cannot be sliced; str() handles both cases.
        return str(created)[:10]
    return datetime.now().strftime("%Y-%m-%d")


def generate_markdown(data: dict, text: str, image_urls: list,
                      local_paths: list, source_url: str) -> str:
    """Generate the complete Markdown document.

    Args:
        data: Article metadata; reads ``createdAtLocal``, ``author.name``,
            ``metrics.{likes,retweets,bookmarks}`` and ``articleTitle``.
        text: Article body in Markdown.
        image_urls: Remote image URLs, paired positionally with ``local_paths``.
        local_paths: Local replacements for ``image_urls``.
        source_url: Original article URL, recorded in the frontmatter.

    Returns:
        Frontmatter, a level-1 title heading, then the body with image URLs
        rewritten when both lists are non-empty.
    """
    date_str = _article_date(data)
    # `or {}` also covers keys that are present but null.
    author = data.get("author") or {}
    metrics = data.get("metrics") or {}
    title = data.get("articleTitle", "Untitled")

    header_lines = [
        "---",
        f"source: {source_url}",
        f"author: {author.get('name', '')}",
        f"date: {date_str}",
        f"likes: {metrics.get('likes', 0)}",
        f"retweets: {metrics.get('retweets', 0)}",
        f"bookmarks: {metrics.get('bookmarks', 0)}",
        "---",
        "",
        f"# {title}",
        "",
        "",
    ]
    md = "\n".join(header_lines)

    if image_urls and local_paths:
        text = replace_image_urls(text, image_urls, local_paths)

    md += text
    return md


def main():
    """Command-line entry point: fetch the article, download images, write Markdown.

    Returns:
        The path of the Markdown file that was written.
    """
    parser = argparse.ArgumentParser(description="Fetch Twitter/X Article with images")
    parser.add_argument("url", help="Twitter/X article URL")
    parser.add_argument("output_dir", nargs="?", default=".",
                        help="Output directory (default: current)")
    args = parser.parse_args()

    if not args.url.startswith(("https://x.com/", "https://twitter.com/")):
        print("Error: URL must be from x.com or twitter.com", file=sys.stderr)
        sys.exit(1)

    print(f"Fetching: {args.url}")
    print("-" * 50)

    # Fetch metadata from twitter-cli
    print("Getting metadata...")
    data = run_twitter_cli(args.url)

    title = data.get("articleTitle", "")
    if not title:
        print("Error: Could not fetch article data", file=sys.stderr)
        sys.exit(1)

    author = data.get("author") or {}
    metrics = data.get("metrics") or {}
    print(f"Title: {title}")
    print(f"Author: {author.get('name', 'Unknown')}")
    print(f"Likes: {metrics.get('likes', 0)}")

    # Fetch content with images from Jina API
    print("\nGetting content and images...")
    text = run_jina_api(args.url)
    if text:
        # Drop the Jina header: keep everything after "Markdown Content:".
        marker = "Markdown Content:"
        idx = text.find(marker)
        if idx != -1:
            text = text[idx + len(marker):].lstrip()
    if not text:
        # Jina failed or returned an empty body: fall back to twitter-cli text.
        text = data.get("articleText", "") or ""

    # Extract image URLs
    image_urls = extract_image_urls(text)
    print(f"Images: {len(image_urls)}")

    # Setup output paths
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create attachments folder
    date_str = _article_date(data)
    safe_author = sanitize_filename(author.get("screenName", "unknown"))
    # A title made only of stripped characters would otherwise yield "DATE-.md".
    safe_title = sanitize_filename(title) or "untitled"
    attachments_name = f"{date_str}-{safe_author}-{safe_title[:30]}"
    attachments_dir = output_dir / "attachments" / attachments_name

    # Download images. Only URLs that downloaded are rewritten, so a failed
    # download cannot shift later images onto the wrong local file.
    downloaded = []
    if image_urls:
        print(f"\nDownloading {len(image_urls)} images...")
        downloaded = _download_image_pairs(image_urls, attachments_dir)
    fetched_urls = [url for url, _path in downloaded]
    local_paths = [path for _url, path in downloaded]

    # Generate Markdown
    md_content = generate_markdown(data, text, fetched_urls, local_paths, args.url)

    # Save Markdown
    md_filename = f"{date_str}-{safe_title}.md"
    md_path = output_dir / md_filename
    md_path.write_text(md_content, encoding="utf-8")

    print(f"\n✓ Saved: {md_path}")
    if local_paths:
        print(f"✓ Images: {attachments_dir} ({len(local_paths)} downloaded)")

    return md_path


if __name__ == "__main__":
    main()