Files
daymade 22ec9f0d59 feat(twitter-reader): add fetch_article.py for X Articles with images
- Use twitter-cli for structured metadata (likes, retweets, bookmarks)
- Use Jina API for content with images
- Auto-download all images to attachments/
- Generate Markdown with YAML frontmatter and local image references
- Security scan passed

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-06 16:31:33 +08:00

248 lines
7.6 KiB
Python

#!/usr/bin/env python3
"""
Fetch Twitter/X Article with images using twitter-cli.
Usage:
python fetch_article.py <article_url> [output_dir]
Example:
python fetch_article.py https://x.com/HiTw93/status/2040047268221608281 ./Clippings
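Features:
- Fetches structured metadata (title, author, metrics) via twitter-cli
- Fetches article content with image links via the Jina API (r.jina.ai);
  set the optional JINA_API_KEY environment variable to send an API key
- Downloads all images to an attachments folder
- Generates Markdown with YAML frontmatter and local image references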
"""
import sys
import os
import re
import subprocess
import argparse
from pathlib import Path
from datetime import datetime
def run_twitter_cli(url: str) -> dict:
    """Fetch article data by running twitter-cli through ``uv run``.

    Args:
        url: Article/status URL handed to ``twitter article``.

    Returns:
        Whatever ``parse_yaml_output`` extracts from the command's stdout.

    Exits the process with status 1 (after printing to stderr) when the
    command cannot be started or finishes with a non-zero exit code.
    """
    cmd = ["uv", "run", "--with", "twitter-cli", "twitter", "article", url]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # Raised when the `uv` executable is missing or not runnable; report
        # it like every other failure in this script instead of a traceback.
        print(f"Error fetching article: {exc}", file=sys.stderr)
        sys.exit(1)
    if result.returncode != 0:
        print(f"Error fetching article: {result.stderr}", file=sys.stderr)
        sys.exit(1)
    return parse_yaml_output(result.stdout)
def run_jina_api(url: str) -> str:
    """Fetch the article as text (with image links) through the Jina API.

    The request is made with ``curl`` against ``https://r.jina.ai/<url>``.
    When the ``JINA_API_KEY`` environment variable is set, it is sent as a
    bearer token.

    Args:
        url: Article URL to convert.

    Returns:
        The response body, or an empty string when the request failed so the
        caller can fall back to another text source.
    """
    api_key = os.getenv("JINA_API_KEY", "")
    jina_url = f"https://r.jina.ai/{url}"
    # -f: treat HTTP 4xx/5xx as failures so an error body is never returned as
    #     article content; -S: keep curl's error message on stderr despite -s.
    cmd = ["curl", "-sSf", jina_url]
    if api_key:
        # NOTE(review): the key is passed on the command line and is therefore
        # visible in process listings while curl runs.
        cmd.extend(["-H", f"Authorization: Bearer {api_key}"])
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # curl missing / not executable: this step is best-effort, so degrade
        # to "no content" rather than crashing.
        print(f"Warning: Jina API failed: {exc}", file=sys.stderr)
        return ""
    if result.returncode != 0:
        print(f"Warning: Jina API failed: {result.stderr}", file=sys.stderr)
        return ""
    return result.stdout
def parse_yaml_output(output: str) -> dict:
    """Parse twitter-cli YAML output into a dict.

    If the document is a mapping with a truthy ``ok`` key and a ``data`` key,
    the ``data`` value is returned; otherwise the whole mapping is returned.

    Args:
        output: YAML text printed by twitter-cli.

    Returns:
        The parsed mapping (or its ``data`` payload).

    Exits the process with status 1 (after printing to stderr) when PyYAML is
    not installed, the text cannot be parsed, or the result is not a mapping
    (for example, empty output).
    """
    try:
        import yaml
    except ImportError:
        print("Error: PyYAML required. Install with: uv pip install pyyaml", file=sys.stderr)
        sys.exit(1)

    try:
        data = yaml.safe_load(output)
    except Exception as e:
        # Deliberately broad: any loader failure is fatal for this script and
        # is reported uniformly.
        print(f"Error parsing YAML: {e}", file=sys.stderr)
        sys.exit(1)

    payload = data
    if isinstance(data, dict) and data.get("ok") and "data" in data:
        payload = data["data"]

    # Callers use dict methods on the result; fail here with a clear message
    # instead of an AttributeError further down.
    if not isinstance(payload, dict):
        print(
            f"Error parsing YAML: expected a mapping, got {type(payload).__name__}",
            file=sys.stderr,
        )
        sys.exit(1)
    return payload
def extract_image_urls(text: str) -> list:
    """Collect the pbs.twimg.com media URLs found in *text*.

    Any query string is dropped, duplicates (by query-less URL) are removed
    keeping first-seen order, and each result gets the
    ``?format=jpg&name=large`` suffix.
    """
    # Host is pbs.twimg.com (twimg, not twitter).
    found = re.findall(r'https://pbs\.twimg\.com/media/[^\s\)"\']+', text)
    # dict.fromkeys de-duplicates while preserving insertion order.
    unique_bases = dict.fromkeys(hit.split('?')[0] for hit in found)
    return [f"{base}?format=jpg&name=large" for base in unique_bases]
def download_images(image_urls: list, attachments_dir: Path) -> list:
    """Download images with curl and return the local paths that succeeded.

    Files are named ``NN-image.jpg`` after the 1-based position of the URL in
    *image_urls*. Failed downloads are reported and omitted, so the returned
    list can be shorter than *image_urls*.

    Args:
        image_urls: Remote image URLs to fetch.
        attachments_dir: Target directory; created if it does not exist.

    Returns:
        Paths of the form ``attachments/<attachments_dir.name>/<filename>``
        for each image that was downloaded.
    """
    attachments_dir.mkdir(parents=True, exist_ok=True)
    local_paths = []
    for i, url in enumerate(image_urls, 1):
        filename = f"{i:02d}-image.jpg"
        filepath = attachments_dir / filename
        # -f makes curl fail on HTTP 4xx/5xx instead of saving the error page
        # under an image filename and reporting success.
        cmd = ["curl", "-fsSL", url, "-o", str(filepath)]
        try:
            result = subprocess.run(cmd, capture_output=True)
            downloaded = (
                result.returncode == 0
                and filepath.exists()
                and filepath.stat().st_size > 0
            )
        except OSError:
            # curl missing / not executable: count as a failed download.
            downloaded = False
        if downloaded:
            local_paths.append(f"attachments/{attachments_dir.name}/{filename}")
            print(f"{filename}")
        else:
            print(f" ✗ Failed: {filename}")
    return local_paths
def replace_image_urls(text: str, image_urls: list, local_paths: list) -> str:
    """Replace remote image URLs in *text* with local paths.

    ``image_urls`` and ``local_paths`` are consumed pairwise. For each pair,
    every occurrence of the URL (ignoring its query string, and with any
    query string found in the text) is replaced by the local path.

    Args:
        text: Markdown text containing remote image URLs.
        image_urls: Remote URLs; anything from ``?`` onwards is ignored.
        local_paths: Replacement path for the URL at the same index.

    Returns:
        The text with matching URLs replaced.
    """
    for remote_url, local_path in zip(image_urls, local_paths):
        base_url = remote_url.split('?')[0]
        pattern = (
            re.escape(base_url)
            # Optional query string: stop at whitespace, ')' or quotes so a
            # bare URL cannot swallow text up to some later ')'.
            + r'(?:\?[^\s)"\']*)?'
            # Do not match when base_url is only a prefix of a longer URL.
            + r'(?![\w.\-])'
        )
        # A function replacement inserts local_path literally (no backslash
        # or group-reference interpretation).
        text = re.sub(pattern, lambda _match, repl=local_path: repl, text)
    return text
def sanitize_filename(name: str) -> str:
    """Turn *name* into a string usable in a filename.

    Characters other than word characters, whitespace, ``-`` and CJK
    ideographs are dropped; surrounding whitespace is trimmed; each remaining
    whitespace run becomes a single ``-``; the result is cut to 60 characters.
    """
    max_length = 60
    kept = re.sub(r'[^\w\s\-\u4e00-\u9fff]', '', name)
    trimmed = kept.strip()
    dashed = re.sub(r'\s+', '-', trimmed)
    return dashed[:max_length]
def generate_markdown(data: dict, text: str, image_urls: list, local_paths: list, source_url: str) -> str:
    """Build the Markdown document: YAML frontmatter, title heading, body.

    Args:
        data: Article metadata. Keys read here: ``createdAtLocal``,
            ``author`` (mapping with ``name``), ``metrics`` (mapping with
            ``likes``/``retweets``/``bookmarks``) and ``articleTitle``.
        text: Article body in Markdown.
        image_urls: Remote image URLs present in *text*.
        local_paths: Local replacements, paired by index with *image_urls*.
        source_url: Original article URL, recorded in the frontmatter.

    Returns:
        The complete Markdown document.
    """
    import json  # local import: only used to quote the author scalar below

    # The date is the first 10 characters of createdAtLocal; str() keeps this
    # working if the YAML loader produced a date/datetime object instead of a
    # string. Falls back to today when the field is missing.
    created = data.get("createdAtLocal")
    if created:
        date_str = str(created)[:10]
    else:
        date_str = datetime.now().strftime("%Y-%m-%d")

    # `or {}` also covers keys that are present but null.
    author = data.get("author") or {}
    metrics = data.get("metrics") or {}
    title = data.get("articleTitle", "Untitled")

    # Display names are free text; emit a double-quoted scalar so characters
    # such as ": " or "#" cannot break the frontmatter. A JSON string literal
    # is also a valid YAML double-quoted scalar.
    author_name = json.dumps(str(author.get("name") or ""), ensure_ascii=False)

    md = (
        "---\n"
        f"source: {source_url}\n"
        f"author: {author_name}\n"
        f"date: {date_str}\n"
        f"likes: {metrics.get('likes', 0)}\n"
        f"retweets: {metrics.get('retweets', 0)}\n"
        f"bookmarks: {metrics.get('bookmarks', 0)}\n"
        "---\n"
        f"# {title}\n"
    )

    # Point image references at the downloaded copies.
    if image_urls and local_paths:
        text = replace_image_urls(text, image_urls, local_paths)
    md += text
    return md
def main():
    """Command-line entry point: fetch an article and save it as Markdown.

    Steps: validate the URL, fetch metadata with twitter-cli, fetch the body
    through the Jina API (falling back to the twitter-cli ``articleText``),
    download the referenced images into ``<output_dir>/attachments/<name>/``,
    then write ``<output_dir>/<date>-<title>.md``.

    Returns:
        Path of the Markdown file that was written.

    Exits with status 1 when the URL is not an x.com/twitter.com URL or the
    metadata contains no article title.
    """
    parser = argparse.ArgumentParser(description="Fetch Twitter/X Article with images")
    parser.add_argument("url", help="Twitter/X article URL")
    parser.add_argument("output_dir", nargs="?", default=".", help="Output directory (default: current)")
    args = parser.parse_args()

    if not args.url.startswith(("https://x.com/", "https://twitter.com/")):
        print("Error: URL must be from x.com or twitter.com", file=sys.stderr)
        sys.exit(1)

    print(f"Fetching: {args.url}")
    print("-" * 50)

    # Fetch metadata from twitter-cli
    print("Getting metadata...")
    data = run_twitter_cli(args.url)
    title = data.get("articleTitle", "")
    if not title:
        print("Error: Could not fetch article data", file=sys.stderr)
        sys.exit(1)

    # `or {}` also covers keys that are present but null.
    author = data.get("author") or {}
    metrics = data.get("metrics") or {}
    print(f"Title: {title}")
    print(f"Author: {author.get('name', 'Unknown')}")
    print(f"Likes: {metrics.get('likes', 0)}")

    # Fetch content with images from Jina API
    print("\nGetting content and images...")
    jina_content = run_jina_api(args.url)

    # Use Jina content if available, otherwise fall back to twitter-cli text
    if jina_content:
        text = jina_content
        # Drop the Jina header: keep only what follows "Markdown Content:".
        marker = "Markdown Content:"
        idx = text.find(marker)
        if idx != -1:
            text = text[idx + len(marker):].lstrip()
    else:
        text = data.get("articleText", "")

    # Extract image URLs
    image_urls = extract_image_urls(text)
    print(f"Images: {len(image_urls)}")

    # Setup output paths
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Name the attachments folder after date, author and (shortened) title.
    # str() keeps the slice working if the date was parsed into a non-string.
    created = data.get("createdAtLocal")
    date_str = str(created)[:10] if created else datetime.now().strftime("%Y-%m-%d")
    safe_author = sanitize_filename(author.get("screenName") or "unknown")
    safe_title = sanitize_filename(title)
    attachments_name = f"{date_str}-{safe_author}-{safe_title[:30]}"
    attachments_dir = output_dir / "attachments" / attachments_name

    # Download images
    local_paths = []
    downloaded_urls = []
    if image_urls:
        print(f"\nDownloading {len(image_urls)} images...")
        local_paths = download_images(image_urls, attachments_dir)

        # download_images() leaves failed downloads out of its result, so the
        # two lists cannot simply be paired by index. Files are named
        # "NN-image.jpg" after the 1-based URL position; use that to pair
        # each downloaded file with the URL it actually came from.
        by_name = {Path(p).name: p for p in local_paths}
        matched_paths = []
        for position, url in enumerate(image_urls, 1):
            path = by_name.get(f"{position:02d}-image.jpg")
            if path is not None:
                downloaded_urls.append(url)
                matched_paths.append(path)
        local_paths = matched_paths

    # Generate Markdown (only successfully downloaded images are rewritten)
    md_content = generate_markdown(data, text, downloaded_urls, local_paths, args.url)

    # Save Markdown
    md_filename = f"{date_str}-{safe_title}.md"
    md_path = output_dir / md_filename
    md_path.write_text(md_content, encoding="utf-8")

    print(f"\n✓ Saved: {md_path}")
    if local_paths:
        print(f"✓ Images: {attachments_dir} ({len(local_paths)} downloaded)")
    return md_path
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()