feat(twitter-reader): add fetch_article.py for X Articles with images
- Use twitter-cli for structured metadata (likes, retweets, bookmarks)
- Use Jina API for content with images
- Auto-download all images to attachments/
- Generate Markdown with YAML frontmatter and local image references
- Security scan passed

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
247
twitter-reader/scripts/fetch_article.py
Normal file
247
twitter-reader/scripts/fetch_article.py
Normal file
@@ -0,0 +1,247 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fetch a Twitter/X Article with images using twitter-cli (metadata) and the Jina API (content).
|
||||
|
||||
Usage:
|
||||
python fetch_article.py <article_url> [output_dir]
|
||||
|
||||
Example:
|
||||
python fetch_article.py https://x.com/HiTw93/status/2040047268221608281 ./Clippings
|
||||
|
||||
Features:
|
||||
- Fetches structured data via twitter-cli
|
||||
- Downloads all images to attachments folder
|
||||
- Generates Markdown with embedded image references
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def run_twitter_cli(url: str) -> dict:
    """Fetch article data for *url* by running twitter-cli through ``uv run``.

    Args:
        url: Article URL passed to the ``twitter article`` sub-command.

    Returns:
        Whatever ``parse_yaml_output`` produces from the command's stdout.

    The process exits with status 1 (after printing to stderr) when the
    command cannot be started or finishes with a non-zero exit code.
    """
    cmd = ["uv", "run", "--with", "twitter-cli", "twitter", "article", url]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # Typically FileNotFoundError when `uv` is not on PATH; report it the
        # same way as any other fetch failure instead of dumping a traceback.
        print(f"Error fetching article: could not run uv: {exc}", file=sys.stderr)
        sys.exit(1)

    if result.returncode != 0:
        print(f"Error fetching article: {result.stderr}", file=sys.stderr)
        sys.exit(1)

    return parse_yaml_output(result.stdout)
|
||||
|
||||
|
||||
def run_jina_api(url: str) -> str:
    """Fetch the article text (with image links) for *url* through the Jina API.

    The request is made with ``curl`` against ``https://r.jina.ai/<url>``. When
    the ``JINA_API_KEY`` environment variable is set, it is sent as a Bearer
    token.

    Args:
        url: Article URL to hand to the Jina endpoint.

    Returns:
        The response body, or ``""`` when the request fails. Failure is
        best-effort: a warning is printed to stderr and the caller is expected
        to fall back to another source.
    """
    api_key = os.getenv("JINA_API_KEY", "")
    jina_url = f"https://r.jina.ai/{url}"

    # -f: make HTTP errors (>= 400) a non-zero exit so an error page is never
    #     mistaken for article content.
    # -S: keep curl's error message on stderr even though -s silences progress,
    #     so the warning below actually says what went wrong.
    cmd = ["curl", "-sSf", jina_url]
    if api_key:
        cmd.extend(["-H", f"Authorization: Bearer {api_key}"])

    try:
        # Decode explicitly as UTF-8 rather than relying on the locale encoding.
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
        )
    except OSError as exc:
        # e.g. curl is not installed; stay best-effort like any other failure.
        print(f"Warning: Jina API failed: {exc}", file=sys.stderr)
        return ""

    if result.returncode != 0:
        print(f"Warning: Jina API failed: {result.stderr}", file=sys.stderr)
        return ""

    return result.stdout
|
||||
|
||||
|
||||
def parse_yaml_output(output: str) -> dict:
    """Parse twitter-cli YAML output into a dict.

    When the parsed document is a mapping with a truthy ``ok`` key and a
    ``data`` key, the value under ``data`` is returned; otherwise the parsed
    document itself is returned.

    Args:
        output: YAML text captured from twitter-cli.

    Returns:
        The parsed mapping.

    The process exits with status 1 (after printing to stderr) when PyYAML is
    not installed, the text is not valid YAML, or the result is not a mapping
    (for example when *output* is empty).
    """
    try:
        import yaml
    except ImportError:
        print("Error: PyYAML required. Install with: uv pip install pyyaml", file=sys.stderr)
        sys.exit(1)

    try:
        data = yaml.safe_load(output)
    except (yaml.YAMLError, ValueError) as e:
        # ValueError is kept alongside YAMLError as a guard for values that
        # match a typed pattern but cannot be constructed (e.g. a bad date).
        print(f"Error parsing YAML: {e}", file=sys.stderr)
        sys.exit(1)

    if isinstance(data, dict) and data.get("ok") and "data" in data:
        data = data["data"]

    if not isinstance(data, dict):
        # Callers use .get() on the result; fail here with a clear message
        # instead of an AttributeError further down.
        print(
            f"Error parsing YAML: expected a mapping, got {type(data).__name__}",
            file=sys.stderr,
        )
        sys.exit(1)

    return data
|
||||
|
||||
|
||||
def extract_image_urls(text: str) -> list:
    """Collect the distinct ``pbs.twimg.com/media`` image URLs found in *text*.

    Each URL is reduced to its part before ``?``; duplicates are dropped while
    keeping first-seen order, and every result is given the query string
    ``?format=jpg&name=large``.
    """
    # Host is pbs.twimg.com (twimg, not twitter).
    hits = re.finditer(r'https://pbs\.twimg\.com/media/[^\s\)"\']+', text)

    # dict keys preserve insertion order, which gives ordered de-duplication.
    unique_bases = dict.fromkeys(hit.group(0).partition('?')[0] for hit in hits)

    return [f"{base}?format=jpg&name=large" for base in unique_bases]
|
||||
|
||||
|
||||
def download_images(image_urls: list, attachments_dir: Path) -> list:
    """Download each image with ``curl`` and return the local reference paths.

    Files are written to *attachments_dir* (created if missing) as
    ``NN-image.jpg``, where ``NN`` is the 1-based position of the URL in
    *image_urls*.

    Args:
        image_urls: Remote image URLs to fetch.
        attachments_dir: Directory that receives the downloaded files.

    Returns:
        One ``attachments/<attachments_dir name>/<filename>`` string per image
        that was downloaded successfully. Failed downloads are omitted, so the
        result can be shorter than *image_urls*.
    """
    attachments_dir.mkdir(parents=True, exist_ok=True)
    local_paths = []

    for i, url in enumerate(image_urls, 1):
        filename = f"{i:02d}-image.jpg"
        filepath = attachments_dir / filename

        # -f: an HTTP error must fail the command rather than save the error
        # page as a non-empty "image" file.
        cmd = ["curl", "-sfL", url, "-o", str(filepath)]
        try:
            fetched = subprocess.run(cmd, capture_output=True).returncode == 0
        except OSError:
            # e.g. curl is not installed; treat like any other failed download.
            fetched = False

        if fetched and filepath.exists() and filepath.stat().st_size > 0:
            local_paths.append(f"attachments/{attachments_dir.name}/{filename}")
            print(f" ✓ {filename}")
        else:
            # Do not leave an empty or partial file behind under this name.
            try:
                if filepath.exists():
                    filepath.unlink()
            except OSError:
                pass
            print(f" ✗ Failed: {filename}")

    return local_paths
|
||||
|
||||
|
||||
def _pair_urls_with_paths(image_urls: list, local_paths: list) -> list:
    """Pair each local path with the remote URL it was downloaded from.

    ``download_images`` names files ``NN-image.jpg`` after the 1-based position
    of the URL and omits failed downloads, so pairing by position would shift
    every path after a failure onto the wrong URL. When every local path
    carries such a number (unique and in range), pair by that number; otherwise
    fall back to pairing by position.
    """
    numbered = re.compile(r'(?:^|[/\\])(\d+)-image\.[^/\\]*$')

    indices = []
    for local_path in local_paths:
        found = numbered.search(str(local_path))
        if found is None:
            return list(zip(image_urls, local_paths))
        indices.append(int(found.group(1)) - 1)

    in_range = all(0 <= index < len(image_urls) for index in indices)
    if not in_range or len(set(indices)) != len(indices):
        return list(zip(image_urls, local_paths))

    return [(image_urls[index], path) for index, path in zip(indices, local_paths)]


def replace_image_urls(text: str, image_urls: list, local_paths: list) -> str:
    """Replace remote image URLs in markdown *text* with local paths.

    Args:
        text: Markdown that references the remote images.
        image_urls: Remote URLs; only the part before ``?`` is used to match.
        local_paths: Local reference paths for the downloaded images.

    Returns:
        *text* with every occurrence of a paired URL (with or without a query
        string) replaced by its local path. URLs without a local path are left
        untouched.
    """
    for remote_url, local_path in _pair_urls_with_paths(image_urls, local_paths):
        base_url = remote_url.split('?')[0]

        # The lookahead stops the base URL from matching as a mere prefix of a
        # longer URL. The optional query ends at whitespace, ')' or a quote so
        # it cannot swallow prose that follows a bare URL.
        pattern = re.escape(base_url) + r'(?![^\s\)"\'?])(?:\?[^\s\)"\']*)?'

        # A callable replacement inserts the path literally; a plain string
        # would have backslashes interpreted as regex escapes/group references.
        text = re.sub(pattern, lambda _match, path=local_path: path, text)

    return text
|
||||
|
||||
|
||||
def sanitize_filename(name: str) -> str:
    """Turn *name* into a string that is safe to use in a filename.

    Characters other than word characters, whitespace, ``-`` and CJK
    ideographs are removed, each whitespace run becomes a single ``-``, and
    the result is cut to at most 60 characters.
    """
    kept = re.sub(r'[^\w\s\-\u4e00-\u9fff]', '', name)
    # split() with no argument trims both ends and collapses whitespace runs.
    hyphenated = '-'.join(kept.split())
    return hyphenated[:60]
|
||||
|
||||
|
||||
def _yaml_scalar(value) -> str:
    """Render *value* as a one-line YAML scalar, quoting it only when needed.

    Simple text (starts with a letter, then only word characters, spaces, dots
    and hyphens) is emitted unchanged. Anything else is emitted as a JSON
    string, which is also a valid YAML double-quoted scalar, so characters
    such as ``:``, ``#``, quotes or newlines cannot break the frontmatter.
    """
    import json  # local import: json is not among this file's top-level imports

    text = "" if value is None else str(value)
    if not text:
        return text

    reserved = {"true", "false", "yes", "no", "on", "off", "null", "y", "n"}
    is_plain = (
        re.fullmatch(r"[^\W\d_][\w .\-]*", text) is not None
        and not text.endswith(" ")
        and text.lower() not in reserved
    )
    return text if is_plain else json.dumps(text, ensure_ascii=False)


def generate_markdown(data: dict, text: str, image_urls: list, local_paths: list, source_url: str) -> str:
    """Generate the complete Markdown document for an article.

    Args:
        data: Article metadata; the keys read are ``createdAtLocal``,
            ``author`` (its ``name``), ``metrics`` (``likes``, ``retweets``,
            ``bookmarks``) and ``articleTitle``.
        text: Article body in Markdown.
        image_urls: Remote image URLs referenced by *text*.
        local_paths: Local paths of the downloaded images.
        source_url: Original article URL, recorded in the frontmatter.

    Returns:
        YAML frontmatter, a ``# <title>`` heading and the body. Remote image
        URLs are replaced with local paths when both lists are non-empty.
    """
    # Date: first 10 characters of createdAtLocal, else today's date.
    # str() also covers a value that is not a plain string — presumably
    # possible if the YAML loader turned a timestamp into a date object.
    created = data.get("createdAtLocal", "")
    if created:
        date_str = str(created)[:10]
    else:
        date_str = datetime.now().strftime("%Y-%m-%d")

    # `or {}` also covers keys that are present but null.
    author = data.get("author") or {}
    metrics = data.get("metrics") or {}
    title = data.get("articleTitle", "Untitled")

    # Build frontmatter
    md = f"""---
source: {source_url}
author: {_yaml_scalar(author.get("name", ""))}
date: {date_str}
likes: {metrics.get("likes", 0)}
retweets: {metrics.get("retweets", 0)}
bookmarks: {metrics.get("bookmarks", 0)}
---

# {title}

"""

    # Replace image URLs with local paths
    if image_urls and local_paths:
        text = replace_image_urls(text, image_urls, local_paths)

    md += text
    return md
|
||||
|
||||
|
||||
def _strip_jina_header(content: str) -> str:
    """Return *content* without the Jina header that precedes "Markdown Content:".

    When the marker is absent, *content* is returned unchanged.
    """
    marker = "Markdown Content:"
    idx = content.find(marker)
    if idx == -1:
        return content
    return content[idx + len(marker):].lstrip()


def main():
    """Command-line entry point: fetch an article and save it as Markdown.

    Steps: validate the URL, fetch metadata with twitter-cli, fetch the body
    through the Jina API (falling back to the twitter-cli text), download the
    referenced images into ``<output_dir>/attachments/<name>/`` and write
    ``<output_dir>/<date>-<title>.md``.

    Returns:
        Path of the Markdown file that was written.

    Exits with status 1 when the URL is not on x.com / twitter.com or no
    article title could be fetched.
    """
    parser = argparse.ArgumentParser(description="Fetch Twitter/X Article with images")
    parser.add_argument("url", help="Twitter/X article URL")
    parser.add_argument("output_dir", nargs="?", default=".", help="Output directory (default: current)")
    args = parser.parse_args()

    if not args.url.startswith(("https://x.com/", "https://twitter.com/")):
        print("Error: URL must be from x.com or twitter.com", file=sys.stderr)
        sys.exit(1)

    print(f"Fetching: {args.url}")
    print("-" * 50)

    # Fetch metadata from twitter-cli
    print("Getting metadata...")
    data = run_twitter_cli(args.url)

    title = data.get("articleTitle", "")
    if not title:
        print("Error: Could not fetch article data", file=sys.stderr)
        sys.exit(1)

    # `or {}` also covers keys that are present but null.
    author = data.get("author") or {}
    metrics = data.get("metrics") or {}

    print(f"Title: {title}")
    print(f"Author: {author.get('name', 'Unknown')}")
    print(f"Likes: {metrics.get('likes', 0)}")

    # Fetch content with images from Jina API
    print("\nGetting content and images...")
    jina_content = run_jina_api(args.url)

    # Use Jina content if it yields any text, otherwise fall back to the
    # twitter-cli text (also when only the Jina header came back).
    text = _strip_jina_header(jina_content) if jina_content else ""
    if not text.strip():
        text = data.get("articleText") or ""

    # Extract image URLs
    image_urls = extract_image_urls(text)
    print(f"Images: {len(image_urls)}")

    # Setup output paths
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Name the attachments folder <date>-<author>-<title prefix>.
    # str() also covers a createdAtLocal value that is not a plain string.
    created = data.get("createdAtLocal")
    date_str = str(created)[:10] if created else datetime.now().strftime("%Y-%m-%d")
    safe_author = sanitize_filename(str(author.get("screenName") or "unknown"))
    safe_title = sanitize_filename(title)
    attachments_name = f"{date_str}-{safe_author}-{safe_title[:30]}"
    attachments_dir = output_dir / "attachments" / attachments_name

    # Download images
    local_paths = []
    if image_urls:
        print(f"\nDownloading {len(image_urls)} images...")
        local_paths = download_images(image_urls, attachments_dir)

    # Generate Markdown
    md_content = generate_markdown(data, text, image_urls, local_paths, args.url)

    # Save Markdown
    md_filename = f"{date_str}-{safe_title}.md"
    md_path = output_dir / md_filename
    md_path.write_text(md_content, encoding="utf-8")

    print(f"\n✓ Saved: {md_path}")
    if local_paths:
        print(f"✓ Images: {attachments_dir} ({len(local_paths)} downloaded)")

    return md_path
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user