feat(twitter-reader): add fetch_article.py for X Articles with images

- Use twitter-cli for structured metadata (likes, retweets, bookmarks)
- Use Jina API for content with images
- Auto-download all images to attachments/
- Generate Markdown with YAML frontmatter and local image references
- Security scan passed

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-06 16:31:33 +08:00
parent 673980639b
commit 22ec9f0d59
3 changed files with 377 additions and 46 deletions
--- a/twitter-reader/scripts/fetch_article.py
+++ b/twitter-reader/scripts/fetch_article.py
@@ -0,0 +1,247 @@
+#!/usr/bin/env python3
+"""
+Fetch a Twitter/X Article with images using twitter-cli and the Jina API.
+
+Usage:
+    python fetch_article.py <article_url> [output_dir]
+
+Example:
+    python fetch_article.py https://x.com/HiTw93/status/2040047268221608281 ./Clippings
+
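+Features:
+    - Fetches structured metadata via twitter-cli
+    - Fetches article content via the Jina API (uses JINA_API_KEY when set),
+      falling back to the twitter-cli article text
+    - Downloads all images to attachments folder
+    - Generates Markdown with YAML frontmatter and embedded image references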
+"""
+
+import sys
+import os
+import re
+import subprocess
+import argparse
+from pathlib import Path
+from datetime import datetime
+
+
+def run_twitter_cli(url: str) -> dict:
+    """Fetch article data using twitter-cli via uv run.
+
+    Runs ``uv run --with twitter-cli twitter article <url>`` and hands the
+    captured stdout to ``parse_yaml_output``.
+
+    Args:
+        url: Twitter/X article URL, passed through to twitter-cli unchanged.
+
+    Returns:
+        Whatever ``parse_yaml_output`` produces for the command's stdout.
+
+    Exits the process with status 1 (after printing to stderr) when ``uv``
+    cannot be launched or the command finishes with a non-zero exit code.
+    """
+    cmd = ["uv", "run", "--with", "twitter-cli", "twitter", "article", url]
+
+    try:
+        # Decode explicitly as UTF-8 so non-ASCII article text does not depend
+        # on the locale's default encoding.
+        result = subprocess.run(
+            cmd, capture_output=True, text=True, encoding="utf-8", errors="replace"
+        )
+    except OSError as e:
+        # e.g. FileNotFoundError when uv is not installed: report it the same
+        # way as every other failure here instead of dumping a traceback.
+        print(f"Error fetching article: could not run uv: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    if result.returncode != 0:
+        print(f"Error fetching article: {result.stderr}", file=sys.stderr)
+        sys.exit(1)
+
+    return parse_yaml_output(result.stdout)
+
+
+def run_jina_api(url: str) -> str:
+    """Fetch article text with images using Jina API.
+
+    Requests ``https://r.jina.ai/<url>`` with curl. When the ``JINA_API_KEY``
+    environment variable is set it is sent as a Bearer token.
+
+    Args:
+        url: Article URL to append to the Jina endpoint.
+
+    Returns:
+        The response body, or ``""`` (after a warning on stderr) when curl
+        cannot be run, exits non-zero, or the server answers with an HTTP
+        error status. Callers treat the empty string as "use the fallback".
+    """
+    api_key = os.getenv("JINA_API_KEY", "")
+    jina_url = f"https://r.jina.ai/{url}"
+
+    # --fail: exit non-zero on HTTP >= 400 so an error page is never mistaken
+    # for article content. -S keeps the error message on stderr despite -s.
+    cmd = ["curl", "-sS", "--fail", jina_url]
+    if api_key:
+        # NOTE(review): the token is passed on the command line and is
+        # therefore visible in the process list while curl runs.
+        cmd.extend(["-H", f"Authorization: Bearer {api_key}"])
+
+    try:
+        result = subprocess.run(
+            cmd, capture_output=True, text=True, encoding="utf-8", errors="replace"
+        )
+    except OSError as e:
+        # curl missing or not executable: this fetch is best-effort, so warn
+        # and let the caller fall back rather than crash.
+        print(f"Warning: Jina API failed: {e}", file=sys.stderr)
+        return ""
+
+    if result.returncode != 0:
+        print(f"Warning: Jina API failed: {result.stderr}", file=sys.stderr)
+        return ""
+
+    return result.stdout
+
+
+def parse_yaml_output(output: str) -> dict:
+    """Parse twitter-cli YAML output into dict.
+
+    When the parsed document has a truthy ``ok`` key and a ``data`` key, the
+    value under ``data`` is returned; otherwise the whole document is.
+
+    Args:
+        output: YAML text captured from twitter-cli.
+
+    Returns:
+        The selected mapping.
+
+    Exits the process with status 1 (after printing to stderr) when PyYAML
+    is not installed, the text cannot be parsed, or the selected value is
+    not a mapping (for example, empty output parses to ``None``).
+    """
+    try:
+        import yaml
+    except ImportError:
+        print("Error: PyYAML required. Install with: uv pip install pyyaml", file=sys.stderr)
+        sys.exit(1)
+
+    try:
+        data = yaml.safe_load(output)
+    except Exception as e:
+        # Deliberately broad, but limited to the single parsing call: this is
+        # the CLI's error boundary for malformed twitter-cli output.
+        print(f"Error parsing YAML: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    payload = data
+    if isinstance(data, dict) and data.get("ok") and "data" in data:
+        payload = data["data"]
+
+    # Callers use .get() on the result, so anything but a mapping is an error.
+    if not isinstance(payload, dict):
+        print(
+            f"Error parsing YAML: expected a mapping, got {type(payload).__name__}",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    return payload
+
+
+def extract_image_urls(text: str) -> list:
+    """Collect the unique pbs.twimg.com media URLs found in *text*.
+
+    Every match has its query string dropped and ``?format=jpg&name=large``
+    appended. Variants of one image that differ only in the query string
+    collapse to a single entry; order of first appearance is preserved.
+    """
+    # The image host is pbs.twimg.com (twimg, not twitter).
+    found = re.findall(r'https://pbs\.twimg\.com/media/[^\s\)"\']+', text)
+
+    # dict.fromkeys keeps the first occurrence of each query-less URL, in order.
+    unique_bases = dict.fromkeys(match.split('?')[0] for match in found)
+
+    return [f"{base}?format=jpg&name=large" for base in unique_bases]
+
+
+def download_images(image_urls: list, attachments_dir: Path) -> list:
+    """Download images and return list of local paths.
+
+    Image number ``i`` (1-based position in *image_urls*) is saved as
+    ``<attachments_dir>/NN-image.jpg`` using curl. A ✓ or ✗ line is printed
+    for each one.
+
+    Args:
+        image_urls: Remote image URLs, in document order.
+        attachments_dir: Directory to store the files in; created if missing.
+
+    Returns:
+        ``attachments/<attachments_dir.name>/<filename>`` for each image that
+        downloaded successfully. Failed downloads are omitted, so the result
+        can be shorter than *image_urls*.
+    """
+    attachments_dir.mkdir(parents=True, exist_ok=True)
+    local_paths = []
+
+    for i, url in enumerate(image_urls, 1):
+        filename = f"{i:02d}-image.jpg"
+        filepath = attachments_dir / filename
+
+        # --fail: without it an HTTP error page would be written to the .jpg
+        # file and counted as a successful download.
+        cmd = ["curl", "-sL", "--fail", url, "-o", str(filepath)]
+        try:
+            succeeded = subprocess.run(cmd, capture_output=True).returncode == 0
+        except OSError:
+            # curl missing or not executable: treat as a failed download.
+            succeeded = False
+
+        if succeeded and filepath.exists() and filepath.stat().st_size > 0:
+            local_paths.append(f"attachments/{attachments_dir.name}/{filename}")
+            print(f"  ✓ {filename}")
+        else:
+            # Do not leave a zero-byte placeholder behind.
+            if filepath.exists() and filepath.stat().st_size == 0:
+                filepath.unlink()
+            print(f"  ✗ Failed: {filename}")
+
+    return local_paths
+
+
+def replace_image_urls(text: str, image_urls: list, local_paths: list) -> str:
+    """Replace remote image URLs with local paths in markdown text.
+
+    Each local path is matched to its remote URL through the 1-based
+    ``NN-`` prefix of its file name (``03-image.jpg`` belongs to
+    ``image_urls[2]``). This keeps the pairing correct when *local_paths*
+    is shorter than *image_urls* because some downloads failed. If any
+    local path lacks a usable numeric prefix, the two lists are paired by
+    position instead.
+
+    Args:
+        text: Markdown text containing remote image URLs.
+        image_urls: Remote URLs; anything from ``?`` onward is ignored.
+        local_paths: Local replacement paths.
+
+    Returns:
+        *text* with every occurrence of a paired URL (with any query string)
+        replaced by its local path. Unpaired URLs are left untouched.
+    """
+    pairs = []
+    for local_path in local_paths:
+        prefix = re.match(r'(\d+)-', local_path.rsplit('/', 1)[-1])
+        index = int(prefix.group(1)) if prefix else 0
+        if not 1 <= index <= len(image_urls):
+            # No reliable index available: fall back to positional pairing.
+            pairs = list(zip(image_urls, local_paths))
+            break
+        pairs.append((image_urls[index - 1], local_path))
+
+    for remote_url, local_path in pairs:
+        base_url = remote_url.split('?')[0]
+        # The optional query stops at whitespace, ')' or a quote so that a
+        # bare URL does not swallow the text that follows it.
+        pattern = re.escape(base_url) + r'(?:\?[^\s\)"\']*)?'
+        # A callable replacement inserts the path literally (no backslash or
+        # group-reference processing).
+        text = re.sub(pattern, lambda _match, repl=local_path: repl, text)
+    return text
+
+
+def sanitize_filename(name: str) -> str:
+    """Sanitize string for use in filename.
+
+    Drops every character that is not a word character, whitespace, a
+    hyphen, or in the CJK range U+4E00–U+9FFF (which also removes path
+    separators and dots), collapses whitespace runs into single hyphens,
+    and limits the result to 60 characters.
+
+    Args:
+        name: Arbitrary text such as an article title or screen name.
+
+    Returns:
+        The sanitized name, or ``"untitled"`` when nothing usable remains.
+    """
+    # Remove special chars, keep alphanumeric, CJK, and some safe chars
+    name = re.sub(r'[^\w\s\-\u4e00-\u9fff]', '', name)
+    name = re.sub(r'\s+', '-', name.strip())
+    # Limit length, then drop a hyphen left dangling by the cut.
+    name = name[:60].rstrip('-')
+    # Never hand back an empty file-name component.
+    return name or "untitled"
+
+
+def generate_markdown(data: dict, text: str, image_urls: list, local_paths: list, source_url: str) -> str:
+    """Generate complete Markdown document.
+
+    The document is a YAML frontmatter block (source, author, date, likes,
+    retweets, bookmarks), a level-1 heading with the article title, and
+    then *text*.
+
+    Args:
+        data: Article metadata; reads ``createdAtLocal``, ``author.name``,
+            ``metrics`` and ``articleTitle``. Missing or empty values fall
+            back to today's date, an empty author, zero counts and
+            ``"Untitled"``.
+        text: Article body in Markdown.
+        image_urls: Remote image URLs found in *text*.
+        local_paths: Local paths of the downloaded images.
+        source_url: Original article URL, written to the ``source`` field.
+
+    Returns:
+        The full Markdown document. Remote image URLs are rewritten to
+        local paths only when both image lists are non-empty.
+    """
+    import json  # local import: only used to quote the author scalar
+
+    created = data.get("createdAtLocal")
+    # str() keeps the slice valid if the value is not already a string
+    # (for instance a timestamp object produced by the YAML loader).
+    date_str = str(created)[:10] if created else datetime.now().strftime("%Y-%m-%d")
+
+    # `or` also covers keys that are present but null.
+    author = data.get("author") or {}
+    metrics = data.get("metrics") or {}
+    title = data.get("articleTitle") or "Untitled"
+
+    # Display names are free text; a JSON string is a valid double-quoted
+    # YAML scalar, so characters such as ':' or '#' cannot break the block.
+    author_yaml = json.dumps(str(author.get("name") or ""), ensure_ascii=False)
+
+    # Build frontmatter
+    header_lines = [
+        "---",
+        f"source: {source_url}",
+        f"author: {author_yaml}",
+        f"date: {date_str}",
+        f"likes: {metrics.get('likes', 0)}",
+        f"retweets: {metrics.get('retweets', 0)}",
+        f"bookmarks: {metrics.get('bookmarks', 0)}",
+        "---",
+        "",
+        f"# {title}",
+        "",
+        "",
+    ]
+    md = "\n".join(header_lines)
+
+    # Replace image URLs with local paths
+    if image_urls and local_paths:
+        text = replace_image_urls(text, image_urls, local_paths)
+
+    return md + text
+
+
+def main():
+    """CLI entry point: fetch one X Article and save it as Markdown.
+
+    Steps: parse arguments, validate the URL prefix, fetch metadata with
+    twitter-cli, fetch the body through the Jina API (falling back to the
+    twitter-cli ``articleText``), download the referenced images into
+    ``<output_dir>/attachments/<date>-<author>-<title>/`` and write
+    ``<output_dir>/<date>-<title>.md``.
+
+    Returns:
+        Path of the Markdown file that was written.
+
+    Exits with status 1 when the URL is not an x.com/twitter.com URL or no
+    article title could be obtained.
+    """
+    parser = argparse.ArgumentParser(description="Fetch Twitter/X Article with images")
+    parser.add_argument("url", help="Twitter/X article URL")
+    parser.add_argument("output_dir", nargs="?", default=".", help="Output directory (default: current)")
+    args = parser.parse_args()
+
+    if not args.url.startswith(("https://x.com/", "https://twitter.com/")):
+        print("Error: URL must be from x.com or twitter.com", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Fetching: {args.url}")
+    print("-" * 50)
+
+    # Fetch metadata from twitter-cli
+    print("Getting metadata...")
+    data = run_twitter_cli(args.url)
+
+    # A payload that is not a mapping, or has no title, is unusable.
+    title = data.get("articleTitle", "") if isinstance(data, dict) else ""
+    if not title:
+        print("Error: Could not fetch article data", file=sys.stderr)
+        sys.exit(1)
+
+    # `or {}` also covers keys that are present but null.
+    author = data.get("author") or {}
+    metrics = data.get("metrics") or {}
+
+    print(f"Title: {title}")
+    print(f"Author: {author.get('name', 'Unknown')}")
+    print(f"Likes: {metrics.get('likes', 0)}")
+
+    # Fetch content with images from Jina API
+    print("\nGetting content and images...")
+    jina_content = run_jina_api(args.url)
+
+    # Use Jina content if available, otherwise fall back to twitter-cli text
+    if jina_content:
+        text = jina_content
+        # Remove Jina header lines to get clean markdown:
+        # keep everything after the "Markdown Content:" marker when present.
+        marker = "Markdown Content:"
+        idx = text.find(marker)
+        if idx != -1:
+            text = text[idx + len(marker):].lstrip()
+    else:
+        text = data.get("articleText") or ""
+
+    # Extract image URLs
+    image_urls = extract_image_urls(text)
+    print(f"Images: {len(image_urls)}")
+
+    # Setup output paths
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Create attachments folder
+    created = data.get("createdAtLocal")
+    # str() keeps the slice valid if the value is not already a string.
+    date_str = str(created)[:10] if created else datetime.now().strftime("%Y-%m-%d")
+    safe_author = sanitize_filename(str(author.get("screenName") or "unknown"))
+    safe_title = sanitize_filename(str(title))
+    attachments_name = f"{date_str}-{safe_author}-{safe_title[:30]}"
+    attachments_dir = output_dir / "attachments" / attachments_name
+
+    # Download images
+    local_paths = []
+    if image_urls:
+        print(f"\nDownloading {len(image_urls)} images...")
+        local_paths = download_images(image_urls, attachments_dir)
+
+    # Generate Markdown
+    md_content = generate_markdown(data, text, image_urls, local_paths, args.url)
+
+    # Save Markdown
+    md_filename = f"{date_str}-{safe_title}.md"
+    md_path = output_dir / md_filename
+    md_path.write_text(md_content, encoding="utf-8")
+
+    print(f"\n✓ Saved: {md_path}")
+    if local_paths:
+        print(f"✓ Images: {attachments_dir} ({len(local_paths)} downloaded)")
+
+    return md_path
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()