"""OpenAI Responses API client for Reddit discovery.""" import json import re import sys from typing import Any, Dict, List, Optional from . import http def _log_error(msg: str): """Log error to stderr.""" sys.stderr.write(f"[REDDIT ERROR] {msg}\n") sys.stderr.flush() OPENAI_RESPONSES_URL = "https://api.openai.com/v1/responses" # Depth configurations: (min, max) threads to request # Request MORE than needed since many get filtered by date DEPTH_CONFIG = { "quick": (15, 25), "default": (30, 50), "deep": (70, 100), } REDDIT_SEARCH_PROMPT = """Find Reddit discussion threads about: {topic} STEP 1: EXTRACT THE CORE SUBJECT Get the MAIN NOUN/PRODUCT/TOPIC: - "best nano banana prompting practices" → "nano banana" - "killer features of clawdbot" → "clawdbot" - "top Claude Code skills" → "Claude Code" DO NOT include "best", "top", "tips", "practices", "features" in your search. STEP 2: SEARCH BROADLY Search for the core subject: 1. "[core subject] site:reddit.com" 2. "reddit [core subject]" 3. "[core subject] reddit" Return as many relevant threads as you find. We filter by date server-side. STEP 3: INCLUDE ALL MATCHES - Include ALL threads about the core subject - Set date to "YYYY-MM-DD" if you can determine it, otherwise null - We verify dates and filter old content server-side - DO NOT pre-filter aggressively - include anything relevant REQUIRED: URLs must contain "/r/" AND "/comments/" REJECT: developers.reddit.com, business.reddit.com Find {min_items}-{max_items} threads. Return MORE rather than fewer. Return JSON: {{ "items": [ {{ "title": "Thread title", "url": "https://www.reddit.com/r/sub/comments/xyz/title/", "subreddit": "subreddit_name", "date": "YYYY-MM-DD or null", "why_relevant": "Why relevant", "relevance": 0.85 }} ] }}""" def _extract_core_subject(topic: str) -> str: """Extract core subject from verbose query for retry.""" noise = ['best', 'top', 'how to', 'tips for', 'practices', 'features', 'killer', 'guide', 'tutorial', 'recommendations', 'advice', 'prompting', 'using', 'for', 'with', 'the', 'of', 'in', 'on'] words = topic.lower().split() result = [w for w in words if w not in noise] return ' '.join(result[:3]) or topic # Keep max 3 words def search_reddit( api_key: str, model: str, topic: str, from_date: str, to_date: str, depth: str = "default", mock_response: Optional[Dict] = None, _retry: bool = False, ) -> Dict[str, Any]: """Search Reddit for relevant threads using OpenAI Responses API. Args: api_key: OpenAI API key model: Model to use topic: Search topic from_date: Start date (YYYY-MM-DD) - only include threads after this to_date: End date (YYYY-MM-DD) - only include threads before this depth: Research depth - "quick", "default", or "deep" mock_response: Mock response for testing Returns: Raw API response """ if mock_response is not None: return mock_response min_items, max_items = DEPTH_CONFIG.get(depth, DEPTH_CONFIG["default"]) headers = { "Authorization": f"Bearer {api_key}", "Content-Type": "application/json", } # Adjust timeout based on depth (generous for OpenAI web_search which can be slow) timeout = 90 if depth == "quick" else 120 if depth == "default" else 180 # Note: allowed_domains accepts base domain, not subdomains # We rely on prompt to filter out developers.reddit.com, etc. payload = { "model": model, "tools": [ { "type": "web_search", "filters": { "allowed_domains": ["reddit.com"] } } ], "include": ["web_search_call.action.sources"], "input": REDDIT_SEARCH_PROMPT.format( topic=topic, from_date=from_date, to_date=to_date, min_items=min_items, max_items=max_items, ), } return http.post(OPENAI_RESPONSES_URL, payload, headers=headers, timeout=timeout) def parse_reddit_response(response: Dict[str, Any]) -> List[Dict[str, Any]]: """Parse OpenAI response to extract Reddit items. Args: response: Raw API response Returns: List of item dicts """ items = [] # Check for API errors first if "error" in response and response["error"]: error = response["error"] err_msg = error.get("message", str(error)) if isinstance(error, dict) else str(error) _log_error(f"OpenAI API error: {err_msg}") if http.DEBUG: _log_error(f"Full error response: {json.dumps(response, indent=2)[:1000]}") return items # Try to find the output text output_text = "" if "output" in response: output = response["output"] if isinstance(output, str): output_text = output elif isinstance(output, list): for item in output: if isinstance(item, dict): if item.get("type") == "message": content = item.get("content", []) for c in content: if isinstance(c, dict) and c.get("type") == "output_text": output_text = c.get("text", "") break elif "text" in item: output_text = item["text"] elif isinstance(item, str): output_text = item if output_text: break # Also check for choices (older format) if not output_text and "choices" in response: for choice in response["choices"]: if "message" in choice: output_text = choice["message"].get("content", "") break if not output_text: print(f"[REDDIT WARNING] No output text found in OpenAI response. Keys present: {list(response.keys())}", flush=True) return items # Extract JSON from the response json_match = re.search(r'\{[\s\S]*"items"[\s\S]*\}', output_text) if json_match: try: data = json.loads(json_match.group()) items = data.get("items", []) except json.JSONDecodeError: pass # Validate and clean items clean_items = [] for i, item in enumerate(items): if not isinstance(item, dict): continue url = item.get("url", "") if not url or "reddit.com" not in url: continue clean_item = { "id": f"R{i+1}", "title": str(item.get("title", "")).strip(), "url": url, "subreddit": str(item.get("subreddit", "")).strip().lstrip("r/"), "date": item.get("date"), "why_relevant": str(item.get("why_relevant", "")).strip(), "relevance": min(1.0, max(0.0, float(item.get("relevance", 0.5)))), } # Validate date format if clean_item["date"]: if not re.match(r'^\d{4}-\d{2}-\d{2}$', str(clean_item["date"])): clean_item["date"] = None clean_items.append(clean_item) return clean_items