#!/usr/bin/env python3
"""
Firefrost Gaming - Staggered Server Restart System
Automated daily/weekly restart system for Minecraft servers via Pterodactyl API

Author: Michael "Frostystyle" Krause & Claude "The Chronicler"
Version: 1.0.0
Date: 2026-02-17
"""

import json
import logging
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

try:
    import requests
except ImportError:
    # main() reports the missing dependency and exits with status 1.
    # Importing this module must not terminate the interpreter.
    requests = None

DEFAULT_CONFIG_PATH = '/opt/automation/restart-config.json'
DEFAULT_LOG_FILE = '/var/log/staggered-restart.log'
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'

# Discord embed colors (decimal RGB values).
COLOR_GREEN = 65280
COLOR_RED = 16711680
COLOR_YELLOW = 16776960
COLOR_BLUE = 3447003

logger = logging.getLogger(__name__)


def configure_logging(log_file=DEFAULT_LOG_FILE):
    """Attach file and console handlers to the root logger.

    Called from main() rather than at import time so that importing the
    module has no side effects. If *log_file* cannot be opened (for example
    when not running as root), logging falls back to the console only
    instead of crashing before the restart cycle starts.

    Args:
        log_file: Path of the log file to append to.
    """
    handlers = []
    try:
        handlers.append(logging.FileHandler(log_file))
    except OSError as exc:
        print(
            f"WARNING: cannot write to {log_file} ({exc}); logging to console only",
            file=sys.stderr,
        )
    handlers.append(logging.StreamHandler())
    logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, handlers=handlers)


class StaggeredRestartSystem:
    """Restart the configured servers one at a time through the Pterodactyl client API.

    Attributes set by __init__:
        config: The parsed JSON configuration.
        ptero_url / ptero_key: Panel base URL (no trailing slash) and API key.
        discord_webhook / discord_enabled: Discord notification settings.
        settings: The 'restart_settings' section of the configuration.
        servers: Server entries sorted by ascending 'priority'.
        results: Names of servers grouped under 'successful', 'failed', 'skipped'.
    """

    def __init__(self, config_path=DEFAULT_CONFIG_PATH):
        """Initialize the restart system with configuration.

        Args:
            config_path: Path to the JSON configuration file.

        Raises:
            KeyError: If a required configuration section or key is missing.
            SystemExit: If the file is missing or is not valid JSON
                (see load_config).
        """
        self.config = self.load_config(config_path)
        # Strip a trailing slash so endpoint URLs never contain '//'.
        self.ptero_url = self.config['pterodactyl']['url'].rstrip('/')
        self.ptero_key = self.config['pterodactyl']['api_key']
        self.discord_webhook = self.config['discord']['webhook_url']
        self.discord_enabled = self.config['discord']['notifications_enabled']
        self.settings = self.config['restart_settings']
        self.servers = sorted(self.config['servers'], key=lambda x: x['priority'])

        self.results = {
            'successful': [],
            'failed': [],
            'skipped': []
        }

        if self.settings.get('skip_if_players'):
            logger.warning(
                "skip_if_players is enabled but player counting is not "
                "implemented (get_player_count always reports 0 players)"
            )

    def load_config(self, path):
        """Load configuration from a JSON file.

        Args:
            path: Path to the configuration file.

        Returns:
            The parsed JSON document.

        Raises:
            SystemExit: With status 1 when the file does not exist or does
                not contain valid JSON (the error is logged first).
        """
        try:
            return json.loads(Path(path).read_text(encoding='utf-8'))
        except FileNotFoundError:
            logger.error(f"Config file not found: {path}")
            sys.exit(1)
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in config file: {e}")
            sys.exit(1)

    def api_request(self, endpoint, method='GET', data=None):
        """Make a request to the Pterodactyl client API.

        Args:
            endpoint: Path below '/api/client/', e.g. 'servers/<uuid>/power'.
            method: 'GET' or 'POST' (case-insensitive).
            data: JSON-serializable body; only sent with POST.

        Returns:
            The decoded JSON response, {} for an empty response body, or
            None when the request failed (the failure is logged).

        Raises:
            ValueError: If *method* is neither GET nor POST.
        """
        verb = method.upper()
        if verb not in ('GET', 'POST'):
            # Previously an unsupported method fell through both branches and
            # crashed with UnboundLocalError on the undefined response.
            raise ValueError(f"Unsupported HTTP method: {method!r}")

        url = f"{self.ptero_url}/api/client/{endpoint}"
        headers = {
            'Authorization': f'Bearer {self.ptero_key}',
            'Accept': 'application/vnd.pterodactyl.v1+json',
            'Content-Type': 'application/json'
        }

        try:
            response = requests.request(
                verb,
                url,
                headers=headers,
                json=data if verb == 'POST' else None,
                timeout=30,
            )
            response.raise_for_status()
            return response.json() if response.text else {}
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            logger.error(f"API request failed: {e}")
            return None

    def get_server_status(self, uuid):
        """Get the server's current power state from Pterodactyl.

        Returns:
            The 'current_state' string from the resources endpoint
            (wait_for_online treats 'running' as online), or None when the
            request failed or the response carries no state.
        """
        data = self.api_request(f"servers/{uuid}/resources")
        if data and 'attributes' in data:
            return data['attributes'].get('current_state')
        return None

    def get_player_count(self, uuid):
        """Return the number of players online (placeholder: always 0).

        The script has no player-count source yet, so this always reports 0
        until something like an RCON query is added. The previous
        implementation issued a resources API request and discarded the
        result; that request has been dropped.
        """
        return 0

    def send_console_command(self, uuid, command):
        """Send a command to the server console.

        Returns:
            True when the API accepted the command, False otherwise.
        """
        data = {'command': command}
        result = self.api_request(f"servers/{uuid}/command", method='POST', data=data)
        return result is not None

    @staticmethod
    def _warning_message(seconds):
        """Build the in-game 'say' command announcing a restart in *seconds*."""
        if seconds >= 60:
            minutes = seconds // 60
            return f"say Server restart in {minutes} minute{'s' if minutes != 1 else ''}! Save your work!"
        return f"say Server restart in {seconds} seconds! Save NOW!"

    @staticmethod
    def _warning_schedule(intervals):
        """Turn warning intervals into (seconds_remaining, seconds_to_wait) pairs.

        Intervals are de-duplicated and ordered from longest to shortest.
        The wait after each warning is the gap to the next one, and the last
        warning waits out its own remaining time, so the restart happens when
        the announced countdown reaches zero.
        """
        ordered = sorted({int(s) for s in intervals if int(s) > 0}, reverse=True)
        return [
            (remaining, remaining - following)
            for remaining, following in zip(ordered, ordered[1:] + [0])
        ]

    def send_warning(self, uuid, server_name, seconds):
        """Send a restart warning to the server.

        Args:
            uuid: Server identifier used in the API path.
            server_name: Display name used in log messages.
            seconds: Time until restart that the message announces.

        Returns:
            True when the console command was accepted, False otherwise.
        """
        logger.info(f"{server_name}: Sending {seconds}s warning")
        return self.send_console_command(uuid, self._warning_message(seconds))

    def restart_server(self, uuid, server_name):
        """Send the restart power signal for a server via the Pterodactyl API.

        Returns:
            True when the API accepted the signal, False otherwise.
        """
        logger.info(f"{server_name}: Initiating restart")

        # Send restart signal
        data = {'signal': 'restart'}
        result = self.api_request(f"servers/{uuid}/power", method='POST', data=data)

        if result is None:
            logger.error(f"{server_name}: Restart API call failed")
            return False

        logger.info(f"{server_name}: Restart signal sent")
        return True

    def wait_for_online(self, uuid, server_name, timeout=300, poll_interval=10, settle_grace=30):
        """Wait for a server to come back online after a restart signal.

        A 'running' state is only accepted once the server has been seen in
        a non-running state, or once *settle_grace* seconds have passed.
        Without this, the first poll can see the old process still running
        and report the server online before it has even stopped.

        Args:
            uuid: Server identifier used in the API path.
            server_name: Display name used in log messages.
            timeout: Maximum number of seconds to wait.
            poll_interval: Seconds between status checks.
            settle_grace: Seconds after which 'running' is accepted even if
                the stop/start transition was never observed.

        Returns:
            True when the server is running again, False on timeout.
        """
        logger.info(f"{server_name}: Waiting for server to come online")
        start_time = time.monotonic()
        seen_down = False

        while True:
            elapsed = time.monotonic() - start_time
            if elapsed >= timeout:
                break

            status = self.get_server_status(uuid)

            if status == 'running':
                if seen_down or elapsed >= settle_grace:
                    duration = int(time.monotonic() - start_time)
                    logger.info(f"{server_name}: Server online after {duration} seconds")
                    return True
            elif status is not None:
                # None means the status request failed; that is not evidence
                # that the server went down.
                seen_down = True

            time.sleep(max(0, min(poll_interval, timeout - elapsed)))

        logger.error(f"{server_name}: Server failed to come online within {timeout} seconds")
        return False

    def discord_notify(self, message, color=None):
        """Send a notification embed to the Discord webhook.

        Best effort: delivery failures are logged and never raised. Does
        nothing when notifications are disabled or no webhook is configured.

        Args:
            message: Embed description text.
            color: Optional decimal color code for the embed.
        """
        if not self.discord_enabled or not self.discord_webhook:
            return

        embed = {
            'description': message,
            # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated.
            'timestamp': datetime.now(timezone.utc).isoformat()
        }

        if color is not None:  # 0 (black) is a valid color code
            embed['color'] = color

        payload = {
            'embeds': [embed]
        }

        try:
            response = requests.post(self.discord_webhook, json=payload, timeout=10)
            # Surface HTTP-level rejections, which were previously ignored.
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.error(f"Discord notification failed: {e}")

    def _run_warning_countdown(self, uuid, name):
        """Send the configured warnings and wait out the announced countdown.

        The waits start only once a warning has actually been delivered. A
        server that accepts no console commands is therefore restarted
        without delay, while a promise already made to players is honored
        even if a later warning fails to send.
        """
        delivered_any = False
        for remaining, pause in self._warning_schedule(self.settings['warning_intervals']):
            if self.send_warning(uuid, name, remaining):
                delivered_any = True
            if delivered_any:
                time.sleep(pause)
            else:
                logger.warning(f"{name}: {remaining}s warning not delivered; not waiting")

    def _restart_with_retries(self, uuid, name):
        """Send the restart signal, retrying per 'max_retries' / 'retry_delay'.

        Both settings are optional; without them a single attempt is made.
        """
        attempts = max(1, int(self.settings.get('max_retries', 1)))
        retry_delay = self.settings.get('retry_delay', 0)

        for attempt in range(1, attempts + 1):
            if self.restart_server(uuid, name):
                return True
            if attempt < attempts:
                logger.warning(
                    f"{name}: Restart attempt {attempt}/{attempts} failed; "
                    f"retrying in {retry_delay} seconds"
                )
                time.sleep(retry_delay)
        return False

    def restart_with_warnings(self, server):
        """Restart one server with player warnings and record the outcome.

        Args:
            server: Config entry with at least 'name' and 'uuid'.

        Returns:
            True when the server restarted and came back online; False when
            it was skipped or the restart failed. The server name is added
            to the matching list in self.results.
        """
        name = server['name']
        uuid = server['uuid']

        logger.info(f"=== Starting restart sequence for {name} ===")

        # Check if we should skip based on player count
        if self.settings['skip_if_players']:
            player_count = self.get_player_count(uuid)
            if player_count >= self.settings['player_threshold']:
                logger.info(f"{name}: Skipped - {player_count} players online")
                self.results['skipped'].append(name)
                self.discord_notify(f"⏭️ **{name}** - Skipped (players online)", color=COLOR_YELLOW)
                return False

        # Announce the restart and wait until the announced time has elapsed
        self._run_warning_countdown(uuid, name)

        # Perform restart
        if not self._restart_with_retries(uuid, name):
            logger.error(f"{name}: Restart failed")
            self.results['failed'].append(name)
            self.discord_notify(f"❌ **{name}** - Restart failed", color=COLOR_RED)
            return False

        # Wait for server to come back online
        if not self.wait_for_online(uuid, name):
            logger.error(f"{name}: Failed to come online")
            self.results['failed'].append(name)
            self.discord_notify(f"❌ **{name}** - Failed to come online", color=COLOR_RED)
            return False

        # Success
        self.results['successful'].append(name)
        self.discord_notify(f"✅ **{name}** - Restarted successfully", color=COLOR_GREEN)
        return True

    def run(self):
        """Run the full restart cycle over all configured servers.

        Servers are processed in priority order with the configured delay
        between them. A failure on one server, including an unexpected
        exception, is recorded and the cycle continues with the next one.
        """
        total = len(self.servers)
        logger.info("=" * 60)
        logger.info("STAGGERED RESTART SYSTEM STARTED")
        logger.info(f"Servers to restart: {total}")
        logger.info("=" * 60)

        # Send start notification
        start_time = datetime.now()
        delay = self.settings['delay_between_restarts']
        # Per server: warning countdown + ~3 minutes to boot + spacing delay.
        warning_window = max(self.settings.get('warning_intervals') or [0])
        estimated_duration = total * (delay + warning_window + 180) // 60
        self.discord_notify(
            f"🔄 **Staggered Server Restart Started**\n"
            f"Servers: {total}\n"
            f"Estimated duration: ~{estimated_duration} minutes",
            color=COLOR_BLUE
        )

        # Restart each server
        for i, server in enumerate(self.servers, 1):
            name = server['name']
            logger.info(f"\n[{i}/{total}] Processing: {name}")

            try:
                self.restart_with_warnings(server)
            except Exception:
                # Per-server boundary: one broken entry must not abort the
                # restarts of the remaining servers.
                logger.exception(f"{name}: Unexpected error during restart sequence")
                if name not in self.results['failed']:
                    self.results['failed'].append(name)
                self.discord_notify(f"❌ **{name}** - Restart failed", color=COLOR_RED)

            # Wait before next server (except last one)
            if i < total:
                logger.info(f"Waiting {delay} seconds before next server...")
                time.sleep(delay)

        # Summary
        failed = self.results['failed']
        duration = (datetime.now() - start_time).total_seconds() / 60
        logger.info("\n" + "=" * 60)
        logger.info("RESTART CYCLE COMPLETE")
        logger.info(f"Successful: {len(self.results['successful'])}")
        logger.info(f"Failed: {len(failed)}")
        logger.info(f"Skipped: {len(self.results['skipped'])}")
        logger.info(f"Duration: {duration:.1f} minutes")
        logger.info("=" * 60)

        # Send completion notification
        status_emoji = "✅" if not failed else "⚠️"
        summary = (
            f"{status_emoji} **Restart Cycle Complete**\n"
            f"Successful: {len(self.results['successful'])}\n"
            f"Failed: {len(failed)}\n"
            f"Skipped: {len(self.results['skipped'])}\n"
            f"Duration: {duration:.1f} minutes"
        )

        if failed:
            summary += "\n\n❌ **Failed Servers:**\n" + "\n".join(f"- {s}" for s in failed)

        color = COLOR_GREEN if not failed else COLOR_YELLOW
        self.discord_notify(summary, color=color)


def main():
    """Script entry point: check dependencies, set up logging, run one cycle."""
    if requests is None:
        print("ERROR: requests module not installed. Run: pip3 install requests --break-system-packages")
        sys.exit(1)

    configure_logging()

    try:
        restart_system = StaggeredRestartSystem()
        restart_system.run()
    except KeyboardInterrupt:
        logger.info("\nRestart cycle interrupted by user")
        sys.exit(0)
    except Exception as e:
        logger.error(f"Unexpected error: {e}", exc_info=True)
        sys.exit(1)


if __name__ == '__main__':
    main()
+Automated staggered restart system for all 11 Minecraft servers. Prevents simultaneous restarts, maintains continuous player access, and reduces server load by restarting servers one at a time with configurable spacing. + +**The Problem:** +- Manual server restarts are time-consuming +- Simultaneous restarts = all servers down at once +- Memory leaks require regular restarts (especially ATM10) +- Players need warning before restarts + +**The Solution:** +- Automated Python script using Pterodactyl API +- Staggered restarts (one server at a time) +- Configurable spacing between restarts (default: 5 minutes) +- Player warnings before restart +- Skip servers with active players (optional) +- Discord notifications +- Scheduled via cron + +--- + +## Architecture + +``` +Command Center (Cron Job) + ↓ +Python Script + ↓ +Pterodactyl API + ↓ +Game Servers (restart one at a time) + ↓ +Discord Webhook (notifications) +``` + +**Flow:** +1. Cron triggers script at scheduled time (e.g., 4 AM daily) +2. Script loads the list of Minecraft servers (name, UUID, priority) from `restart-config.json` and sorts it by priority +3. For each server: + - Check if players online (optional skip logic) + - Send in-game warning (5 min, 3 min, 1 min, 30 sec) + - Execute restart via API + - Wait for server to come back online + - Wait configured delay (5 min) before next server +4. 
Post summary to Discord when complete + +--- + +## Features + +### Core Features + +**✅ Staggered Restarts** +- One server at a time +- Configurable delay between restarts (default: 5 min) +- Prevents infrastructure overload +- Maintains player access to other servers + +**✅ Player Warnings** +- 5 minute warning +- 3 minute warning +- 1 minute warning +- 30 second warning +- Messages sent via Pterodactyl console commands + +**✅ Active Player Detection (Optional)** +- Check player count before restart +- Skip servers with active players +- Retry later or on next cycle +- Configurable threshold (skip if > X players) + +**✅ Discord Notifications** +- Start notification (restart cycle beginning) +- Per-server notifications (Server X restarted) +- Completion notification (all servers done) +- Error notifications (if restart fails) + +**✅ Error Handling** +- Retry failed restarts (3 attempts) +- Continue to next server if one fails +- Log all errors +- Discord alert on failures + +--- + +## Server List + +**11 Minecraft Servers (order matters for restart sequence):** + +**TX1 Dallas (5 servers):** +1. Vanilla 1.21.11 - `3bed1bda-f648-4630-801a-fe9f2e3d3f27` (lowest priority - simple) +2. All The Mons - `668a5220-7e72-4379-9165-bdbb84bc9806` +3. Stoneblock 4 - `a0efbfe8-4b97-4a90-869d-ffe6d3072bd5` +4. Society: Sunlit Valley - `9310d0a6-62a6-4fe6-82c4-eb483dc68876` +5. Reclamation - `1eb33479-a6bc-4e8f-b64d-d1e4bfa0a8b4` + +**NC1 Charlotte (6 servers):** +6. Homestead - `2f85d4ef-aa49-4dd6-b448-beb3fca1db12` +7. EMC Subterra Tech - `09a95f38-9f8c-404a-9557-3a7c44258223` +8. The Ember Project - `124f9060-58a7-457a-b2cf-b4024fce2951` +9. Minecolonies: Create and Conquer - `a14201d2-83b2-44e6-ae48-e6c4cbc56f24` +10. 
All The Mods 10 - `82e63949-8fbf-4a44-b32a-53324e8492bf` (highest priority - memory leak) + +**Note:** FoundryVTT and Hytale excluded (not Minecraft) + +**Note (review):** Only 10 servers are listed here and in `restart-config.json` (5 under NC1 Charlotte, not 6), while this plan refers to 11 servers throughout - confirm whether an NC1 server is missing from the list or the count should be 10. + +**Restart Order Logic:** +- Start with simple/low-population servers +- End with complex/high-population servers +- ATM10 last (most likely to have players, most in need of a restart) + +--- + +## Implementation + +### Script Location + +**File:** `/opt/automation/staggered-restart.py` +**Config:** `/opt/automation/restart-config.json` +**Logs:** `/var/log/staggered-restart.log` + +### Configuration File (restart-config.json) + +```json +{ + "pterodactyl": { + "url": "https://panel.firefrostgaming.com", + "api_key": "PTERODACTYL_API_KEY_HERE" + }, + "discord": { + "webhook_url": "DISCORD_WEBHOOK_URL_HERE", + "notifications_enabled": true + }, + "restart_settings": { + "delay_between_restarts": 300, + "warning_intervals": [300, 180, 60, 30], + "skip_if_players": false, + "player_threshold": 5, + "max_retries": 3, + "retry_delay": 60 + }, + "servers": [ + { + "name": "Vanilla 1.21.11", + "uuid": "3bed1bda-f648-4630-801a-fe9f2e3d3f27", + "priority": 1 + }, + { + "name": "All The Mons", + "uuid": "668a5220-7e72-4379-9165-bdbb84bc9806", + "priority": 2 + }, + { + "name": "Stoneblock 4", + "uuid": "a0efbfe8-4b97-4a90-869d-ffe6d3072bd5", + "priority": 3 + }, + { + "name": "Society: Sunlit Valley", + "uuid": "9310d0a6-62a6-4fe6-82c4-eb483dc68876", + "priority": 4 + }, + { + "name": "Reclamation", + "uuid": "1eb33479-a6bc-4e8f-b64d-d1e4bfa0a8b4", + "priority": 5 + }, + { + "name": "Homestead", + "uuid": "2f85d4ef-aa49-4dd6-b448-beb3fca1db12", + "priority": 6 + }, + { + "name": "EMC Subterra Tech", + "uuid": "09a95f38-9f8c-404a-9557-3a7c44258223", + "priority": 7 + }, + { + "name": "The Ember Project", + "uuid": "124f9060-58a7-457a-b2cf-b4024fce2951", + "priority": 8 + }, + { + "name": "Minecolonies: Create and Conquer", + "uuid": "a14201d2-83b2-44e6-ae48-e6c4cbc56f24", + "priority": 9 + }, + { + "name": "All The Mods 10", + 
"uuid": "82e63949-8fbf-4a44-b32a-53324e8492bf", + "priority": 10 + } + ] +} +``` + +### Main Script (staggered-restart.py) + +**See `deployments/staggered-restart/staggered-restart.py` for the complete Python script.** + +**Key functions (methods of `StaggeredRestartSystem`):** +- `load_config(path)` - Load JSON configuration +- `get_server_status(uuid)` - Check if server is online +- `get_player_count(uuid)` - Count online players (currently a placeholder that always returns 0) +- `send_warning(uuid, server_name, seconds)` - Send in-game restart warning +- `restart_server(uuid, server_name)` - Trigger restart via API +- `wait_for_online(uuid, server_name, timeout=300)` - Poll until server is back +- `discord_notify(message, color=None)` - Send to Discord +- `run()` - Orchestrate the restart cycle + +--- + +## Deployment Steps + +### Phase 1: Prerequisites (10 min) + +- [ ] Pterodactyl API key with server control permissions +- [ ] Discord webhook URL for notifications channel +- [ ] Command Center SSH access +- [ ] Python 3.9+ installed on Command Center +- [ ] Required Python package: `requests` (`json`, `time`, and `logging` are part of the Python standard library) + +### Phase 2: Install Script (15 min) + +```bash +# On Command Center +mkdir -p /opt/automation +cd /opt/automation + +# Create config file +nano restart-config.json +# Paste config, update API key and webhook URL + +# Create Python script +nano staggered-restart.py +# Paste script contents + +# Make executable +chmod +x staggered-restart.py + +# Install dependencies +pip3 install requests --break-system-packages + +# Create log directory +mkdir -p /var/log +touch /var/log/staggered-restart.log +chmod 644 /var/log/staggered-restart.log +``` + +### Phase 3: Test Run (30 min) + +```bash +# Test with ONE server first (Vanilla - least impactful) +# Edit config to include only Vanilla server +python3 /opt/automation/staggered-restart.py + +# Watch logs +tail -f /var/log/staggered-restart.log + +# Verify: +# - Warning messages sent to server +# - Server restarted successfully +# - Discord notification received +# - Logs written properly +``` + +### Phase 4: Schedule with Cron (15 min) + +```bash +# Edit crontab +crontab 
-e + +# The script already writes its own log to /var/log/staggered-restart.log and also echoes it to the console, +# so capture cron's console output in a separate file to avoid duplicating every log line +# Add daily restart at 4 AM CST +0 4 * * * /usr/bin/python3 /opt/automation/staggered-restart.py >> /var/log/staggered-restart-cron.log 2>&1 + +# Or weekly on Sundays at 4 AM +0 4 * * 0 /usr/bin/python3 /opt/automation/staggered-restart.py >> /var/log/staggered-restart-cron.log 2>&1 +``` + +**Restart Schedule Options:** +- **Daily:** `0 4 * * *` (4 AM every day) +- **Weekly:** `0 4 * * 0` (4 AM every Sunday) +- **Twice Weekly:** `0 4 * * 0,3` (4 AM Sunday and Wednesday) +- **Monthly:** `0 4 1 * *` (4 AM first day of month) + +**Choose based on:** +- ATM10 memory leak severity (daily if bad) +- Player activity patterns +- Server performance needs + +### Phase 5: Monitor & Adjust (Ongoing) + +- [ ] Monitor first 3 restart cycles +- [ ] Adjust timing if players complain +- [ ] Fine-tune delay between restarts +- [ ] Adjust skip logic if needed +- [ ] Add/remove servers as needed + +--- + +## Discord Integration + +### Webhook Setup + +1. In Discord, go to server settings +2. Integrations → Webhooks +3. Create new webhook for #server-status channel +4. Copy webhook URL +5. 
Add to restart-config.json + +### Notification Examples + +**Start:** +``` +🔄 **Staggered Server Restart Started** +Time: 4:00 AM CST +Servers: 11 total +Estimated completion: ~60 minutes +``` + +**Per-Server:** +``` +✅ **Reclamation** restarted successfully +Online: Yes | Players: 0 | Duration: 45 seconds +``` + +**Completion:** +``` +✅ **All Servers Restarted** +Total: 11 servers +Successful: 11 +Failed: 0 +Duration: 57 minutes +``` + +**Error:** +``` +❌ **Restart Failed: ATM10** +Reason: API timeout +Attempts: 3/3 +Action: Manual restart required +``` + +--- + +## Advanced Features (Phase 2) + +**Future enhancements:** + +**Smart Scheduling:** +- Different schedules per server +- Priority-based restart order +- Peak hour avoidance + +**Enhanced Player Detection:** +- Check for admin override (restart even with players) +- Gentle kick players with warning +- Transfer players to similar server before restart + +**Performance Monitoring:** +- Track restart duration per server +- Memory usage before/after restart +- TPS monitoring integration + +**Advanced Notifications:** +- Per-server Discord channels +- SMS/email alerts on failures +- Grafana dashboard integration + +--- + +## Troubleshooting + +### Script doesn't run via cron + +**Check:** +```bash +# Verify cron is running +systemctl status cron + +# Check cron logs +grep CRON /var/log/syslog + +# Test script manually +/usr/bin/python3 /opt/automation/staggered-restart.py +``` + +### API key doesn't work + +**Verify:** +```bash +# Test API key +curl -H "Authorization: Bearer YOUR_API_KEY" \ + https://panel.firefrostgaming.com/api/client +``` + +### Server won't restart + +**Check:** +- Server status in Pterodactyl panel +- API permissions (needs server control) +- Server resource limits +- Pterodactyl Wings status on node + +### Warnings not sending + +**Verify:** +- Server console access in Pterodactyl +- Console command format correct +- Server is actually online when warnings sent + +--- + +## Maintenance + 
+### Daily + +- Check Discord for restart notifications +- Verify all servers came back online + +### Weekly + +- Review logs for errors +- Check restart duration trends +- Adjust schedule if needed + +### Monthly + +- Review player feedback +- Optimize restart order +- Update server list (add/remove servers) +- Test manual run to verify functionality + +--- + +## Related Tasks + +- **World Backup Automation** - Run backups before restarts +- **Netdata Deployment** - Monitor restart impact on resources +- **Discord Reorganization** - #server-status channel for notifications + +--- + +**Fire + Frost + Foundation = Where Love Builds Legacy** 💙🔥❄️ + +--- + +**Document Status:** COMPLETE +**Ready for Implementation:** When SSH access available (2 hours) +**Dependencies:** Pterodactyl API key, Discord webhook, Command Center access +**Owner:** The Wizard (Michael) + The Chronicler (automation)