antigravity-skills-reference/skills/loki-mode/tests/test-state-recovery.sh

#!/bin/bash
# Test: State Recovery and Checkpoint Functionality
# Tests checkpoint creation, recovery, and rate limit handling

set -uo pipefail
# Note: -e is deliberately left off so that one failing check does not abort
# the run; every test reports its own result and the totals are summarised at
# the end of the script.

# Scratch directory that holds the fake .loki tree for this run. Because -e is
# off, a failed mktemp would otherwise go unnoticed and leave TEST_DIR empty,
# so stop here with a clear message instead.
TEST_DIR=$(mktemp -d) || {
    echo "ERROR: could not create temporary test directory" >&2
    exit 1
}

# Running totals of passed / failed checks.
PASSED=0
FAILED=0

# ANSI colour codes used for the status tags. The values keep a literal
# backslash; they only become real escape sequences when printed with
# `echo -e`.
NC='\033[0m'         # reset to the terminal default
YELLOW='\033[1;33m'  # bold yellow
GREEN='\033[0;32m'   # green
RED='\033[0;31m'     # red

# Report a passing check and bump the PASSED counter.
# Globals:   GREEN, NC (read), PASSED (incremented)
# Arguments: $1 - message printed after the [PASS] tag
# Returns:   0. The counter is bumped with a plain assignment rather than
#            ((PASSED++)): the arithmetic command returns status 1 whenever
#            the expression evaluates to 0, which made the very first call
#            look like a failure to callers (and would abort under set -e).
log_pass() {
    echo -e "${GREEN}[PASS]${NC} $1"
    PASSED=$((PASSED + 1))
}
# Report a failing check and bump the FAILED counter.
# Globals:   RED, NC (read), FAILED (incremented)
# Arguments: $1 - message printed after the [FAIL] tag
# Returns:   0. The counter is bumped with a plain assignment rather than
#            ((FAILED++)): the arithmetic command returns status 1 whenever
#            the expression evaluates to 0, which made the very first call
#            return a non-zero status (and would abort under set -e).
log_fail() {
    echo -e "${RED}[FAIL]${NC} $1"
    FAILED=$((FAILED + 1))
}
# Announce the test that is about to run.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - test description printed after the [TEST] tag
log_test() {
    local description=$1
    echo -e "${YELLOW}[TEST]${NC} ${description}"
}

# Delete the scratch directory together with everything written into it.
# Globals: TEST_DIR (read)
cleanup() {
    local scratch_dir=$TEST_DIR
    rm -rf "$scratch_dir"
}
# Run the cleanup whenever the script exits.
trap cleanup EXIT

# Work inside the scratch directory. Every fixture path below is relative, so
# if the cd fails the script must stop rather than create and overwrite .loki/
# files in whatever directory it was launched from.
cd "$TEST_DIR" || {
    echo "ERROR: cannot enter test directory: $TEST_DIR" >&2
    exit 1
}

# Banner printed once before the first test, followed by a blank line.
printf '%s\n' \
    "========================================" \
    "Loki Mode State Recovery Tests" \
    "========================================" \
    ""

# Build the fake .loki working tree that the tests operate on.
mkdir -p \
    .loki/state/agents \
    .loki/state/checkpoints \
    .loki/queue \
    .loki/artifacts/backups

# Orchestrator state: one active agent, some metrics, no checkpoint taken yet.
cat << 'JSON' > .loki/state/orchestrator.json
{
  "version": "2.1.0",
  "startupId": "test-session-001",
  "phase": "development",
  "agents": {"active":["eng-backend-01"],"idle":[],"failed":[],"totalSpawned":5},
  "metrics": {"tasksCompleted":10,"tasksFailed":2,"deployments":0},
  "circuitBreakers": {},
  "lastCheckpoint": "",
  "currentRelease": "0.1.0"
}
JSON

# State file of the single agent; its heartbeat is a fixed timestamp.
cat << 'JSON' > .loki/state/agents/eng-backend-01.json
{
  "id": "eng-backend-01",
  "status": "active",
  "currentTask": "task-042",
  "tasksCompleted": 8,
  "lastHeartbeat": "2025-01-15T10:30:00Z"
}
JSON

# Queues: one task waiting, one task claimed by the agent above.
cat << 'JSON' > .loki/queue/pending.json
{"tasks":[{"id":"task-043","type":"eng-frontend","priority":5}]}
JSON
cat << 'JSON' > .loki/queue/in-progress.json
{"tasks":[{"id":"task-042","type":"eng-backend","claimedBy":"eng-backend-01"}]}
JSON

# Test 1: Create checkpoint
# Snapshot the orchestrator state, the per-agent state files and the queue
# into a timestamped directory under .loki/state/checkpoints/.
log_test "Create checkpoint"
CHECKPOINT_DIR=".loki/state/checkpoints/$(date +%Y%m%d-%H%M%S)"

# The steps are chained so that a failed mkdir/cp is reported instead of being
# silently ignored, and all three parts of the snapshot are verified (the
# queue copy was previously never checked).
if mkdir -p "$CHECKPOINT_DIR" \
    && cp .loki/state/orchestrator.json "$CHECKPOINT_DIR/" \
    && cp -r .loki/state/agents "$CHECKPOINT_DIR/" \
    && cp -r .loki/queue "$CHECKPOINT_DIR/" \
    && [ -f "$CHECKPOINT_DIR/orchestrator.json" ] \
    && [ -d "$CHECKPOINT_DIR/agents" ] \
    && [ -d "$CHECKPOINT_DIR/queue" ]; then
    log_pass "Checkpoint created at $CHECKPOINT_DIR"
else
    log_fail "Checkpoint creation failed"
fi

# Test 2: Update lastCheckpoint in state
log_test "Update lastCheckpoint timestamp"
# The heredoc delimiter is quoted so the shell hands the Python source over
# verbatim instead of expanding any $... or backticks inside it.
python3 << 'EOF'
import json
from datetime import datetime, timezone

with open('.loki/state/orchestrator.json', 'r') as f:
    state = json.load(f)

# datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC time and
# drop the tzinfo so the stored value keeps the "<naive ISO timestamp>Z" form.
state['lastCheckpoint'] = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + 'Z'

with open('.loki/state/orchestrator.json', 'w') as f:
    json.dump(state, f, indent=2)

print("UPDATED")
EOF

# Read the value back in a separate process. If that process fails, the
# substitution is empty and the comparison below reports a failure.
has_checkpoint=$(python3 -c "
import json
data = json.load(open('.loki/state/orchestrator.json'))
print('yes' if data.get('lastCheckpoint') else 'no')
")

if [ "$has_checkpoint" = "yes" ]; then
    log_pass "lastCheckpoint timestamp updated"
else
    log_fail "lastCheckpoint not set"
fi

# Test 3: Simulate crash and corrupt state
log_test "Detect corrupted state"
echo "corrupted{json" > .loki/state/orchestrator.json.corrupted

# The result follows the exit status of the Python check: a failed assertion
# (or any other error) makes python3 exit non-zero and is reported as a
# failure instead of being counted as a pass.
if python3 << 'EOF'
import json

def is_valid_state(filepath):
    """Return True only if filepath contains a JSON object with a 'version' key."""
    try:
        with open(filepath, 'r') as f:
            data = json.load(f)
        return isinstance(data, dict) and 'version' in data
    except json.JSONDecodeError:
        return False

is_valid = is_valid_state('.loki/state/orchestrator.json.corrupted')
print("CORRUPTED" if not is_valid else "VALID")
assert not is_valid, "Should detect corrupted state"

# Positive control: the validator must not reject everything.
assert is_valid_state('.loki/state/orchestrator.json'), "Should accept the intact state"
EOF
then
    log_pass "Corrupted state detected"
else
    log_fail "Corrupted state detection failed"
fi

# Test 4: Restore from checkpoint
log_test "Restore from checkpoint"

# Break the live state file first. Restoring over a file that is still valid
# would let the version check below succeed even if nothing was restored.
echo "corrupted{json" > .loki/state/orchestrator.json

# Quoted delimiter: the Python source is passed through without shell
# expansion. The script exits non-zero when there is no checkpoint, so that
# case is reported as a failure.
if python3 << 'EOF'
import shutil
import sys
from pathlib import Path

# Find latest checkpoint: the directories are sorted by path in descending
# order and the first entry is used.
checkpoints_dir = Path('.loki/state/checkpoints')
checkpoints = sorted(checkpoints_dir.iterdir(), reverse=True)

if not checkpoints:
    print("NO_CHECKPOINT")
    sys.exit(1)

latest = checkpoints[0]

# Restore orchestrator state
if (latest / 'orchestrator.json').exists():
    shutil.copy(latest / 'orchestrator.json', '.loki/state/orchestrator.json')

# Restore agent states
if (latest / 'agents').exists():
    for agent_file in (latest / 'agents').iterdir():
        shutil.copy(agent_file, f'.loki/state/agents/{agent_file.name}')

# Restore queue
if (latest / 'queue').exists():
    for queue_file in (latest / 'queue').iterdir():
        shutil.copy(queue_file, f'.loki/queue/{queue_file.name}')

print(f"RESTORED:{latest.name}")
EOF
then
    # Verify restoration: the state file must parse again and carry the
    # version that was captured in the checkpoint.
    restored_version=$(python3 -c "
import json
data = json.load(open('.loki/state/orchestrator.json'))
print(data.get('version', 'unknown'))
")

    if [ "$restored_version" = "2.1.0" ]; then
        log_pass "State restored from checkpoint"
    else
        log_fail "State restoration failed (version: $restored_version)"
    fi
else
    log_fail "State restoration failed (restore step did not complete)"
fi

# Test 5: Orphaned task detection
log_test "Detect orphaned tasks"
# The result follows the exit status of the Python check, so a failed
# assertion is reported as a failure rather than counted as a pass.
if python3 << 'EOF'
import json
from datetime import datetime, timedelta, timezone

CLAIM_TIMEOUT = 3600  # seconds (1 hour)

def utc_stamp(moment):
    """Format an aware UTC datetime as '<naive ISO timestamp>Z'."""
    return moment.replace(tzinfo=None).isoformat() + 'Z'

# Create an old claimed task (claimed two hours ago, i.e. past the timeout).
# datetime.now(timezone.utc) replaces the deprecated datetime.utcnow().
old_task = {
    "id": "task-old-001",
    "type": "eng-backend",
    "claimedBy": "dead-agent-99",
    "claimedAt": utc_stamp(datetime.now(timezone.utc) - timedelta(hours=2))
}

with open('.loki/queue/in-progress.json', 'r') as f:
    in_progress = json.load(f)

in_progress['tasks'].append(old_task)

# The stale task is written back so it stays in the in-progress queue file.
with open('.loki/queue/in-progress.json', 'w') as f:
    json.dump(in_progress, f)

def find_orphaned_tasks(in_progress_tasks):
    """Return the ids of tasks whose claim is older than CLAIM_TIMEOUT seconds.

    Tasks without a 'claimedAt' value are skipped.
    """
    orphaned = []
    now = datetime.now(timezone.utc)

    for task in in_progress_tasks:
        if task.get('claimedAt'):
            # fromisoformat() is given an explicit +00:00 offset in place of
            # the trailing 'Z', which yields an aware datetime comparable
            # with `now`.
            claimed_at = datetime.fromisoformat(task['claimedAt'].replace('Z', '+00:00'))
            age = (now - claimed_at).total_seconds()
            if age > CLAIM_TIMEOUT:
                orphaned.append(task['id'])

    return orphaned

orphaned = find_orphaned_tasks(in_progress['tasks'])
print(f"ORPHANED:{len(orphaned)}")
assert len(orphaned) >= 1, "Should find orphaned task"
assert 'task-old-001' in orphaned, "Should flag the task claimed two hours ago"
print("VERIFIED")
EOF
then
    log_pass "Orphaned task detection works"
else
    log_fail "Orphaned task detection failed"
fi

# Test 6: Re-queue orphaned tasks
log_test "Re-queue orphaned tasks"
# The Python step now asserts on what it wrote and the result follows its exit
# status; previously nothing was checked and the test always passed.
if python3 << 'EOF'
import json
from datetime import datetime, timezone

CLAIM_TIMEOUT = 3600  # seconds

with open('.loki/queue/in-progress.json', 'r') as f:
    in_progress = json.load(f)

with open('.loki/queue/pending.json', 'r') as f:
    pending = json.load(f)

# datetime.now(timezone.utc) replaces the deprecated datetime.utcnow().
now = datetime.now(timezone.utc)
requeued = []

# Iterate over a copy because timed-out tasks are removed from the list.
for task in in_progress['tasks'][:]:
    if task.get('claimedAt'):
        claimed_at = datetime.fromisoformat(task['claimedAt'].replace('Z', '+00:00'))
        age = (now - claimed_at).total_seconds()

        if age > CLAIM_TIMEOUT:
            # Re-queue: clear claim and move to pending
            task['claimedBy'] = None
            task['claimedAt'] = None
            task['requeuedAt'] = now.replace(tzinfo=None).isoformat() + 'Z'
            task['requeueReason'] = 'claim_timeout'

            pending['tasks'].append(task)
            in_progress['tasks'].remove(task)
            requeued.append(task['id'])

with open('.loki/queue/in-progress.json', 'w') as f:
    json.dump(in_progress, f)

with open('.loki/queue/pending.json', 'w') as f:
    json.dump(pending, f)

print(f"REQUEUED:{len(requeued)}")

# Verify against what actually landed on disk.
with open('.loki/queue/in-progress.json', 'r') as f:
    in_progress_ids = [t['id'] for t in json.load(f)['tasks']]

with open('.loki/queue/pending.json', 'r') as f:
    pending_ids = [t['id'] for t in json.load(f)['tasks']]

assert requeued, "Should re-queue at least one timed-out task"
for task_id in requeued:
    assert task_id in pending_ids, f"{task_id} missing from pending queue"
    assert task_id not in in_progress_ids, f"{task_id} still in progress"
print("VERIFIED")
EOF
then
    log_pass "Orphaned tasks re-queued"
else
    log_fail "Orphaned task re-queue failed"
fi

# Test 7: Rate limit backoff simulation
log_test "Rate limit exponential backoff"
# The result follows the exit status of the Python check, so a failed
# assertion is reported as a failure rather than counted as a pass.
if python3 << 'EOF'
import random

def calculate_backoff(attempt, base_delay=60, max_delay=3600):
    """Calculate exponential backoff with jitter.

    The base delay doubles per attempt and is capped at max_delay; up to 10%
    of the capped delay is then added as random jitter.
    """
    delay = min(base_delay * (2 ** attempt), max_delay)
    jitter = random.uniform(0, delay * 0.1)
    return delay + jitter

# Test backoff progression
delays = []
for attempt in range(5):
    delay = calculate_backoff(attempt)
    delays.append(delay)
    print(f"Attempt {attempt}: {delay:.0f}s")

# Verify exponential growth: each delay lies between the un-jittered value and
# that value plus the maximum 10% jitter.
for attempt, delay in enumerate(delays):
    expected = 60 * (2 ** attempt)
    assert expected <= delay <= expected + expected * 0.1, \
        f"Attempt {attempt}: {delay} outside [{expected}, {expected} + 10%]"

# Verify the cap. With the defaults, attempts 0-4 stay below max_delay
# (attempt 4 is 960s), so a later attempt is needed to reach it.
capped = calculate_backoff(10)
print(f"Attempt 10: {capped:.0f}s")
assert 3600 <= capped <= 3600 + 3600 * 0.1, "Should cap at max_delay"

print("VERIFIED")
EOF
then
    log_pass "Exponential backoff works"
else
    log_fail "Exponential backoff check failed"
fi

# Test 8: Full system recovery
log_test "Full system recovery simulation"
# The result follows the exit status of the Python step, so an exception
# during recovery is reported as a failure rather than counted as a pass.
if python3 << 'EOF'
import json
from pathlib import Path
from datetime import datetime

def recover_system():
    """Full system recovery procedure.

    Returns a list of human-readable log lines describing what was found and
    what was changed.
    """
    recovery_log = []

    # 1. Check orchestrator state
    # Default to an empty state so the later steps still run when the file
    # cannot be loaded (previously `state` was left undefined in that case).
    state = {}
    try:
        with open('.loki/state/orchestrator.json', 'r') as f:
            state = json.load(f)
        recovery_log.append("Orchestrator state: OK")
    except (OSError, json.JSONDecodeError):
        recovery_log.append("Orchestrator state: RESTORE FROM CHECKPOINT")
        # Would restore here

    # 2. Check agent states
    agents_dir = Path('.loki/state/agents')
    active_agents = []
    dead_agents = []

    for agent_file in agents_dir.glob('*.json'):
        with open(agent_file, 'r') as f:
            agent = json.load(f)

        # Check heartbeat (agents without one are counted as neither)
        if agent.get('lastHeartbeat'):
            hb = datetime.fromisoformat(agent['lastHeartbeat'].replace('Z', '+00:00'))
            age = (datetime.now(hb.tzinfo) - hb).total_seconds()
            if age > 600:  # 10 min heartbeat timeout
                dead_agents.append(agent['id'])
            else:
                active_agents.append(agent['id'])

    recovery_log.append(f"Active agents: {len(active_agents)}")
    recovery_log.append(f"Dead agents: {len(dead_agents)}")

    # 3. Re-queue tasks from dead agents: clear the claim and move the task
    #    from the in-progress queue to the pending queue.
    with open('.loki/queue/in-progress.json', 'r') as f:
        in_progress = json.load(f)

    with open('.loki/queue/pending.json', 'r') as f:
        pending = json.load(f)

    requeued = 0
    still_in_progress = []
    for task in in_progress['tasks']:
        if task.get('claimedBy') in dead_agents:
            task['claimedBy'] = None
            task['claimedAt'] = None
            pending['tasks'].append(task)
            requeued += 1
        else:
            still_in_progress.append(task)
    in_progress['tasks'] = still_in_progress

    with open('.loki/queue/in-progress.json', 'w') as f:
        json.dump(in_progress, f)

    with open('.loki/queue/pending.json', 'w') as f:
        json.dump(pending, f)

    recovery_log.append(f"Re-queued tasks: {requeued}")

    # 4. Reset circuit breakers if cooldown expired
    #    (only the in-memory state is changed; it is not written back)
    if 'circuitBreakers' in state:
        for cb_name, cb in state['circuitBreakers'].items():
            if cb.get('state') == 'open' and cb.get('cooldownUntil'):
                cooldown = datetime.fromisoformat(cb['cooldownUntil'].replace('Z', '+00:00'))
                if datetime.now(cooldown.tzinfo) > cooldown:
                    cb['state'] = 'half-open'
                    recovery_log.append(f"Circuit breaker {cb_name}: OPEN -> HALF-OPEN")

    return recovery_log

log = recover_system()
for entry in log:
    print(entry)

print("RECOVERY_COMPLETE")
EOF
then
    log_pass "Full system recovery works"
else
    log_fail "Full system recovery failed"
fi

# Final report: print the totals, then exit with 0 when no check failed and
# with 1 otherwise.
separator="========================================"

echo ""
echo "$separator"
echo "Test Summary"
echo "$separator"
echo -e "${GREEN}Passed: $PASSED${NC}"
echo -e "${RED}Failed: $FAILED${NC}"
echo ""

if [ "$FAILED" -eq 0 ]; then
    verdict="${GREEN}All tests passed!${NC}"
    exit_code=0
else
    verdict="${RED}Some tests failed!${NC}"
    exit_code=1
fi

echo -e "$verdict"
exit "$exit_code"