Files
claude-code-skills-reference/transcript-fixer/scripts/utils/audit_log_retention.py
daymade 9b724f33e3 Release v1.9.0: Add video-comparer skill and enhance transcript-fixer
## New Skill: video-comparer v1.0.0
- Compare original and compressed videos with interactive HTML reports
- Calculate quality metrics (PSNR, SSIM) for compression analysis
- Generate frame-by-frame visual comparisons (slider, side-by-side, grid)
- Extract video metadata (codec, resolution, bitrate, duration)
- Multi-platform FFmpeg support with security features

## transcript-fixer Enhancements
- Add async AI processor for parallel processing
- Add connection pool management for database operations
- Add concurrency manager and rate limiter
- Add audit log retention and database migrations
- Add health check and metrics monitoring
- Add comprehensive test suite (8 new test files)
- Enhance security with domain and path validators

## Marketplace Updates
- Update marketplace version from 1.8.0 to 1.9.0
- Update skills count from 15 to 16
- Update documentation (README.md, CLAUDE.md, CHANGELOG.md)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-30 00:23:12 +08:00

710 lines
25 KiB
Python

#!/usr/bin/env python3
"""
Audit Log Retention Management Module
CRITICAL FIX (P1-11): Production-grade audit log retention and compliance
Features:
- Configurable retention policies per entity type
- Automatic cleanup of expired logs
- Archive capability for long-term storage
- Compliance reporting (GDPR, SOX, etc.)
- Selective retention based on criticality
- Restoration from archives
Compliance Standards:
- GDPR: Right to erasure, data minimization
- SOX: 7-year retention for financial records
- HIPAA: 6-year retention for healthcare data
- Industry best practices
Author: Chief Engineer (ISTJ, 20 years experience)
Date: 2025-10-29
Priority: P1 - High
"""
from __future__ import annotations

import copy
import gzip
import json
import logging
import sqlite3
from contextlib import contextmanager
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import Any, Dict, Final, List, Optional
# Module-level logger named after this module; no handlers are configured here.
logger = logging.getLogger(__name__)
class RetentionPeriod(Enum):
    """Named retention windows, expressed in whole days.

    ``PERMANENT`` is the sentinel value ``-1`` and means "never delete".
    """

    # Operational logs
    SHORT = 30
    # Default window
    MEDIUM = 90
    # Six months
    LONG = 180
    # One year
    ANNUAL = 365
    # Seven 365-day years, for SOX compliance
    COMPLIANCE_SOX = 7 * 365
    # Six 365-day years, for HIPAA
    COMPLIANCE_HIPAA = 6 * 365
    # Sentinel: never delete
    PERMANENT = -1
class CleanupStrategy(Enum):
    """What to do with an audit log once its retention window has passed."""

    # Remove the row permanently.
    DELETE = "delete"
    # Write the row to an archive first, then remove it.
    ARCHIVE = "archive"
    # Keep the row but strip PII, leaving metadata in place.
    ANONYMIZE = "anonymize"
@dataclass
class RetentionPolicy:
    """Retention settings for one audit-log entity type.

    Raises:
        ValueError: If ``retention_days`` is below -1, or if a non-zero
            ``critical_action_retention_days`` is shorter than
            ``retention_days``.
    """

    entity_type: str
    retention_days: int
    strategy: CleanupStrategy = CleanupStrategy.ARCHIVE
    # Longer window applied to critical actions; None means no extension.
    critical_action_retention_days: Optional[int] = None
    is_active: bool = True
    description: Optional[str] = None

    def __post_init__(self):
        """Reject inconsistent retention windows at construction time."""
        if self.retention_days < -1:
            raise ValueError("retention_days must be -1 (permanent) or positive")

        extended_days = self.critical_action_retention_days
        if not extended_days:
            # None (or 0) means there is no extended window to validate.
            return
        if extended_days < self.retention_days:
            raise ValueError("critical_action_retention_days must be >= retention_days")
@dataclass
class CleanupResult:
    """Outcome of one cleanup pass over a single entity type."""

    entity_type: str
    records_scanned: int
    records_deleted: int
    records_archived: int
    records_anonymized: int
    execution_time_ms: int
    errors: List[str]
    success: bool

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dictionary keyed by field name."""
        payload = asdict(self)
        return payload
@dataclass
class ComplianceReport:
    """Snapshot of audit-log statistics and retention violations."""

    report_date: datetime
    total_audit_logs: int
    oldest_log_date: Optional[datetime]
    newest_log_date: Optional[datetime]
    logs_by_entity_type: Dict[str, int]
    retention_violations: List[str]
    archived_logs_count: int
    storage_size_mb: float
    is_compliant: bool

    def to_dict(self) -> Dict[str, Any]:
        """Return the report as a dictionary with dates as ISO-8601 strings.

        Date fields that are ``None`` are left as ``None``.
        """
        payload = asdict(self)
        payload['report_date'] = self.report_date.isoformat()
        for field_name in ('oldest_log_date', 'newest_log_date'):
            moment = getattr(self, field_name)
            if moment:
                payload[field_name] = moment.isoformat()
        return payload
# Audit actions that qualify for a policy's extended
# (critical_action_retention_days) retention window.
CRITICAL_ACTIONS: Final[set] = {
    'approve_learned_suggestion',
    'delete_correction',
    'migration_applied',
    'reject_learned_suggestion',
    'security_event',
    'system_config_change',
    'update_correction',
}
class AuditLogRetentionManager:
    """
    Production-grade audit log retention management

    Features:
    - Automatic cleanup based on retention policies
    - Archival to compressed files
    - Compliance reporting
    - Selective retention for critical actions
    - Transaction safety
    """

    # Maximum number of "?" placeholders per DELETE statement. SQLite rejects
    # statements that bind more parameters than SQLITE_MAX_VARIABLE_NUMBER
    # (999 by default before SQLite 3.32.0), so deletions run in batches.
    _DELETE_BATCH_SIZE = 500

    def __init__(self, db_path: Path, archive_dir: Optional[Path] = None):
        """
        Initialize retention manager

        Args:
            db_path: Path to SQLite database
            archive_dir: Directory for archived logs (defaults to
                db_path.parent / 'archives'). Created if missing.
        """
        self.db_path = Path(db_path)
        # Coerce like db_path so a plain string is accepted as well.
        self.archive_dir = (
            Path(archive_dir) if archive_dir else self.db_path.parent / "archives"
        )
        self.archive_dir.mkdir(parents=True, exist_ok=True)

        # Default retention policies (can be overridden in database)
        self.default_policies = {
            'correction': RetentionPolicy(
                entity_type='correction',
                retention_days=RetentionPeriod.ANNUAL.value,
                strategy=CleanupStrategy.ARCHIVE,
                critical_action_retention_days=RetentionPeriod.COMPLIANCE_SOX.value,
                description='Correction operations'
            ),
            'suggestion': RetentionPolicy(
                entity_type='suggestion',
                retention_days=RetentionPeriod.MEDIUM.value,
                strategy=CleanupStrategy.ARCHIVE,
                description='Learning suggestions'
            ),
            'system': RetentionPolicy(
                entity_type='system',
                retention_days=RetentionPeriod.COMPLIANCE_SOX.value,
                strategy=CleanupStrategy.ARCHIVE,
                description='System configuration changes'
            ),
            'migration': RetentionPolicy(
                entity_type='migration',
                retention_days=RetentionPeriod.PERMANENT.value,
                strategy=CleanupStrategy.ARCHIVE,
                description='Database migrations'
            ),
        }

    @contextmanager
    def _get_connection(self):
        """Yield a sqlite3 connection (Row factory) and close it on exit."""
        conn = sqlite3.connect(str(self.db_path))
        conn.row_factory = sqlite3.Row
        try:
            yield conn
        finally:
            conn.close()

    @contextmanager
    def _transaction(self):
        """Yield a cursor inside a transaction.

        Commits when the block exits normally (including an early
        ``return``); rolls back and re-raises on any exception.
        """
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("BEGIN")
            try:
                yield cursor
                conn.commit()
            except Exception:
                conn.rollback()
                raise

    def load_retention_policies(self) -> Dict[str, RetentionPolicy]:
        """
        Load retention policies from database

        Active rows in ``retention_policies`` override ``retention_days`` /
        ``is_active`` of a default policy, or add a policy for a new entity
        type. Rows with an invalid ``retention_days`` are ignored with a
        warning. If the table cannot be read, the defaults are returned.

        Returns:
            Dictionary of policies by entity_type
        """
        # Copy each default so database overrides never leak into
        # self.default_policies (and from there into later calls).
        policies = {
            name: copy.copy(policy)
            for name, policy in self.default_policies.items()
        }
        try:
            with self._get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("""
                    SELECT entity_type, retention_days, is_active, description
                    FROM retention_policies
                    WHERE is_active = 1
                """)
                for row in cursor.fetchall():
                    entity_type = row['entity_type']
                    retention_days = row['retention_days']
                    # A value below -1 would put the cutoff in the future
                    # and expire every log; NULL/non-integers cannot be used.
                    if not isinstance(retention_days, int) or retention_days < -1:
                        logger.warning(
                            f"Ignoring invalid retention_days for {entity_type}: "
                            f"{retention_days!r}"
                        )
                        continue

                    existing = policies.get(entity_type)
                    if existing is not None:
                        # Update (the copy of) the default policy
                        existing.retention_days = retention_days
                        existing.is_active = bool(row['is_active'])
                    else:
                        policies[entity_type] = RetentionPolicy(
                            entity_type=entity_type,
                            retention_days=retention_days,
                            is_active=bool(row['is_active']),
                            description=row['description']
                        )
        except sqlite3.Error as e:
            logger.warning(f"Failed to load retention policies from database: {e}")
            # Continue with default policies
        return policies

    def _archive_logs(self, logs: List[Dict[str, Any]], entity_type: str) -> Path:
        """
        Archive logs to compressed file

        The file is created exclusively: if an archive with the same
        second-resolution timestamp already exists, a numeric suffix is
        appended instead of overwriting it.

        Args:
            logs: List of log records
            entity_type: Entity type being archived

        Returns:
            Path to archive file
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        base_name = f"audit_log_{entity_type}_{timestamp}"
        archive_file = self.archive_dir / f"{base_name}.json.gz"
        attempt = 0
        while True:
            try:
                # 'x' fails if the file exists, so earlier archives survive.
                with gzip.open(archive_file, 'xt', encoding='utf-8') as f:
                    json.dump(logs, f, indent=2, default=str)
                break
            except FileExistsError:
                attempt += 1
                archive_file = self.archive_dir / f"{base_name}_{attempt}.json.gz"

        logger.info(f"Archived {len(logs)} logs to {archive_file}")
        return archive_file

    def _anonymize_log(self, log: Dict[str, Any]) -> Dict[str, Any]:
        """
        Anonymize log record (remove PII while keeping metadata)

        ``user`` is replaced when set. ``details`` is parsed as JSON; for a
        JSON object, top-level keys containing 'email', 'name', 'ip' or
        'address' are masked. Anything that is not a JSON object (invalid
        JSON, a list, a scalar) is replaced entirely.

        Args:
            log: Log record (not modified)

        Returns:
            Anonymized copy of the log record
        """
        anonymized = dict(log)

        # Remove/mask PII fields
        if anonymized.get('user'):
            anonymized['user'] = 'ANONYMIZED'

        if anonymized.get('details'):
            try:
                details = json.loads(anonymized['details'])
            except (json.JSONDecodeError, TypeError):
                details = None

            if isinstance(details, dict):
                # Keep only non-PII metadata
                for key in details:
                    if any(pii in key.lower() for pii in ('email', 'name', 'ip', 'address')):
                        details[key] = 'ANONYMIZED'
                anonymized['details'] = json.dumps(details)
            else:
                # Not a JSON object: there are no keys to inspect, so mask
                # the whole value rather than fail on details.keys().
                anonymized['details'] = 'ANONYMIZED'

        return anonymized

    @staticmethod
    def _is_within_critical_retention(
        action: Any,
        timestamp: str,
        critical_cutoff: Optional[datetime]
    ) -> bool:
        """Return True if a log is a critical action still inside its extended window.

        Args:
            action: The log's action value
            timestamp: The log's timestamp as an ISO-format string
            critical_cutoff: Start of the extended window, or None when the
                policy defines no extended retention
        """
        if critical_cutoff is None or action not in CRITICAL_ACTIONS:
            return False
        return datetime.fromisoformat(timestamp) >= critical_cutoff

    def _delete_logs_by_id(self, cursor, log_ids: List[Any]) -> int:
        """Delete audit_log rows by id in bounded batches; return rows deleted."""
        deleted = 0
        for start in range(0, len(log_ids), self._DELETE_BATCH_SIZE):
            batch = log_ids[start:start + self._DELETE_BATCH_SIZE]
            placeholders = ','.join('?' * len(batch))
            cursor.execute(
                f"DELETE FROM audit_log WHERE id IN ({placeholders})",
                batch
            )
            deleted += cursor.rowcount
        return deleted

    def cleanup_expired_logs(
        self,
        entity_type: Optional[str] = None,
        dry_run: bool = False
    ) -> List[CleanupResult]:
        """
        Clean up expired audit logs based on retention policies

        Inactive policies and policies with permanent retention are skipped
        and produce no result entry.

        Args:
            entity_type: Specific entity type to clean (None for all)
            dry_run: If True, only simulate without actual deletion

        Returns:
            List of cleanup results per entity type
        """
        policies = self.load_retention_policies()
        results: List[CleanupResult] = []

        # Filter policies
        if entity_type:
            if entity_type not in policies:
                logger.warning(f"No retention policy found for entity_type: {entity_type}")
                return results
            policies = {entity_type: policies[entity_type]}

        for policy_type, policy in policies.items():
            if not policy.is_active:
                logger.info(f"Skipping inactive policy for {policy_type}")
                continue
            if policy.retention_days == RetentionPeriod.PERMANENT.value:
                logger.info(f"Permanent retention for {policy_type}, skipping cleanup")
                continue
            results.append(self._cleanup_entity_type(policy, dry_run))

        return results

    def _cleanup_entity_type(
        self,
        policy: RetentionPolicy,
        dry_run: bool = False
    ) -> CleanupResult:
        """
        Clean up logs for specific entity type

        Selects logs older than the policy's retention window, keeps back
        critical actions still inside the extended window, then applies the
        policy's strategy inside one transaction and records the run in
        ``cleanup_history``. On failure the transaction is rolled back and a
        failure row is recorded on a best-effort basis.

        Args:
            policy: Retention policy to apply
            dry_run: Simulation mode

        Returns:
            Cleanup result
        """
        start_time = datetime.now()
        entity_type = policy.entity_type
        errors: List[str] = []
        records_scanned = 0
        records_deleted = 0
        records_archived = 0
        records_anonymized = 0

        def elapsed_ms() -> int:
            return int((datetime.now() - start_time).total_seconds() * 1000)

        try:
            # Calculate cutoff date
            cutoff_date = datetime.now() - timedelta(days=policy.retention_days)

            # Extended retention for critical actions
            critical_cutoff_date = None
            if policy.critical_action_retention_days:
                critical_cutoff_date = datetime.now() - timedelta(
                    days=policy.critical_action_retention_days
                )

            with self._transaction() as cursor:
                # Find expired logs.
                # NOTE(review): the cutoff is compared as a string in local
                # time with a 'T' separator; if audit_log.timestamp is stored
                # in another format (e.g. space-separated UTC), rows near the
                # boundary may be misclassified - confirm against the schema.
                cursor.execute("""
                    SELECT * FROM audit_log
                    WHERE entity_type = ?
                    AND timestamp < ?
                    ORDER BY timestamp ASC
                """, (entity_type, cutoff_date.isoformat()))
                expired_logs = [dict(row) for row in cursor.fetchall()]
                records_scanned = len(expired_logs)

                if records_scanned == 0:
                    logger.info(f"No expired logs found for {entity_type}")
                    return CleanupResult(
                        entity_type=entity_type,
                        records_scanned=0,
                        records_deleted=0,
                        records_archived=0,
                        records_anonymized=0,
                        execution_time_ms=0,
                        errors=[],
                        success=True
                    )

                # Filter out critical actions with extended retention
                logs_to_process = [
                    log for log in expired_logs
                    if not self._is_within_critical_retention(
                        log.get('action', ''), log['timestamp'], critical_cutoff_date
                    )
                ]

                if not logs_to_process:
                    logger.info(f"All expired logs for {entity_type} are critical, skipping")
                    return CleanupResult(
                        entity_type=entity_type,
                        records_scanned=records_scanned,
                        records_deleted=0,
                        records_archived=0,
                        records_anonymized=0,
                        execution_time_ms=0,
                        errors=[],
                        success=True
                    )

                if dry_run:
                    logger.info(
                        f"[DRY RUN] Would process {len(logs_to_process)} logs "
                        f"for {entity_type} with strategy {policy.strategy.value}"
                    )
                    would_process = len(logs_to_process)
                    return CleanupResult(
                        entity_type=entity_type,
                        records_scanned=records_scanned,
                        records_deleted=would_process if policy.strategy == CleanupStrategy.DELETE else 0,
                        records_archived=would_process if policy.strategy == CleanupStrategy.ARCHIVE else 0,
                        records_anonymized=would_process if policy.strategy == CleanupStrategy.ANONYMIZE else 0,
                        execution_time_ms=0,
                        errors=[],
                        success=True
                    )

                # Execute cleanup strategy
                log_ids = [log['id'] for log in logs_to_process]

                if policy.strategy == CleanupStrategy.ARCHIVE:
                    # Archive before deletion
                    try:
                        archive_path = self._archive_logs(logs_to_process, entity_type)
                        records_archived = len(logs_to_process)
                        logger.info(f"Archived to {archive_path}")
                    except Exception as e:
                        errors.append(f"Archive failed: {e}")
                        raise
                    # Delete archived logs
                    records_deleted = self._delete_logs_by_id(cursor, log_ids)

                elif policy.strategy == CleanupStrategy.DELETE:
                    # Direct deletion (permanent)
                    records_deleted = self._delete_logs_by_id(cursor, log_ids)

                elif policy.strategy == CleanupStrategy.ANONYMIZE:
                    # Anonymize in place
                    for log in logs_to_process:
                        anonymized = self._anonymize_log(log)
                        cursor.execute("""
                            UPDATE audit_log
                            SET user = ?, details = ?
                            WHERE id = ?
                        """, (anonymized['user'], anonymized['details'], log['id']))
                    records_anonymized = len(logs_to_process)

                # Record cleanup in history
                cursor.execute("""
                    INSERT INTO cleanup_history
                    (entity_type, records_deleted, execution_time_ms, success)
                    VALUES (?, ?, ?, 1)
                """, (entity_type, records_deleted + records_anonymized, elapsed_ms()))

            logger.info(
                f"Cleanup completed for {entity_type}: "
                f"deleted={records_deleted}, archived={records_archived}, "
                f"anonymized={records_anonymized}"
            )

        except Exception as e:
            logger.error(f"Cleanup failed for {entity_type}: {e}")
            errors.append(str(e))

            # Record failure in history (best effort: never mask the
            # original failure, but do leave a trace if this fails too)
            try:
                with self._transaction() as cursor:
                    cursor.execute("""
                        INSERT INTO cleanup_history
                        (entity_type, records_deleted, execution_time_ms, success, error_message)
                        VALUES (?, 0, ?, 0, ?)
                    """, (entity_type, elapsed_ms(), str(e)))
            except Exception as history_error:
                logger.warning(
                    f"Could not record cleanup failure for {entity_type}: {history_error}"
                )

            return CleanupResult(
                entity_type=entity_type,
                records_scanned=records_scanned,
                records_deleted=0,
                records_archived=0,
                records_anonymized=0,
                execution_time_ms=elapsed_ms(),
                errors=errors,
                success=False
            )

        return CleanupResult(
            entity_type=entity_type,
            records_scanned=records_scanned,
            records_deleted=records_deleted,
            records_archived=records_archived,
            records_anonymized=records_anonymized,
            execution_time_ms=elapsed_ms(),
            errors=errors,
            success=len(errors) == 0
        )

    def generate_compliance_report(self) -> ComplianceReport:
        """
        Generate compliance report for audit purposes

        A log counts as a retention violation when it is older than its
        policy's ``retention_days`` and is not a critical action still
        inside the policy's extended retention window (those are kept on
        purpose by cleanup). Policies with permanent retention are skipped.

        Returns:
            Compliance report with statistics and violations
        """
        policies = self.load_retention_policies()

        with self._get_connection() as conn:
            cursor = conn.cursor()

            # Total audit logs
            cursor.execute("SELECT COUNT(*) as count FROM audit_log")
            total_logs = cursor.fetchone()['count']

            # Date range
            cursor.execute("""
                SELECT
                    MIN(timestamp) as oldest,
                    MAX(timestamp) as newest
                FROM audit_log
            """)
            row = cursor.fetchone()
            oldest_log_date = datetime.fromisoformat(row['oldest']) if row['oldest'] else None
            newest_log_date = datetime.fromisoformat(row['newest']) if row['newest'] else None

            # Logs by entity type
            cursor.execute("""
                SELECT entity_type, COUNT(*) as count
                FROM audit_log
                GROUP BY entity_type
            """)
            logs_by_entity_type = {r['entity_type']: r['count'] for r in cursor.fetchall()}

            # Check for retention violations
            violations = []
            for entity_type, policy in policies.items():
                if policy.retention_days == RetentionPeriod.PERMANENT.value:
                    continue

                cutoff_date = datetime.now() - timedelta(days=policy.retention_days)

                if policy.critical_action_retention_days:
                    # Apply the same exemption as cleanup so deliberately
                    # retained critical logs are not reported as violations.
                    critical_cutoff = datetime.now() - timedelta(
                        days=policy.critical_action_retention_days
                    )
                    cursor.execute("""
                        SELECT action, timestamp
                        FROM audit_log
                        WHERE entity_type = ? AND timestamp < ?
                    """, (entity_type, cutoff_date.isoformat()))
                    expired_count = sum(
                        1 for r in cursor
                        if not self._is_within_critical_retention(
                            r['action'], r['timestamp'], critical_cutoff
                        )
                    )
                else:
                    cursor.execute("""
                        SELECT COUNT(*) as count
                        FROM audit_log
                        WHERE entity_type = ? AND timestamp < ?
                    """, (entity_type, cutoff_date.isoformat()))
                    expired_count = cursor.fetchone()['count']

                if expired_count > 0:
                    violations.append(
                        f"{entity_type}: {expired_count} logs exceed retention period "
                        f"of {policy.retention_days} days"
                    )

        # Archived logs count (number of archive files, not of log records)
        archived_count = len(list(self.archive_dir.glob("audit_log_*.json.gz")))

        # Storage size: database file plus every .gz file in the archive dir
        db_size = self.db_path.stat().st_size if self.db_path.exists() else 0
        storage_size_mb = db_size / (1024 * 1024)
        for archive_file in self.archive_dir.glob("*.gz"):
            storage_size_mb += archive_file.stat().st_size / (1024 * 1024)

        return ComplianceReport(
            report_date=datetime.now(),
            total_audit_logs=total_logs,
            oldest_log_date=oldest_log_date,
            newest_log_date=newest_log_date,
            logs_by_entity_type=logs_by_entity_type,
            retention_violations=violations,
            archived_logs_count=archived_count,
            storage_size_mb=round(storage_size_mb, 2),
            is_compliant=len(violations) == 0
        )

    def restore_from_archive(
        self,
        archive_file: Path,
        verify_only: bool = False
    ) -> int:
        """
        Restore logs from archive file

        Logs whose ``id`` already exists in ``audit_log`` are skipped. All
        inserts happen in one transaction.

        Args:
            archive_file: Path to archive file
            verify_only: If True, only verify archive integrity

        Returns:
            Number of logs restored (or that would be restored)

        Raises:
            FileNotFoundError: If the archive file does not exist
            ValueError: If the archive does not contain a JSON list
        """
        archive_file = Path(archive_file)
        if not archive_file.exists():
            raise FileNotFoundError(f"Archive file not found: {archive_file}")

        try:
            with gzip.open(archive_file, 'rt', encoding='utf-8') as f:
                logs = json.load(f)

            # _archive_logs always writes a list of records; anything else
            # is not an archive this class produced.
            if not isinstance(logs, list):
                raise ValueError(
                    f"Archive {archive_file.name} does not contain a list of logs"
                )

            if verify_only:
                logger.info(f"Archive {archive_file.name} contains {len(logs)} logs")
                return len(logs)

            # Restore logs
            restored_count = 0
            with self._transaction() as cursor:
                for log in logs:
                    # Check if log already exists
                    cursor.execute("""
                        SELECT id FROM audit_log
                        WHERE id = ?
                    """, (log['id'],))
                    if cursor.fetchone():
                        continue  # Skip duplicates

                    # Insert log
                    cursor.execute("""
                        INSERT INTO audit_log
                        (id, timestamp, action, entity_type, entity_id, user, details, success, error_message)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                    """, (
                        log['id'],
                        log['timestamp'],
                        log['action'],
                        log['entity_type'],
                        log.get('entity_id'),
                        log.get('user'),
                        log.get('details'),
                        log.get('success', 1),
                        log.get('error_message')
                    ))
                    restored_count += 1

            logger.info(f"Restored {restored_count} logs from {archive_file.name}")
            return restored_count

        except Exception as e:
            logger.error(f"Failed to restore from archive {archive_file}: {e}")
            raise
# Lazily created process-wide manager, shared by get_retention_manager().
_global_manager: Optional[AuditLogRetentionManager] = None


def get_retention_manager(
    db_path: Optional[Path] = None,
    archive_dir: Optional[Path] = None
) -> AuditLogRetentionManager:
    """
    Return the shared AuditLogRetentionManager, creating it on first use.

    Once the shared instance exists, both arguments are ignored.

    Args:
        db_path: Database path for the first call; when omitted it is read
            from ``utils.config.get_config().database.path``
        archive_dir: Archive directory for the first call

    Returns:
        Global AuditLogRetentionManager instance
    """
    global _global_manager

    if _global_manager is not None:
        return _global_manager

    if db_path is None:
        # Imported lazily, only when the configured path is actually needed.
        from utils.config import get_config
        db_path = get_config().database.path

    _global_manager = AuditLogRetentionManager(db_path, archive_dir)
    return _global_manager
def reset_retention_manager() -> None:
    """Drop the shared manager so the next lookup builds a new one (mainly for testing)."""
    global _global_manager
    # Only the reference is cleared; nothing is closed or cleaned up here.
    _global_manager = None