Release v1.8.0: Add transcript-fixer skill

## New Skill: transcript-fixer v1.0.0

Correct speech-to-text (ASR/STT) transcription errors through dictionary-based rules and AI-powered corrections with automatic pattern learning.

**Features:**
- Two-stage correction pipeline (dictionary + AI)
- Automatic pattern detection and learning
- Domain-specific dictionaries (general, embodied_ai, finance, medical)
- SQLite-based correction repository
- Team collaboration with import/export
- GLM API integration for AI corrections
- Cost optimization through dictionary promotion

**Use cases:**
- Correcting meeting notes, lecture recordings, or interview transcripts
- Fixing Chinese/English homophone errors and technical terminology
- Building domain-specific correction dictionaries
- Improving transcript accuracy through iterative learning

**Documentation:**
- Complete workflow guides in references/
- SQL query templates
- Troubleshooting guide
- Team collaboration patterns
- API setup instructions

**Marketplace updates:**
- Updated marketplace to v1.8.0
- Added transcript-fixer plugin (category: productivity)
- Updated README.md with skill description and use cases
- Updated CLAUDE.md with skill listing and counts

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
daymade
2025-10-28 13:16:37 +08:00
parent d1041ac203
commit bd0aa12004
44 changed files with 7432 additions and 8 deletions

View File

@@ -0,0 +1,16 @@
"""
Utils Module - Utility Functions and Tools
This module contains utility functions:
- diff_generator: Multi-format diff report generation
- validation: Configuration validation
"""
from .diff_generator import generate_full_report
from .validation import validate_configuration, print_validation_summary
__all__ = [
'generate_full_report',
'validate_configuration',
'print_validation_summary',
]

View File

@@ -0,0 +1,18 @@
"""
Diff format generators for transcript comparison
"""
from .unified_format import generate_unified_diff
from .html_format import generate_html_diff
from .inline_format import generate_inline_diff
from .markdown_format import generate_markdown_report
from .change_extractor import extract_changes, generate_change_summary
__all__ = [
'generate_unified_diff',
'generate_html_diff',
'generate_inline_diff',
'generate_markdown_report',
'extract_changes',
'generate_change_summary',
]

View File

@@ -0,0 +1,102 @@
#!/usr/bin/env python3
"""
Change extraction and summarization
SINGLE RESPONSIBILITY: Extract and summarize changes between text versions
"""
from __future__ import annotations
import difflib
from .text_splitter import split_into_words
def extract_changes(original: str, fixed: str) -> list[dict]:
    """
    Extract all changes and return change list

    Args:
        original: Original text
        fixed: Fixed text

    Returns:
        List of change dictionaries with type, context, and content
    """
    src_tokens = split_into_words(original)
    dst_tokens = split_into_words(fixed)
    matcher = difflib.SequenceMatcher(None, src_tokens, dst_tokens)

    def _context(tokens: list[str], start: int, end: int) -> tuple[str, str]:
        # Up to five tokens of surrounding text on each side of the edit.
        before = ''.join(tokens[max(0, start - 5):start])
        after = ''.join(tokens[end:min(len(tokens), end + 5)])
        return before, after

    changes: list[dict] = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'replace':
            before, after = _context(src_tokens, i1, i2)
            changes.append({
                'type': 'replace',
                'original': ''.join(src_tokens[i1:i2]),
                'fixed': ''.join(dst_tokens[j1:j2]),
                'context_before': before,
                'context_after': after,
            })
        elif tag == 'delete':
            before, after = _context(src_tokens, i1, i2)
            changes.append({
                'type': 'delete',
                'original': ''.join(src_tokens[i1:i2]),
                'fixed': '',
                'context_before': before,
                'context_after': after,
            })
        elif tag == 'insert':
            # Context for insertions comes from the fixed-side token stream.
            before, after = _context(dst_tokens, j1, j2)
            changes.append({
                'type': 'insert',
                'original': '',
                'fixed': ''.join(dst_tokens[j1:j2]),
                'context_before': before if j1 > 0 else '',
                'context_after': after,
            })
    return changes
def generate_change_summary(changes: list[dict]) -> str:
"""
Generate change summary
Args:
changes: List of change dictionaries
Returns:
Formatted summary string
"""
result = []
result.append("=" * 80)
result.append(f"修改摘要 (共 {len(changes)} 处修改)")
result.append("=" * 80)
result.append("")
for i, change in enumerate(changes, 1):
change_type = {
'replace': '替换',
'delete': '删除',
'insert': '添加'
}[change['type']]
result.append(f"[{i}] {change_type}")
if change['original']:
result.append(f" 原文: {change['original']}")
if change['fixed']:
result.append(f" 修复: {change['fixed']}")
# Show context
context = change['context_before'] + "【修改处】" + change['context_after']
if context.strip():
result.append(f" 上下文: ...{context}...")
result.append("")
return '\n'.join(result)

View File

@@ -0,0 +1,37 @@
#!/usr/bin/env python3
"""
HTML diff format generator
SINGLE RESPONSIBILITY: Generate HTML side-by-side comparison
"""
from __future__ import annotations
import difflib
def generate_html_diff(original: str, fixed: str) -> str:
"""
Generate HTML format comparison report (side-by-side)
Args:
original: Original text
fixed: Fixed text
Returns:
HTML format string with side-by-side comparison
"""
original_lines = original.splitlines(keepends=True)
fixed_lines = fixed.splitlines(keepends=True)
differ = difflib.HtmlDiff(wrapcolumn=80)
html = differ.make_file(
original_lines,
fixed_lines,
fromdesc='原始版本',
todesc='修复版本',
context=True,
numlines=3
)
return html

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env python3
"""
Inline diff format generator
SINGLE RESPONSIBILITY: Generate inline diff with change markers
"""
from __future__ import annotations
import difflib
from .text_splitter import split_into_words
def generate_inline_diff(original: str, fixed: str) -> str:
    """
    Generate inline diff marking deletions and additions

    Format:
    - Normal words: unchanged
    - Deletions: [-word-]
    - Additions: [+word+]

    Args:
        original: Original text
        fixed: Fixed text

    Returns:
        Inline diff string with markers
    """
    divider = "=" * 80
    output = [divider, "行内词语级别对比 (- 删除, + 添加, ? 修改标记)", divider, ""]
    buffer: list[str] = []
    for entry in difflib.ndiff(split_into_words(original), split_into_words(fixed)):
        code, token = entry[0], entry[2:]
        if code == '?':
            # ndiff '?' hint lines carry no content of their own; drop them.
            continue
        if code == ' ':
            buffer.append(token)
        elif code == '-':
            buffer.append(f"[-{token}-]")
        elif code == '+':
            buffer.append(f"[+{token}+]")
        # Flush the accumulated line once it exceeds 80 characters.
        if len(''.join(buffer)) > 80:
            output.append(''.join(buffer))
            buffer = []
    if buffer:
        output.append(''.join(buffer))
    return '\n'.join(output)

View File

@@ -0,0 +1,104 @@
#!/usr/bin/env python3
"""
Markdown report generator
SINGLE RESPONSIBILITY: Generate detailed Markdown comparison report
"""
from __future__ import annotations
from datetime import datetime
from pathlib import Path
from .change_extractor import extract_changes, generate_change_summary
def generate_markdown_report(
    original_file: str,
    stage1_file: str,
    stage2_file: str,
    original: str,
    stage1: str,
    stage2: str
) -> str:
    """
    Generate comprehensive Markdown comparison report

    Builds per-stage change lists via extract_changes() and renders them,
    with summary statistics, into a fixed Markdown template.

    Args:
        original_file: Original file path
        stage1_file: Stage 1 file path
        stage2_file: Stage 2 file path
        original: Original text content
        stage1: Stage 1 text content
        stage2: Stage 2 text content

    Returns:
        Formatted Markdown report string
    """
    # Paths are used only for their base file names in the report header.
    original_path = Path(original_file)
    stage1_path = Path(stage1_file)
    stage2_path = Path(stage2_file)
    # Extract changes for each stage: original->stage1 (dictionary),
    # stage1->stage2 (AI), and original->stage2 (overall).
    changes_stage1 = extract_changes(original, stage1)
    changes_stage2 = extract_changes(stage1, stage2)
    changes_total = extract_changes(original, stage2)
    # Generate human-readable summaries for each change list.
    summary_stage1 = generate_change_summary(changes_stage1)
    summary_stage2 = generate_change_summary(changes_stage2)
    summary_total = generate_change_summary(changes_total)
    # Build report (template text is user-facing Chinese; left verbatim).
    report = f"""# 会议记录修复对比报告
## 文件信息
- **原始文件**: {original_path.name}
- **阶段1修复**: {stage1_path.name}
- **阶段2修复**: {stage2_path.name}
- **生成时间**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
## 修改统计
| 阶段 | 修改数量 | 说明 |
|------|---------|------|
| 阶段1: 词典修复 | {len(changes_stage1)} | 基于预定义词典的批量替换 |
| 阶段2: AI修复 | {len(changes_stage2)} | GLM-4.6智能纠错 |
| **总计** | **{len(changes_total)}** | **原始→最终版本** |
---
# 阶段1: 词典修复详情
{summary_stage1}
---
# 阶段2: AI智能修复详情
{summary_stage2}
---
# 总体修改详情 (原始→最终)
{summary_total}
---
## 使用说明
1. **查看修改**: 每处修改都包含上下文,便于理解修改原因
2. **人工审核**: 重点审核标记为"替换"的修改
3. **专业术语**: 特别注意公司名、人名、技术术语的修改
## 建议审核重点
- [ ] 专业术语(具身智能、机器人等)
- [ ] 人名和公司名
- [ ] 数字(金额、时间等)
- [ ] 上下文是否通顺
"""
    return report

View File

@@ -0,0 +1,33 @@
#!/usr/bin/env python3
"""
Text splitter utility for word-level diff generation
SINGLE RESPONSIBILITY: Split text into words while preserving structure
"""
from __future__ import annotations
import re
def split_into_words(text: str) -> list[str]:
    """
    Split text into words, preserving whitespace and punctuation

    This enables word-level diff generation for Chinese and English text

    Args:
        text: Input text to split

    Returns:
        List of word tokens (Chinese words, English words, numbers, punctuation)
    """
    # Runs of CJK ideographs, runs of ASCII letters, runs of digits, and
    # every remaining character (punctuation, whitespace) as its own token.
    token_pattern = re.compile(
        r'[\u4e00-\u9fff]+|[a-zA-Z]+|[0-9]+|[^\u4e00-\u9fffa-zA-Z0-9]'
    )
    return token_pattern.findall(text)
def read_file(file_path: str) -> str:
    """Return the entire content of *file_path* decoded as UTF-8."""
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()

View File

@@ -0,0 +1,44 @@
#!/usr/bin/env python3
"""
Unified diff format generator
SINGLE RESPONSIBILITY: Generate unified diff format output
"""
from __future__ import annotations
import difflib
from .text_splitter import split_into_words
def generate_unified_diff(
    original: str,
    fixed: str,
    original_label: str = "原始版本",
    fixed_label: str = "修复版本"
) -> str:
    """
    Generate unified format diff report

    Args:
        original: Original text
        fixed: Fixed text
        original_label: Label for original version
        fixed_label: Label for fixed version

    Returns:
        Unified diff format string
    """
    # Diff at word-token granularity rather than whole lines for finer output.
    diff_lines = difflib.unified_diff(
        split_into_words(original),
        split_into_words(fixed),
        fromfile=original_label,
        tofile=fixed_label,
        lineterm=''
    )
    return '\n'.join(diff_lines)

View File

@@ -0,0 +1,132 @@
#!/usr/bin/env python3
"""
Generate word-level correction comparison reports
Orchestrates multiple diff formats for visualization
SINGLE RESPONSIBILITY: Coordinate diff generation workflow
"""
from __future__ import annotations
import sys
from pathlib import Path
from .diff_formats import (
generate_unified_diff,
generate_html_diff,
generate_inline_diff,
generate_markdown_report,
)
from .diff_formats.text_splitter import read_file
def generate_full_report(
    original_file: str,
    stage1_file: str,
    stage2_file: str,
    output_dir: str | None = None
) -> None:
    """
    Generate comprehensive comparison report

    Creates 4 output files:
    1. Markdown format detailed report
    2. Unified diff format
    3. HTML side-by-side comparison
    4. Inline marked comparison

    Args:
        original_file: Path to original transcript
        stage1_file: Path to stage 1 (dictionary) corrected version
        stage2_file: Path to stage 2 (AI) corrected version
        output_dir: Optional output directory (defaults to original file location)
    """
    original_path = Path(original_file)
    # Determine output directory; create it only when explicitly requested.
    if output_dir:
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
    else:
        output_path = original_path.parent
    # Output file names derive from the original file's stem.
    base_name = original_path.stem
    # Read files
    print("📖 读取文件...")
    original = read_file(original_file)
    stage1 = read_file(stage1_file)
    stage2 = read_file(stage2_file)
    # Generate reports
    print("📝 生成对比报告...")
    # 1. Markdown report (the only format that compares all three stages)
    print(" 生成Markdown报告...")
    md_report = generate_markdown_report(
        original_file, stage1_file, stage2_file,
        original, stage1, stage2
    )
    md_file = output_path / f"{base_name}_对比报告.md"
    md_file.write_text(md_report, encoding='utf-8')
    print(f" ✓ Markdown报告: {md_file.name}")
    # 2. Unified Diff (original vs. final only)
    print(" 生成Unified Diff...")
    diff_file = output_path / f"{base_name}_unified.diff"
    diff_file.write_text(generate_unified_diff(original, stage2), encoding='utf-8')
    print(f" ✓ Unified Diff: {diff_file.name}")
    # 3. HTML comparison (original vs. final only)
    print(" 生成HTML对比...")
    html_file = output_path / f"{base_name}_对比.html"
    html_file.write_text(generate_html_diff(original, stage2), encoding='utf-8')
    print(f" ✓ HTML对比: {html_file.name}")
    # 4. Inline diff (original vs. final only)
    print(" 生成行内diff...")
    inline_file = output_path / f"{base_name}_行内对比.txt"
    inline_file.write_text(generate_inline_diff(original, stage2), encoding='utf-8')
    print(f" ✓ 行内对比: {inline_file.name}")
    # Summary
    print("\n✅ 对比报告生成完成!")
    print(f"📂 输出目录: {output_path}")
    print("\n生成的文件:")
    print(f" 1. {md_file.name} - Markdown格式详细报告")
    print(f" 2. {diff_file.name} - Unified Diff格式")
    print(f" 3. {html_file.name} - HTML并排对比")
    print(f" 4. {inline_file.name} - 行内标记对比")
def main():
"""CLI entry point"""
if len(sys.argv) < 4:
print("用法: python generate_diff_report.py <原始文件> <阶段1文件> <阶段2文件> [输出目录]")
print()
print("示例:")
print(" python generate_diff_report.py \\")
print(" 原始.md \\")
print(" 原始_阶段1_词典修复.md \\")
print(" 原始_阶段2_AI修复.md")
sys.exit(1)
original_file = sys.argv[1]
stage1_file = sys.argv[2]
stage2_file = sys.argv[3]
output_dir = sys.argv[4] if len(sys.argv) > 4 else None
generate_full_report(original_file, stage1_file, stage2_file, output_dir)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
Logging Configuration for Transcript Fixer
Provides structured logging with rotation, levels, and audit trails.
"""
import logging
import logging.handlers
import sys
from pathlib import Path
from typing import Optional
def setup_logging(
    log_dir: Optional[Path] = None,
    level: str = "INFO",
    enable_console: bool = True,
    enable_file: bool = True,
    enable_audit: bool = True
) -> None:
    """
    Configure logging for the application.

    Installs a console handler (at *level*), two rotating file handlers
    (full DEBUG log plus an errors-only log), and a dedicated,
    non-propagating 'audit' logger with its own rotating file.

    Args:
        log_dir: Directory for log files (default: ~/.transcript-fixer/logs)
        level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        enable_console: Enable console output
        enable_file: Enable file logging
        enable_audit: Enable audit logging

    Example:
        >>> setup_logging(level="DEBUG")
        >>> logger = logging.getLogger(__name__)
        >>> logger.info("Application started")
    """
    # Default log directory
    if log_dir is None:
        log_dir = Path.home() / ".transcript-fixer" / "logs"
    log_dir.mkdir(parents=True, exist_ok=True)

    # Root logger captures everything; individual handlers filter by level.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    # Clear existing handlers so repeated calls do not duplicate output.
    root_logger.handlers.clear()

    # Formatters
    detailed_formatter = logging.Formatter(
        fmt='%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    simple_formatter = logging.Formatter(
        fmt='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    # Console handler honours the requested level.
    if enable_console:
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(getattr(logging, level.upper()))
        console_handler.setFormatter(simple_formatter)
        root_logger.addHandler(console_handler)

    # File handlers: general rotating log plus an errors-only log.
    # (Merged from two separate `if enable_file:` blocks for clarity.)
    if enable_file:
        file_handler = logging.handlers.RotatingFileHandler(
            filename=log_dir / "transcript-fixer.log",
            maxBytes=10 * 1024 * 1024,  # 10MB
            backupCount=5,
            encoding='utf-8'
        )
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(detailed_formatter)
        root_logger.addHandler(file_handler)

        error_handler = logging.handlers.RotatingFileHandler(
            filename=log_dir / "errors.log",
            maxBytes=10 * 1024 * 1024,  # 10MB
            backupCount=3,
            encoding='utf-8'
        )
        error_handler.setLevel(logging.ERROR)
        error_handler.setFormatter(detailed_formatter)
        root_logger.addHandler(error_handler)

    # Audit handler (separate audit trail on a dedicated logger)
    if enable_audit:
        audit_handler = logging.handlers.RotatingFileHandler(
            filename=log_dir / "audit.log",
            maxBytes=50 * 1024 * 1024,  # 50MB
            backupCount=10,
            encoding='utf-8'
        )
        audit_handler.setLevel(logging.INFO)
        audit_handler.setFormatter(detailed_formatter)
        audit_logger = logging.getLogger('audit')
        audit_logger.setLevel(logging.INFO)
        audit_logger.addHandler(audit_handler)
        audit_logger.propagate = False  # Keep audit entries out of the root log

    # Lazy %-style args avoid building the message when INFO is disabled.
    logging.info("Logging configured: level=%s, log_dir=%s", level, log_dir)
def get_audit_logger() -> logging.Logger:
    """Return the dedicated audit-trail logger (the 'audit' namespace)."""
    # setup_logging() attaches the rotating file handler to this logger.
    audit_logger = logging.getLogger('audit')
    return audit_logger
# Example usage: run this module directly to exercise every log level.
if __name__ == "__main__":
    setup_logging(level="DEBUG")
    logger = logging.getLogger(__name__)
    logger.debug("Debug message")
    logger.info("Info message")
    logger.warning("Warning message")
    logger.error("Error message")
    logger.critical("Critical message")

    audit_logger = get_audit_logger()
    # NOTE(review): the original message read "'错误''正确'" with no separator,
    # which looks like a dropped arrow character; restored as '错误' -> '正确'.
    audit_logger.info("User 'admin' added correction: '错误' -> '正确'")

View File

@@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""
Validation Utility - Configuration Health Checker
SINGLE RESPONSIBILITY: Validate transcript-fixer configuration and JSON files
Features:
- Check directory structure
- Validate JSON syntax in all config files
- Check environment variables
- Report statistics and health status
"""
from __future__ import annotations
import json
import os
import sys
from pathlib import Path
# Handle imports for both standalone and package usage
try:
from core import CorrectionRepository, CorrectionService
except ImportError:
# Fallback for when run from scripts directory directly
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from core import CorrectionRepository, CorrectionService
def validate_configuration() -> tuple[list[str], list[str]]:
    """
    Validate transcript-fixer configuration.

    Checks the config directory, the SQLite corrections database (including
    the expected table schema), and the GLM_API_KEY environment variable,
    printing progress as it goes.

    Returns:
        Tuple of (errors, warnings) as string lists
    """
    config_dir = Path.home() / ".transcript-fixer"
    db_path = config_dir / "corrections.db"
    errors: list[str] = []
    warnings: list[str] = []

    print("🔍 Validating transcript-fixer configuration...\n")

    # Check directory exists; nothing else can be validated without it.
    if not config_dir.exists():
        errors.append(f"Configuration directory not found: {config_dir}")
        # '❌ ' prefix restored to match the ✅/⚠️ convention used below.
        print(f"❌ {errors[-1]}")
        print("\n💡 Run: python fix_transcription.py --init")
        return errors, warnings
    print(f"✅ Configuration directory exists: {config_dir}")

    # Validate SQLite database
    if db_path.exists():
        try:
            repository = CorrectionRepository(db_path)
            service = CorrectionService(repository)
            # Query basic stats
            stats = service.get_statistics()
            print(f"✅ Database valid: {stats['total_corrections']} corrections")
            # Check tables exist.
            # NOTE(review): relies on the repository's private _get_connection();
            # consider exposing a public schema-inspection API instead.
            conn = repository._get_connection()
            cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = [row[0] for row in cursor.fetchall()]
            expected_tables = [
                'corrections', 'context_rules', 'correction_history',
                'correction_changes', 'learned_suggestions', 'suggestion_examples',
                'system_config', 'audit_log'
            ]
            missing_tables = [t for t in expected_tables if t not in tables]
            if missing_tables:
                errors.append(f"Database missing tables: {missing_tables}")
                print(f"❌ {errors[-1]}")
            else:
                print(f"✅ All {len(expected_tables)} tables present")
            service.close()
        except Exception as e:
            # Broad catch is deliberate: any failure here means "invalid DB".
            errors.append(f"Database validation failed: {e}")
            print(f"❌ {errors[-1]}")
    else:
        warnings.append("Database not found (will be created on first use)")
        print(f"⚠️ Database not found: {db_path}")

    # Check API key (only needed for the Stage 2 AI pass, hence a warning)
    api_key = os.getenv("GLM_API_KEY")
    if not api_key:
        warnings.append("GLM_API_KEY environment variable not set")
        print("⚠️ GLM_API_KEY not set (required for Stage 2 AI corrections)")
    else:
        print("✅ GLM_API_KEY is set")

    return errors, warnings
def print_validation_summary(errors: list[str], warnings: list[str]) -> int:
    """
    Print validation summary and return exit code.

    Args:
        errors: Error messages collected by validate_configuration()
        warnings: Warning messages collected by validate_configuration()

    Returns:
        0 if valid, 1 if errors found
    """
    print("\n" + "=" * 60)
    if errors:
        # '❌ ' prefix restored to match the ✅/⚠️ convention used below.
        print(f"❌ {len(errors)} error(s) found:")
        for err in errors:
            print(f" - {err}")
        print("\n💡 Fix errors and run --validate again")
        print("=" * 60)
        return 1
    elif warnings:
        print(f"⚠️ {len(warnings)} warning(s):")
        for warn in warnings:
            print(f" - {warn}")
        print("\n✅ Configuration is valid (with warnings)")
        print("=" * 60)
        return 0
    else:
        print("✅ All checks passed! Configuration is valid.")
        print("=" * 60)
        return 0
def main():
"""Run validation as standalone script"""
errors, warnings = validate_configuration()
exit_code = print_validation_summary(errors, warnings)
sys.exit(exit_code)
if __name__ == "__main__":
main()