feat: Update skill-creator and transcript-fixer

skill-creator v1.2.0 → v1.2.1:
- Add critical warning about not editing skills in cache directory
- Cache location (~/.claude/plugins/cache/) is read-only
- Changes there are lost on cache refresh

transcript-fixer v1.0.0 → v1.1.0:
- Add Chinese/Japanese/Korean domain name support (火星加速器, 具身智能)
- Add [CLAUDE_FALLBACK] signal for Claude Code to take over when GLM unavailable
- Add Prerequisites section requiring uv for Python execution
- Add Critical Workflow section for dictionary iteration
- Add AI Fallback Strategy and Database Operations sections
- Add Stages table (Dictionary → AI → Full pipeline)
- Add ensure_deps.py script for shared virtual environment
- Add database_schema.md and iteration_workflow.md references
- Update domain validation from whitelist to pattern matching
- Update tests for Chinese domains and security bypass attempts

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-11 13:04:27 +08:00
parent 20cc442ec4
commit 1d237fc3be
12 changed files with 556 additions and 27 deletions
--- a/transcript-fixer/scripts/core/ai_processor_async.py
+++ b/transcript-fixer/scripts/core/ai_processor_async.py
@@ -352,6 +352,13 @@ class AIProcessorAsync:
                            exc_info=True
                        )

+                # CLAUDE_FALLBACK: Signal Claude Code to take over manual correction
+                print("[CLAUDE_FALLBACK] GLM API unavailable. Claude Code should analyze this text for ASR errors:")
+                print("---")
+                print(chunk[:2000] if len(chunk) > 2000 else chunk)
+                print("---")
+                print("After fixing, MUST save corrections: --add \"错误词\" \"正确词\" --domain general")
+
                logger.warning(
                    f"Using original text for chunk {chunk_index} after all retries failed",
                    chunk_index=chunk_index
--- a/transcript-fixer/scripts/core/correction_service.py
+++ b/transcript-fixer/scripts/core/correction_service.py
@@ -32,7 +32,11 @@ class ValidationRules:
    max_text_length: int = 1000
    min_text_length: int = 1
    max_domain_length: int = 50
-    allowed_domain_pattern: str = r'^[a-zA-Z0-9_-]+$'
+    # Support Chinese, Japanese, Korean characters in domain names
+    # \u4e00-\u9fff: CJK Unified Ideographs (Chinese)
+    # \u3040-\u309f: Hiragana, \u30a0-\u30ff: Katakana (Japanese)
+    # \uac00-\ud7af: Hangul Syllables (Korean)
+    allowed_domain_pattern: str = r'^[\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af-]+$'
    max_confidence: float = 1.0
    min_confidence: float = 0.0

--- a/transcript-fixer/scripts/ensure_deps.py
+++ b/transcript-fixer/scripts/ensure_deps.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+"""Initialize shared virtual environment for transcript-fixer.
+
+Handles errors explicitly rather than letting Claude guess (per best practices).
+Creates a shared venv at ~/.transcript-fixer/.venv that can be reused across
+different working directories.
+"""
+import subprocess
+import sys
+from pathlib import Path
+
+DEPS_DIR = Path.home() / ".transcript-fixer"
+VENV_DIR = DEPS_DIR / ".venv"
+REQUIREMENTS = ["httpx[http2]>=0.24.0", "filelock>=3.13.0", "aiofiles>=23.0.0"]
+
+
+def _run_uv(args, failure):
+    """Run ``uv`` with *args*; print *failure* and exit(1) if it cannot succeed."""
+    try:
+        result = subprocess.run(["uv", *args], capture_output=True, text=True)
+    except OSError as exc:  # FileNotFoundError when uv is not on PATH
+        print(f"❌ {failure}: {exc}")
+        print("   Install uv first: curl -LsSf https://astral.sh/uv/install.sh | sh")
+        sys.exit(1)
+    if result.returncode != 0:
+        print(f"❌ {failure}: {result.stderr}")
+        sys.exit(1)
+
+
+def main():
+    """Initialize shared dependencies for transcript-fixer.
+
+    Creates DEPS_DIR, creates the venv with ``uv`` when VENV_DIR is absent,
+    then installs REQUIREMENTS into it. Any failure exits with status 1.
+    """
+    try:
+        DEPS_DIR.mkdir(parents=True, exist_ok=True)
+    except OSError as exc:  # PermissionError, or the path exists as a file
+        print(f"❌ Cannot create {DEPS_DIR}. Check permissions. ({exc})")
+        sys.exit(1)
+
+    if VENV_DIR.exists():
+        print(f"✓ Virtual environment exists at {VENV_DIR}")
+    else:
+        print("🔧 Creating virtual environment...")
+        _run_uv(["venv", str(VENV_DIR)], "Failed to create venv")
+
+    # uv puts the interpreter in Scripts\ on Windows and in bin/ elsewhere.
+    python = VENV_DIR / ("Scripts/python.exe" if sys.platform == "win32" else "bin/python")
+    print("📦 Installing dependencies...")
+    _run_uv(["pip", "install", "--python", str(python), *REQUIREMENTS], "Failed to install")
+
+    print(f"✅ Dependencies ready at {VENV_DIR}")
+    print()
+    print("Usage:")
+    print(f"  {python} scripts/fix_transcription.py --input file.md --stage 3")
+    print()
+    print("Or add alias to ~/.zshrc:")
+    print(f'  alias tf="{python} scripts/fix_transcription.py"')
+
+
+if __name__ == "__main__":
+    main()
--- a/transcript-fixer/scripts/tests/test_correction_service.py
+++ b/transcript-fixer/scripts/tests/test_correction_service.py
@@ -84,6 +84,14 @@ class TestCorrectionService(unittest.TestCase):
        self.service.validate_domain_name("embodied_ai")
        self.service.validate_domain_name("test-domain-123")

+    def test_validate_chinese_domain(self):
+        """Chinese and mixed Chinese/ASCII domain names validate without error."""
+        accepted = ("火星加速器", "具身智能", "中文域名-123", "混合domain中文")
+        # No return value is checked: the test passes as long as none of
+        # these calls raises.
+        for name in accepted:
+            self.service.validate_domain_name(name)
+
    # ==================== Correction Operations Tests ====================

    def test_add_correction(self):
--- a/transcript-fixer/scripts/tests/test_domain_validator.py
+++ b/transcript-fixer/scripts/tests/test_domain_validator.py
@@ -40,19 +40,27 @@ from utils.domain_validator import (


 class TestDomainValidation:
-    """Test domain whitelist validation"""
+    """Test domain pattern validation"""

    def test_valid_domains(self):
-        """Test all valid domains are accepted"""
+        """Test predefined domains are accepted"""
        for domain in VALID_DOMAINS:
            result = validate_domain(domain)
            assert result == domain

-    def test_case_insensitive(self):
-        """Test domain validation is case-insensitive"""
-        assert validate_domain("GENERAL") == "general"
-        assert validate_domain("General") == "general"
-        assert validate_domain("embodied_AI") == "embodied_ai"
+    def test_custom_domains(self):
+        """Custom (non-predefined) domain names are returned unchanged."""
+        custom_names = ("my_custom_domain", "test-domain-123", "domain1", "export_test")
+        for name in custom_names:
+            # A valid name must round-trip through the validator untouched.
+            assert validate_domain(name) == name
+
+    def test_chinese_domains(self):
+        """Chinese and mixed Chinese/ASCII names are returned unchanged."""
+        chinese_names = ("火星加速器", "具身智能", "中文域名", "混合domain中文")
+        for name in chinese_names:
+            # Validation must not alter or normalise the CJK text.
+            assert validate_domain(name) == name

    def test_whitespace_trimmed(self):
        """Test whitespace is trimmed"""
@@ -70,7 +78,7 @@ class TestDomainValidation:
        ]

        for malicious in malicious_inputs:
-            with pytest.raises(ValidationError, match="Invalid domain"):
+            with pytest.raises(ValidationError):
                validate_domain(malicious)

    def test_empty_domain(self):
@@ -81,6 +89,12 @@ class TestDomainValidation:
        with pytest.raises(ValidationError, match="cannot be empty"):
            validate_domain("   ")

+    def test_domain_too_long(self):
+        """A name one character over the 50-character limit is rejected."""
+        # 51 is MAX_DOMAIN_LENGTH (50) + 1: the shortest length that must fail.
+        with pytest.raises(ValidationError, match="too long"):
+            validate_domain("a" * 51)
+

 class TestSourceValidation:
    """Test source whitelist validation"""
@@ -163,7 +177,7 @@ class TestCorrectionInputsValidation:

    def test_invalid_domain_in_full_validation(self):
        """Test invalid domain is rejected in full validation"""
-        with pytest.raises(ValidationError, match="Invalid domain"):
+        with pytest.raises(ValidationError):
            validate_correction_inputs(
                from_text="test",
                to_text="test",
@@ -286,10 +300,10 @@ class TestSecurityScenarios:
    def test_domain_bypass_attempts(self):
        """Test various domain bypass attempts"""
        bypass_attempts = [
-            "general\x00hacked",     # Null byte injection
            "general\nmalicious",    # Newline injection
-            "general -- comment",    # SQL comment
-            "general' UNION",        # SQL union
+            "general -- comment",    # SQL comment (space is invalid)
+            "general' UNION",        # SQL union (quote is invalid)
+            "../etc/passwd",         # Path traversal
        ]

        for attempt in bypass_attempts:
--- a/transcript-fixer/scripts/utils/domain_validator.py
+++ b/transcript-fixer/scripts/utils/domain_validator.py
@@ -20,8 +20,8 @@ from __future__ import annotations
 from typing import Final, Set
 import re

-# Domain whitelist - ONLY these values are allowed
-VALID_DOMAINS: Final[Set[str]] = {
+# Predefined domains (for documentation/reference, not enforced as whitelist)
+PREDEFINED_DOMAINS: Final[Set[str]] = {
    'general',
    'embodied_ai',
    'finance',
@@ -30,6 +30,16 @@ VALID_DOMAINS: Final[Set[str]] = {
    'technical',
 }

+# Domain validation pattern - supports Chinese, Japanese, Korean characters
+# \u4e00-\u9fff: CJK Unified Ideographs (Chinese)
+# \u3040-\u309f: Hiragana, \u30a0-\u30ff: Katakana (Japanese)
+# \uac00-\ud7af: Hangul Syllables (Korean)
+DOMAIN_PATTERN: Final[str] = r'^[\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af-]+$'
+MAX_DOMAIN_LENGTH: Final[int] = 50
+
+# Keep VALID_DOMAINS as alias for backward compatibility
+VALID_DOMAINS = PREDEFINED_DOMAINS
+
 # Source whitelist
 VALID_SOURCES: Final[Set[str]] = {
    'manual',
@@ -53,42 +63,61 @@ class ValidationError(Exception):

 def validate_domain(domain: str) -> str:
    """
-    Validate domain against whitelist.
+    Validate domain name using pattern matching.

    CRITICAL: Prevents SQL injection via domain parameter.
-    Domain is used in WHERE clauses - must be whitelisted.
+    Domain is used in WHERE clauses - must match safe pattern.
+
+    Supports:
+    - Word characters of any script: letters, digits, underscore (Unicode-aware)
+    - Hyphens
+    - Chinese, Japanese, Korean characters (explicit ranges in DOMAIN_PATTERN)

    Args:
        domain: Domain string to validate

    Returns:
-        Validated domain (guaranteed to be in whitelist)
+        Stripped domain, case preserved (guaranteed to match safe pattern)

    Raises:
-        ValidationError: If domain not in whitelist
+        ValidationError: If domain is empty, too long, or has invalid characters

    Examples:
        >>> validate_domain('general')
        'general'

+        >>> validate_domain('火星加速器')
+        '火星加速器'
+
        >>> validate_domain('hacked"; DROP TABLE corrections--')
-        ValidationError: Invalid domain
+        ValidationError: Domain name contains invalid characters: ...
    """
    if not domain:
        raise ValidationError("Domain cannot be empty")

-    domain = domain.strip().lower()
+    domain = domain.strip()

    # Check again after stripping (whitespace-only input)
    if not domain:
        raise ValidationError("Domain cannot be empty")

-    if domain not in VALID_DOMAINS:
+    # Check length
+    if len(domain) > MAX_DOMAIN_LENGTH:
        raise ValidationError(
-            f"Invalid domain: '{domain}'. "
-            f"Valid domains: {sorted(VALID_DOMAINS)}"
+            f"Domain name too long: {len(domain)} chars (max: {MAX_DOMAIN_LENGTH})"
        )

+    # fullmatch anchors both ends itself; '$' alone would tolerate a trailing \n
+    if not re.fullmatch(DOMAIN_PATTERN, domain):
+        raise ValidationError(
+            f"Domain name contains invalid characters: {domain}. "
+            f"Allowed: alphanumeric, underscore, hyphen, Chinese/Japanese/Korean characters"
+        )
+
+    # Defense in depth: DOMAIN_PATTERN already rejects '.', '/' and backslash
+    if '..' in domain or '/' in domain or '\\' in domain:
+        raise ValidationError(f"Domain name contains path traversal: {domain}")
+
    return domain