feat: Update skill-creator and transcript-fixer
skill-creator v1.2.0 → v1.2.1: - Add critical warning about not editing skills in cache directory - Cache location (~/.claude/plugins/cache/) is read-only - Changes there are lost on cache refresh transcript-fixer v1.0.0 → v1.1.0: - Add Chinese/Japanese/Korean domain name support (火星加速器, 具身智能) - Add [CLAUDE_FALLBACK] signal for Claude Code to take over when GLM unavailable - Add Prerequisites section requiring uv for Python execution - Add Critical Workflow section for dictionary iteration - Add AI Fallback Strategy and Database Operations sections - Add Stages table (Dictionary → AI → Full pipeline) - Add ensure_deps.py script for shared virtual environment - Add database_schema.md and iteration_workflow.md references - Update domain validation from whitelist to pattern matching - Update tests for Chinese domains and security bypass attempts 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -352,6 +352,13 @@ class AIProcessorAsync:
|
||||
exc_info=True
|
||||
)
|
||||
|
||||
# CLAUDE_FALLBACK: Signal Claude Code to take over manual correction
|
||||
print("[CLAUDE_FALLBACK] GLM API unavailable. Claude Code should analyze this text for ASR errors:")
|
||||
print("---")
|
||||
print(chunk[:2000] if len(chunk) > 2000 else chunk)
|
||||
print("---")
|
||||
print("After fixing, MUST save corrections: --add \"错误词\" \"正确词\" --domain general")
|
||||
|
||||
logger.warning(
|
||||
f"Using original text for chunk {chunk_index} after all retries failed",
|
||||
chunk_index=chunk_index
|
||||
|
||||
@@ -32,7 +32,11 @@ class ValidationRules:
|
||||
max_text_length: int = 1000
|
||||
min_text_length: int = 1
|
||||
max_domain_length: int = 50
|
||||
allowed_domain_pattern: str = r'^[a-zA-Z0-9_-]+$'
|
||||
# Support Chinese, Japanese, Korean characters in domain names
|
||||
# \u4e00-\u9fff: CJK Unified Ideographs (Chinese)
|
||||
# \u3040-\u309f: Hiragana, \u30a0-\u30ff: Katakana (Japanese)
|
||||
# \uac00-\ud7af: Hangul Syllables (Korean)
|
||||
allowed_domain_pattern: str = r'^[\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af-]+$'
|
||||
max_confidence: float = 1.0
|
||||
min_confidence: float = 0.0
|
||||
|
||||
|
||||
63
transcript-fixer/scripts/ensure_deps.py
Normal file
63
transcript-fixer/scripts/ensure_deps.py
Normal file
@@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Initialize shared virtual environment for transcript-fixer.
|
||||
|
||||
Handles errors explicitly rather than letting Claude guess (per best practices).
|
||||
Creates a shared venv at ~/.transcript-fixer/.venv that can be reused across
|
||||
different working directories.
|
||||
"""
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
DEPS_DIR = Path.home() / ".transcript-fixer"
|
||||
VENV_DIR = DEPS_DIR / ".venv"
|
||||
REQUIREMENTS = ["httpx[http2]>=0.24.0", "filelock>=3.13.0", "aiofiles>=23.0.0"]
|
||||
|
||||
|
||||
def main():
|
||||
"""Initialize shared dependencies for transcript-fixer."""
|
||||
# Create base directory
|
||||
try:
|
||||
DEPS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
except PermissionError:
|
||||
print(f"❌ Cannot create {DEPS_DIR}. Check permissions.")
|
||||
sys.exit(1)
|
||||
|
||||
# Create virtual environment if not exists
|
||||
if not VENV_DIR.exists():
|
||||
print("🔧 Creating virtual environment...")
|
||||
result = subprocess.run(
|
||||
["uv", "venv", str(VENV_DIR)],
|
||||
capture_output=True,
|
||||
text=True
|
||||
)
|
||||
if result.returncode != 0:
|
||||
print(f"❌ Failed to create venv: {result.stderr}")
|
||||
print(" Install uv first: curl -LsSf https://astral.sh/uv/install.sh | sh")
|
||||
sys.exit(1)
|
||||
else:
|
||||
print(f"✓ Virtual environment exists at {VENV_DIR}")
|
||||
|
||||
# Install dependencies
|
||||
print("📦 Installing dependencies...")
|
||||
result = subprocess.run(
|
||||
["uv", "pip", "install", "--python", str(VENV_DIR / "bin" / "python")]
|
||||
+ REQUIREMENTS,
|
||||
capture_output=True,
|
||||
text=True
|
||||
)
|
||||
if result.returncode != 0:
|
||||
print(f"❌ Failed to install: {result.stderr}")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"✅ Dependencies ready at {VENV_DIR}")
|
||||
print()
|
||||
print("Usage:")
|
||||
print(f" {VENV_DIR}/bin/python scripts/fix_transcription.py --input file.md --stage 3")
|
||||
print()
|
||||
print("Or add alias to ~/.zshrc:")
|
||||
print(f' alias tf="{VENV_DIR}/bin/python scripts/fix_transcription.py"')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -84,6 +84,14 @@ class TestCorrectionService(unittest.TestCase):
|
||||
self.service.validate_domain_name("embodied_ai")
|
||||
self.service.validate_domain_name("test-domain-123")
|
||||
|
||||
def test_validate_chinese_domain(self):
|
||||
"""Test acceptance of Chinese domain names."""
|
||||
# Should not raise - Chinese characters are valid
|
||||
self.service.validate_domain_name("火星加速器")
|
||||
self.service.validate_domain_name("具身智能")
|
||||
self.service.validate_domain_name("中文域名-123")
|
||||
self.service.validate_domain_name("混合domain中文")
|
||||
|
||||
# ==================== Correction Operations Tests ====================
|
||||
|
||||
def test_add_correction(self):
|
||||
|
||||
@@ -40,19 +40,27 @@ from utils.domain_validator import (
|
||||
|
||||
|
||||
class TestDomainValidation:
|
||||
"""Test domain whitelist validation"""
|
||||
"""Test domain pattern validation"""
|
||||
|
||||
def test_valid_domains(self):
|
||||
"""Test all valid domains are accepted"""
|
||||
"""Test predefined domains are accepted"""
|
||||
for domain in VALID_DOMAINS:
|
||||
result = validate_domain(domain)
|
||||
assert result == domain
|
||||
|
||||
def test_case_insensitive(self):
|
||||
"""Test domain validation is case-insensitive"""
|
||||
assert validate_domain("GENERAL") == "general"
|
||||
assert validate_domain("General") == "general"
|
||||
assert validate_domain("embodied_AI") == "embodied_ai"
|
||||
def test_custom_domains(self):
|
||||
"""Test custom domain names are accepted"""
|
||||
assert validate_domain("my_custom_domain") == "my_custom_domain"
|
||||
assert validate_domain("test-domain-123") == "test-domain-123"
|
||||
assert validate_domain("domain1") == "domain1"
|
||||
assert validate_domain("export_test") == "export_test"
|
||||
|
||||
def test_chinese_domains(self):
|
||||
"""Test Chinese domain names are accepted"""
|
||||
assert validate_domain("火星加速器") == "火星加速器"
|
||||
assert validate_domain("具身智能") == "具身智能"
|
||||
assert validate_domain("中文域名") == "中文域名"
|
||||
assert validate_domain("混合domain中文") == "混合domain中文"
|
||||
|
||||
def test_whitespace_trimmed(self):
|
||||
"""Test whitespace is trimmed"""
|
||||
@@ -70,7 +78,7 @@ class TestDomainValidation:
|
||||
]
|
||||
|
||||
for malicious in malicious_inputs:
|
||||
with pytest.raises(ValidationError, match="Invalid domain"):
|
||||
with pytest.raises(ValidationError):
|
||||
validate_domain(malicious)
|
||||
|
||||
def test_empty_domain(self):
|
||||
@@ -81,6 +89,12 @@ class TestDomainValidation:
|
||||
with pytest.raises(ValidationError, match="cannot be empty"):
|
||||
validate_domain(" ")
|
||||
|
||||
def test_domain_too_long(self):
|
||||
"""Test domain length limit"""
|
||||
long_domain = "a" * 51
|
||||
with pytest.raises(ValidationError, match="too long"):
|
||||
validate_domain(long_domain)
|
||||
|
||||
|
||||
class TestSourceValidation:
|
||||
"""Test source whitelist validation"""
|
||||
@@ -163,7 +177,7 @@ class TestCorrectionInputsValidation:
|
||||
|
||||
def test_invalid_domain_in_full_validation(self):
|
||||
"""Test invalid domain is rejected in full validation"""
|
||||
with pytest.raises(ValidationError, match="Invalid domain"):
|
||||
with pytest.raises(ValidationError):
|
||||
validate_correction_inputs(
|
||||
from_text="test",
|
||||
to_text="test",
|
||||
@@ -286,10 +300,10 @@ class TestSecurityScenarios:
|
||||
def test_domain_bypass_attempts(self):
|
||||
"""Test various domain bypass attempts"""
|
||||
bypass_attempts = [
|
||||
"general\x00hacked", # Null byte injection
|
||||
"general\nmalicious", # Newline injection
|
||||
"general -- comment", # SQL comment
|
||||
"general' UNION", # SQL union
|
||||
"general -- comment", # SQL comment (space is invalid)
|
||||
"general' UNION", # SQL union (quote is invalid)
|
||||
"../etc/passwd", # Path traversal
|
||||
]
|
||||
|
||||
for attempt in bypass_attempts:
|
||||
|
||||
@@ -20,8 +20,8 @@ from __future__ import annotations
|
||||
from typing import Final, Set
|
||||
import re
|
||||
|
||||
# Domain whitelist - ONLY these values are allowed
|
||||
VALID_DOMAINS: Final[Set[str]] = {
|
||||
# Predefined domains (for documentation/reference, not enforced as whitelist)
|
||||
PREDEFINED_DOMAINS: Final[Set[str]] = {
|
||||
'general',
|
||||
'embodied_ai',
|
||||
'finance',
|
||||
@@ -30,6 +30,16 @@ VALID_DOMAINS: Final[Set[str]] = {
|
||||
'technical',
|
||||
}
|
||||
|
||||
# Domain validation pattern - supports Chinese, Japanese, Korean characters
|
||||
# \u4e00-\u9fff: CJK Unified Ideographs (Chinese)
|
||||
# \u3040-\u309f: Hiragana, \u30a0-\u30ff: Katakana (Japanese)
|
||||
# \uac00-\ud7af: Hangul Syllables (Korean)
|
||||
DOMAIN_PATTERN: Final[str] = r'^[\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af-]+$'
|
||||
MAX_DOMAIN_LENGTH: Final[int] = 50
|
||||
|
||||
# Keep VALID_DOMAINS as alias for backward compatibility
|
||||
VALID_DOMAINS = PREDEFINED_DOMAINS
|
||||
|
||||
# Source whitelist
|
||||
VALID_SOURCES: Final[Set[str]] = {
|
||||
'manual',
|
||||
@@ -53,42 +63,61 @@ class ValidationError(Exception):
|
||||
|
||||
def validate_domain(domain: str) -> str:
|
||||
"""
|
||||
Validate domain against whitelist.
|
||||
Validate domain name using pattern matching.
|
||||
|
||||
CRITICAL: Prevents SQL injection via domain parameter.
|
||||
Domain is used in WHERE clauses - must be whitelisted.
|
||||
Domain is used in WHERE clauses - must match safe pattern.
|
||||
|
||||
Supports:
|
||||
- Alphanumeric characters (a-z, A-Z, 0-9)
|
||||
- Underscores and hyphens
|
||||
- Chinese, Japanese, Korean characters
|
||||
|
||||
Args:
|
||||
domain: Domain string to validate
|
||||
|
||||
Returns:
|
||||
Validated domain (guaranteed to be in whitelist)
|
||||
Validated domain (guaranteed to match safe pattern)
|
||||
|
||||
Raises:
|
||||
ValidationError: If domain not in whitelist
|
||||
ValidationError: If domain contains invalid characters
|
||||
|
||||
Examples:
|
||||
>>> validate_domain('general')
|
||||
'general'
|
||||
|
||||
>>> validate_domain('火星加速器')
|
||||
'火星加速器'
|
||||
|
||||
>>> validate_domain('hacked"; DROP TABLE corrections--')
|
||||
ValidationError: Invalid domain
|
||||
"""
|
||||
if not domain:
|
||||
raise ValidationError("Domain cannot be empty")
|
||||
|
||||
domain = domain.strip().lower()
|
||||
domain = domain.strip()
|
||||
|
||||
# Check again after stripping (whitespace-only input)
|
||||
if not domain:
|
||||
raise ValidationError("Domain cannot be empty")
|
||||
|
||||
if domain not in VALID_DOMAINS:
|
||||
# Check length
|
||||
if len(domain) > MAX_DOMAIN_LENGTH:
|
||||
raise ValidationError(
|
||||
f"Invalid domain: '{domain}'. "
|
||||
f"Valid domains: {sorted(VALID_DOMAINS)}"
|
||||
f"Domain name too long: {len(domain)} chars (max: {MAX_DOMAIN_LENGTH})"
|
||||
)
|
||||
|
||||
# Check pattern (supports Chinese and other CJK characters)
|
||||
if not re.match(DOMAIN_PATTERN, domain):
|
||||
raise ValidationError(
|
||||
f"Domain name contains invalid characters: {domain}. "
|
||||
f"Allowed: alphanumeric, underscore, hyphen, Chinese/Japanese/Korean characters"
|
||||
)
|
||||
|
||||
# Check for path traversal attempts
|
||||
if '..' in domain or '/' in domain or '\\' in domain:
|
||||
raise ValidationError(f"Domain name contains path traversal: {domain}")
|
||||
|
||||
return domain
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user