diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 91b9088..30c17a4 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -15,7 +15,7 @@ "description": "Essential meta-skill for creating effective Claude Code skills with initialization scripts, validation, packaging, marketplace registration, and privacy best practices", "source": "./", "strict": false, - "version": "1.2.0", + "version": "1.2.1", "category": "developer-tools", "keywords": ["skill-creation", "claude-code", "development", "tooling", "workflow", "meta-skill", "essential"], "skills": ["./skill-creator"] @@ -152,10 +152,10 @@ }, { "name": "transcript-fixer", - "description": "Corrects speech-to-text (ASR/STT) transcription errors in meeting notes, lecture recordings, interviews, and voice memos through dictionary-based rules and AI corrections. Use when users mention transcript correction, ASR errors, speech-to-text mistakes, homophone errors, or working with transcription files", + "description": "Corrects speech-to-text (ASR/STT) transcription errors in meeting notes, lecture recordings, interviews, and voice memos through dictionary-based rules and AI corrections. Supports Chinese domain names, AI fallback to Claude Code, and iterative dictionary building. 
Use when users mention transcript correction, ASR errors, speech-to-text mistakes, homophone errors, or working with transcription files", "source": "./", "strict": false, - "version": "1.0.0", + "version": "1.1.0", "category": "productivity", "keywords": ["transcription", "asr", "stt", "speech-to-text", "correction", "ai", "meeting-notes", "nlp"], "skills": ["./transcript-fixer"] diff --git a/CHANGELOG.md b/CHANGELOG.md index d4ed6c7..aea0706 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -58,6 +58,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added corresponding use case sections to README.zh-CN.md - Added installation commands for both new skills - Added quick links for documentation references +- **skill-creator** v1.2.0 → v1.2.1: Added cache directory warning + - Added critical warning about not editing skills in `~/.claude/plugins/cache/` + - Explains that cache is read-only and changes are lost on refresh + - Provides correct vs wrong path examples +- **transcript-fixer** v1.0.0 → v1.1.0: Enhanced with Chinese domain support and AI fallback + - Added Chinese/Japanese/Korean character support for domain names (e.g., `火星加速器`, `具身智能`) + - Added `[CLAUDE_FALLBACK]` signal when GLM API is unavailable for Claude Code to take over + - Added Prerequisites section requiring `uv` for Python execution + - Added Critical Workflow section for dictionary iteration best practices + - Added AI Fallback Strategy section with manual correction guidance + - Added Database Operations section with schema reference requirement + - Added Stages table for quick reference (Dictionary → AI → Full pipeline) + - Added new bundled script: `ensure_deps.py` for shared virtual environment + - Added new bundled references: `database_schema.md`, `iteration_workflow.md` + - Updated domain validation from whitelist to pattern matching + - Updated tests for Chinese domain names and security bypass attempts ## [youtube-downloader-1.1.0] - 2025-11-19 
diff --git a/skill-creator/SKILL.md b/skill-creator/SKILL.md index b8ab8c0..e033fdd 100644 --- a/skill-creator/SKILL.md +++ b/skill-creator/SKILL.md @@ -110,6 +110,24 @@ Skills use a three-level loading system to manage context efficiently: Anthropic has wrote skill authoring best practices, you SHOULD retrieve it before you create or update any skills, the link is https://docs.claude.com/en/docs/agents-and-tools/agent-skills/best-practices.md +## ⚠️ CRITICAL: Edit Skills at Source Location + +**NEVER edit skills in `~/.claude/plugins/cache/`** — that's a read-only cache directory. All changes there are: +- Lost when cache refreshes +- Not synced to source control +- Wasted effort requiring manual re-merge + +**ALWAYS verify you're editing the source repository:** +```bash +# WRONG - cache location (read-only copy) +~/.claude/plugins/cache/daymade-skills/my-skill/1.0.0/my-skill/SKILL.md + +# RIGHT - source repository +/path/to/your/claude-code-skills/my-skill/SKILL.md +``` + +**Before any edit**, confirm the file path does NOT contain `/cache/` or `/plugins/cache/`. + ## Skill Creation Process To create a skill, follow the "Skill Creation Process" in order, skipping steps only if there is a clear reason why they are not applicable. diff --git a/transcript-fixer/SKILL.md b/transcript-fixer/SKILL.md index 21da251..13984da 100644 --- a/transcript-fixer/SKILL.md +++ b/transcript-fixer/SKILL.md @@ -14,6 +14,19 @@ Correct speech-to-text transcription errors through dictionary-based rules, AI-p - Fixing Chinese/English homophone errors or technical terminology - Collaborating on shared correction knowledge bases +## Prerequisites + +**Python execution must use `uv`** - never use system Python directly. 
+ +If `uv` is not installed: +```bash +# macOS/Linux +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Windows PowerShell +powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex" +``` + ## Quick Start **Recommended: Use Enhanced Wrapper** (auto-detects API key, opens HTML diff): @@ -88,19 +101,62 @@ Three-stage pipeline stores corrections in `~/.transcript-fixer/corrections.db`: 4. **Review learned patterns**: `--review-learned` and `--approve` high-confidence suggestions **Stages**: Dictionary (instant, free) → AI via GLM API (parallel) → Full pipeline -**Domains**: `general`, `embodied_ai`, `finance`, `medical` (isolates corrections) +**Domains**: `general`, `embodied_ai`, `finance`, `medical`, or custom names including Chinese (e.g., `火星加速器`, `具身智能`) **Learning**: Patterns appearing ≥3 times at ≥80% confidence move from AI to dictionary See `references/workflow_guide.md` for detailed workflows, `references/script_parameters.md` for complete CLI reference, and `references/team_collaboration.md` for collaboration patterns. +## Critical Workflow: Dictionary Iteration + +**MUST save corrections after each fix.** This is the skill's core value. + +After fixing errors manually, immediately save to dictionary: +```bash +uv run scripts/fix_transcription.py --add "错误词" "正确词" --domain general +``` + +See `references/iteration_workflow.md` for complete iteration guide with checklist. + +## AI Fallback Strategy + +When GLM API is unavailable (503, network issues), the script outputs `[CLAUDE_FALLBACK]` marker. + +Claude Code should then: +1. Analyze the text directly for ASR errors +2. Fix using Edit tool +3. 
**MUST save corrections to dictionary** with `--add` + +## Database Operations + +**MUST read `references/database_schema.md` before any database operations.** + +Quick reference: +```bash +# View all corrections +sqlite3 ~/.transcript-fixer/corrections.db "SELECT * FROM active_corrections;" + +# Check schema version +sqlite3 ~/.transcript-fixer/corrections.db "SELECT value FROM system_config WHERE key='schema_version';" +``` + +## Stages + +| Stage | Description | Speed | Cost | +|-------|-------------|-------|------| +| 1 | Dictionary only | Instant | Free | +| 2 | AI only | ~10s | API calls | +| 3 | Full pipeline | ~10s | API calls | + ## Bundled Resources **Scripts:** +- `ensure_deps.py` - Initialize shared virtual environment (run once, optional) - `fix_transcript_enhanced.py` - Enhanced wrapper (recommended for interactive use) - `fix_transcription.py` - Core CLI (for automation) - `examples/bulk_import.py` - Bulk import example **References** (load as needed): +- **Critical**: `database_schema.md` (read before DB operations), `iteration_workflow.md` (dictionary iteration best practices) - Getting started: `installation_setup.md`, `glm_api_setup.md`, `workflow_guide.md` - Daily use: `quick_reference.md`, `script_parameters.md`, `dictionary_guide.md` - Advanced: `sql_queries.md`, `file_formats.md`, `architecture.md`, `best_practices.md` diff --git a/transcript-fixer/references/database_schema.md b/transcript-fixer/references/database_schema.md new file mode 100644 index 0000000..55873fc --- /dev/null +++ b/transcript-fixer/references/database_schema.md @@ -0,0 +1,190 @@ +# Database Schema Reference + +**MUST read this before any database operations.** + +Database location: `~/.transcript-fixer/corrections.db` + +## Core Tables + +### corrections + +Main storage for correction mappings. 
+ +| Column | Type | Description | +|--------|------|-------------| +| id | INTEGER | Primary key | +| from_text | TEXT | Error text to match (NOT NULL) | +| to_text | TEXT | Correct replacement (NOT NULL) | +| domain | TEXT | Domain: general, embodied_ai, finance, medical | +| source | TEXT | 'manual', 'learned', 'imported' | +| confidence | REAL | 0.0-1.0 | +| added_by | TEXT | Username | +| added_at | TIMESTAMP | Creation time | +| usage_count | INTEGER | Times this correction was applied | +| last_used | TIMESTAMP | Last time used | +| notes | TEXT | Optional notes | +| is_active | BOOLEAN | Active flag (1=active, 0=disabled) | + +**Constraint**: `UNIQUE(from_text, domain)` + +### context_rules + +Regex-based context-aware correction rules. + +| Column | Type | Description | +|--------|------|-------------| +| id | INTEGER | Primary key | +| pattern | TEXT | Regex pattern (UNIQUE) | +| replacement | TEXT | Replacement text | +| description | TEXT | Rule description | +| priority | INTEGER | Higher = processed first | +| is_active | BOOLEAN | Active flag | + +### learned_suggestions + +AI-learned patterns pending user review. + +| Column | Type | Description | +|--------|------|-------------| +| id | INTEGER | Primary key | +| from_text | TEXT | Detected error | +| to_text | TEXT | Suggested correction | +| domain | TEXT | Domain | +| frequency | INTEGER | Occurrence count (≥1) | +| confidence | REAL | AI confidence (0.0-1.0) | +| first_seen | TIMESTAMP | First occurrence | +| last_seen | TIMESTAMP | Last occurrence | +| status | TEXT | 'pending', 'approved', 'rejected' | +| reviewed_at | TIMESTAMP | Review time | +| reviewed_by | TEXT | Reviewer | + +**Constraint**: `UNIQUE(from_text, to_text, domain)` + +### correction_history + +Audit log for all correction runs. 
+ +| Column | Type | Description | +|--------|------|-------------| +| id | INTEGER | Primary key | +| filename | TEXT | Input file name | +| domain | TEXT | Domain used | +| run_timestamp | TIMESTAMP | When run | +| original_length | INTEGER | Original text length | +| stage1_changes | INTEGER | Dictionary changes count | +| stage2_changes | INTEGER | AI changes count | +| model | TEXT | AI model used | +| execution_time_ms | INTEGER | Processing time | +| success | BOOLEAN | Success flag | +| error_message | TEXT | Error if failed | + +### correction_changes + +Detailed changes made in each correction run. + +| Column | Type | Description | +|--------|------|-------------| +| id | INTEGER | Primary key | +| history_id | INTEGER | FK → correction_history.id | +| line_number | INTEGER | Line where change occurred | +| from_text | TEXT | Original text | +| to_text | TEXT | Corrected text | +| rule_type | TEXT | 'context', 'dictionary', 'ai' | +| rule_id | INTEGER | Reference to rule used | +| context_before | TEXT | Text before change | +| context_after | TEXT | Text after change | + +### system_config + +Key-value configuration store. + +| Column | Type | Description | +|--------|------|-------------| +| key | TEXT | Config key (PRIMARY KEY) | +| value | TEXT | Config value | +| value_type | TEXT | 'string', 'int', 'float', 'boolean', 'json' | +| description | TEXT | What this config does | +| updated_at | TIMESTAMP | Last update | + +**Default configs**: +- `schema_version`: '2.0' +- `api_model`: 'GLM-4.6' +- `learning_frequency_threshold`: 3 +- `learning_confidence_threshold`: 0.8 +- `history_retention_days`: 90 + +### audit_log + +Comprehensive operations trail. 
+ +| Column | Type | Description | +|--------|------|-------------| +| id | INTEGER | Primary key | +| timestamp | TIMESTAMP | When occurred | +| action | TEXT | Action type | +| entity_type | TEXT | Table affected | +| entity_id | INTEGER | Row ID | +| user | TEXT | Who did it | +| details | TEXT | JSON details | +| success | BOOLEAN | Success flag | +| error_message | TEXT | Error if failed | + +## Views + +### active_corrections + +Active corrections only, ordered by domain and from_text. + +```sql +SELECT * FROM active_corrections; +``` + +### pending_suggestions + +Suggestions awaiting review, with example count. + +```sql +SELECT * FROM pending_suggestions WHERE confidence > 0.8; +``` + +### correction_statistics + +Statistics per domain. + +```sql +SELECT * FROM correction_statistics; +``` + +## Common Queries + +```sql +-- List all active corrections +SELECT from_text, to_text, domain FROM active_corrections; + +-- Check pending high-confidence suggestions +SELECT * FROM pending_suggestions WHERE confidence > 0.8 ORDER BY frequency DESC; + +-- Domain statistics +SELECT domain, total_corrections, total_usage FROM correction_statistics; + +-- Recent correction history +SELECT filename, stage1_changes, stage2_changes, run_timestamp +FROM correction_history +ORDER BY run_timestamp DESC LIMIT 10; + +-- Add new correction (use CLI instead for safety) +INSERT INTO corrections (from_text, to_text, domain, source, confidence, added_by) +VALUES ('错误词', '正确词', 'general', 'manual', 1.0, 'user'); + +-- Disable a correction +UPDATE corrections SET is_active = 0 WHERE id = ?; +``` + +## Schema Version + +Check current version: +```sql +SELECT value FROM system_config WHERE key = 'schema_version'; +``` + +For complete schema including indexes and constraints, see `scripts/core/schema.sql`. 
diff --git a/transcript-fixer/references/iteration_workflow.md b/transcript-fixer/references/iteration_workflow.md new file mode 100644 index 0000000..ba94a46 --- /dev/null +++ b/transcript-fixer/references/iteration_workflow.md @@ -0,0 +1,124 @@ +# Dictionary Iteration Workflow + +The core value of transcript-fixer is building a personalized correction dictionary that improves over time. + +## The Core Loop + +``` +┌─────────────────────────────────────────────────┐ +│ 1. Fix transcript (manual or Stage 3) │ +│ ↓ │ +│ 2. Identify new ASR errors during fixing │ +│ ↓ │ +│ 3. IMMEDIATELY save to dictionary │ +│ ↓ │ +│ 4. Next time: Stage 1 auto-corrects these │ +└─────────────────────────────────────────────────┘ +``` + +**Key principle**: Every correction you make should be saved to the dictionary. This transforms one-time work into permanent value. + +## Workflow Checklist + +Copy this checklist when correcting transcripts: + +``` +Correction Progress: +- [ ] Run correction: --input file.md --stage 3 +- [ ] Review output file for remaining ASR errors +- [ ] Fix errors manually with Edit tool +- [ ] Save EACH correction to dictionary with --add +- [ ] Verify with --list that corrections were saved +- [ ] Next time: Stage 1 handles these automatically +``` + +## Save Corrections Immediately + +After fixing any transcript, save corrections: + +```bash +# Single correction +uv run scripts/fix_transcription.py --add "错误词" "正确词" --domain general + +# Multiple corrections - run command for each +uv run scripts/fix_transcription.py --add "片片总" "翩翩总" --domain general +uv run scripts/fix_transcription.py --add "姐弟" "结业" --domain general +uv run scripts/fix_transcription.py --add "自杀性" "自嗨性" --domain general +uv run scripts/fix_transcription.py --add "被看" "被砍" --domain general +uv run scripts/fix_transcription.py --add "单反过" "单访过" --domain general +``` + +## Verify Dictionary + +Always verify corrections were saved: + +```bash +# List all corrections in current domain +uv run 
scripts/fix_transcription.py --list + +# Direct database query +sqlite3 ~/.transcript-fixer/corrections.db \ + "SELECT from_text, to_text, domain FROM active_corrections ORDER BY added_at DESC LIMIT 10;" +``` + +## Domain Selection + +Choose the right domain for corrections: + +| Domain | Use Case | +|--------|----------| +| `general` | Common ASR errors, names, general vocabulary | +| `embodied_ai` | 具身智能、机器人、AI 相关术语 | +| `finance` | 财务、投资、金融术语 | +| `medical` | 医疗、健康相关术语 | +| `火星加速器` | Custom Chinese domain name (any valid name works) | + +```bash +# Domain-specific correction +uv run scripts/fix_transcription.py --add "股价系统" "框架系统" --domain embodied_ai +uv run scripts/fix_transcription.py --add "片片总" "翩翩总" --domain 火星加速器 +``` + +## Common ASR Error Patterns + +Build your dictionary with these common patterns: + +| Type | Examples | +|------|----------| +| **Homophones** | 赢→营, 减→剪, 被看→被砍, 营业→营的 | +| **Names** | 片片→翩翩, 亮亮→亮哥 | +| **Technical** | 巨升智能→具身智能, 股价→框架 | +| **English** | log→vlog | +| **Broken words** | 姐弟→结业, 单反→单访 | + +## When GLM API Fails + +If you see `[CLAUDE_FALLBACK]` output, the GLM API is unavailable. + +Steps: +1. Claude Code should analyze the text directly for ASR errors +2. Fix using Edit tool +3. **MUST save corrections to dictionary** - this is critical +4. Dictionary corrections work even without AI + +## Auto-Learning Feature + +After running Stage 3 multiple times: + +```bash +# Check learned patterns +uv run scripts/fix_transcription.py --review-learned + +# Approve high-confidence patterns +uv run scripts/fix_transcription.py --approve "错误词" "正确词" +``` + +Patterns appearing ≥3 times at ≥80% confidence are suggested for review. + +## Best Practices + +1. **Save immediately**: Don't batch corrections - save each one right after fixing +2. **Be specific**: Use exact phrases, not partial words +3. **Use domains**: Organize corrections by topic for better precision +4. **Verify**: Always run --list to confirm saves +5. 
**Review suggestions**: Periodically check --review-learned for auto-detected patterns diff --git a/transcript-fixer/scripts/core/ai_processor_async.py b/transcript-fixer/scripts/core/ai_processor_async.py index ef715ce..d05f2d7 100644 --- a/transcript-fixer/scripts/core/ai_processor_async.py +++ b/transcript-fixer/scripts/core/ai_processor_async.py @@ -352,6 +352,13 @@ class AIProcessorAsync: exc_info=True ) + # CLAUDE_FALLBACK: Signal Claude Code to take over manual correction + print("[CLAUDE_FALLBACK] GLM API unavailable. Claude Code should analyze this text for ASR errors:") + print("---") + print(chunk[:2000] if len(chunk) > 2000 else chunk) + print("---") + print("After fixing, MUST save corrections: --add \"错误词\" \"正确词\" --domain general") + logger.warning( f"Using original text for chunk {chunk_index} after all retries failed", chunk_index=chunk_index diff --git a/transcript-fixer/scripts/core/correction_service.py b/transcript-fixer/scripts/core/correction_service.py index 2b0e4e6..4d53c50 100644 --- a/transcript-fixer/scripts/core/correction_service.py +++ b/transcript-fixer/scripts/core/correction_service.py @@ -32,7 +32,11 @@ class ValidationRules: max_text_length: int = 1000 min_text_length: int = 1 max_domain_length: int = 50 - allowed_domain_pattern: str = r'^[a-zA-Z0-9_-]+$' + # Support Chinese, Japanese, Korean characters in domain names + # \u4e00-\u9fff: CJK Unified Ideographs (Chinese) + # \u3040-\u309f: Hiragana, \u30a0-\u30ff: Katakana (Japanese) + # \uac00-\ud7af: Hangul Syllables (Korean) + allowed_domain_pattern: str = r'^[\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af-]+$' max_confidence: float = 1.0 min_confidence: float = 0.0 diff --git a/transcript-fixer/scripts/ensure_deps.py b/transcript-fixer/scripts/ensure_deps.py new file mode 100644 index 0000000..36d22d4 --- /dev/null +++ b/transcript-fixer/scripts/ensure_deps.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +"""Initialize shared virtual environment for transcript-fixer. 
+ +Handles errors explicitly rather than letting Claude guess (per best practices). +Creates a shared venv at ~/.transcript-fixer/.venv that can be reused across +different working directories. +""" +import subprocess +import sys +from pathlib import Path + +DEPS_DIR = Path.home() / ".transcript-fixer" +VENV_DIR = DEPS_DIR / ".venv" +REQUIREMENTS = ["httpx[http2]>=0.24.0", "filelock>=3.13.0", "aiofiles>=23.0.0"] + + +def main(): + """Initialize shared dependencies for transcript-fixer.""" + # Create base directory + try: + DEPS_DIR.mkdir(parents=True, exist_ok=True) + except PermissionError: + print(f"❌ Cannot create {DEPS_DIR}. Check permissions.") + sys.exit(1) + + # Create virtual environment if not exists + if not VENV_DIR.exists(): + print("🔧 Creating virtual environment...") + result = subprocess.run( + ["uv", "venv", str(VENV_DIR)], + capture_output=True, + text=True + ) + if result.returncode != 0: + print(f"❌ Failed to create venv: {result.stderr}") + print(" Install uv first: curl -LsSf https://astral.sh/uv/install.sh | sh") + sys.exit(1) + else: + print(f"✓ Virtual environment exists at {VENV_DIR}") + + # Install dependencies + print("📦 Installing dependencies...") + result = subprocess.run( + ["uv", "pip", "install", "--python", str(VENV_DIR / "bin" / "python")] + + REQUIREMENTS, + capture_output=True, + text=True + ) + if result.returncode != 0: + print(f"❌ Failed to install: {result.stderr}") + sys.exit(1) + + print(f"✅ Dependencies ready at {VENV_DIR}") + print() + print("Usage:") + print(f" {VENV_DIR}/bin/python scripts/fix_transcription.py --input file.md --stage 3") + print() + print("Or add alias to ~/.zshrc:") + print(f' alias tf="{VENV_DIR}/bin/python scripts/fix_transcription.py"') + + +if __name__ == "__main__": + main() diff --git a/transcript-fixer/scripts/tests/test_correction_service.py b/transcript-fixer/scripts/tests/test_correction_service.py index 76d0e16..d81963d 100644 --- a/transcript-fixer/scripts/tests/test_correction_service.py 
+++ b/transcript-fixer/scripts/tests/test_correction_service.py @@ -84,6 +84,14 @@ class TestCorrectionService(unittest.TestCase): self.service.validate_domain_name("embodied_ai") self.service.validate_domain_name("test-domain-123") + def test_validate_chinese_domain(self): + """Test acceptance of Chinese domain names.""" + # Should not raise - Chinese characters are valid + self.service.validate_domain_name("火星加速器") + self.service.validate_domain_name("具身智能") + self.service.validate_domain_name("中文域名-123") + self.service.validate_domain_name("混合domain中文") + # ==================== Correction Operations Tests ==================== def test_add_correction(self): diff --git a/transcript-fixer/scripts/tests/test_domain_validator.py b/transcript-fixer/scripts/tests/test_domain_validator.py index 614409a..aacd2cb 100644 --- a/transcript-fixer/scripts/tests/test_domain_validator.py +++ b/transcript-fixer/scripts/tests/test_domain_validator.py @@ -40,19 +40,27 @@ from utils.domain_validator import ( class TestDomainValidation: - """Test domain whitelist validation""" + """Test domain pattern validation""" def test_valid_domains(self): - """Test all valid domains are accepted""" + """Test predefined domains are accepted""" for domain in VALID_DOMAINS: result = validate_domain(domain) assert result == domain - def test_case_insensitive(self): - """Test domain validation is case-insensitive""" - assert validate_domain("GENERAL") == "general" - assert validate_domain("General") == "general" - assert validate_domain("embodied_AI") == "embodied_ai" + def test_custom_domains(self): + """Test custom domain names are accepted""" + assert validate_domain("my_custom_domain") == "my_custom_domain" + assert validate_domain("test-domain-123") == "test-domain-123" + assert validate_domain("domain1") == "domain1" + assert validate_domain("export_test") == "export_test" + + def test_chinese_domains(self): + """Test Chinese domain names are accepted""" + assert validate_domain("火星加速器") == 
"火星加速器" + assert validate_domain("具身智能") == "具身智能" + assert validate_domain("中文域名") == "中文域名" + assert validate_domain("混合domain中文") == "混合domain中文" def test_whitespace_trimmed(self): """Test whitespace is trimmed""" @@ -70,7 +78,7 @@ class TestDomainValidation: ] for malicious in malicious_inputs: - with pytest.raises(ValidationError, match="Invalid domain"): + with pytest.raises(ValidationError): validate_domain(malicious) def test_empty_domain(self): @@ -81,6 +89,12 @@ class TestDomainValidation: with pytest.raises(ValidationError, match="cannot be empty"): validate_domain(" ") + def test_domain_too_long(self): + """Test domain length limit""" + long_domain = "a" * 51 + with pytest.raises(ValidationError, match="too long"): + validate_domain(long_domain) + class TestSourceValidation: """Test source whitelist validation""" @@ -163,7 +177,7 @@ class TestCorrectionInputsValidation: def test_invalid_domain_in_full_validation(self): """Test invalid domain is rejected in full validation""" - with pytest.raises(ValidationError, match="Invalid domain"): + with pytest.raises(ValidationError): validate_correction_inputs( from_text="test", to_text="test", @@ -286,10 +300,10 @@ class TestSecurityScenarios: def test_domain_bypass_attempts(self): """Test various domain bypass attempts""" bypass_attempts = [ - "general\x00hacked", # Null byte injection "general\nmalicious", # Newline injection - "general -- comment", # SQL comment - "general' UNION", # SQL union + "general -- comment", # SQL comment (space is invalid) + "general' UNION", # SQL union (quote is invalid) + "../etc/passwd", # Path traversal ] for attempt in bypass_attempts: diff --git a/transcript-fixer/scripts/utils/domain_validator.py b/transcript-fixer/scripts/utils/domain_validator.py index a643d5d..ae66711 100644 --- a/transcript-fixer/scripts/utils/domain_validator.py +++ b/transcript-fixer/scripts/utils/domain_validator.py @@ -20,8 +20,8 @@ from __future__ import annotations from typing import Final, Set 
import re -# Domain whitelist - ONLY these values are allowed -VALID_DOMAINS: Final[Set[str]] = { +# Predefined domains (for documentation/reference, not enforced as whitelist) +PREDEFINED_DOMAINS: Final[Set[str]] = { 'general', 'embodied_ai', 'finance', @@ -30,6 +30,16 @@ VALID_DOMAINS: Final[Set[str]] = { 'technical', } +# Domain validation pattern - supports Chinese, Japanese, Korean characters +# \u4e00-\u9fff: CJK Unified Ideographs (Chinese) +# \u3040-\u309f: Hiragana, \u30a0-\u30ff: Katakana (Japanese) +# \uac00-\ud7af: Hangul Syllables (Korean) +DOMAIN_PATTERN: Final[str] = r'^[\w\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af-]+$' +MAX_DOMAIN_LENGTH: Final[int] = 50 + +# Keep VALID_DOMAINS as alias for backward compatibility +VALID_DOMAINS = PREDEFINED_DOMAINS + # Source whitelist VALID_SOURCES: Final[Set[str]] = { 'manual', @@ -53,42 +63,61 @@ class ValidationError(Exception): def validate_domain(domain: str) -> str: """ - Validate domain against whitelist. + Validate domain name using pattern matching. CRITICAL: Prevents SQL injection via domain parameter. - Domain is used in WHERE clauses - must be whitelisted. + Domain is used in WHERE clauses - must match safe pattern. 
+
+    Supports:
+    - Alphanumeric characters (a-z, A-Z, 0-9)
+    - Underscores and hyphens
+    - Chinese, Japanese, Korean characters
 
     Args:
         domain: Domain string to validate
 
     Returns:
-        Validated domain (guaranteed to be in whitelist)
+        Validated domain (guaranteed to match safe pattern)
 
     Raises:
-        ValidationError: If domain not in whitelist
+        ValidationError: If domain contains invalid characters
 
     Examples:
         >>> validate_domain('general')
         'general'
+        >>> validate_domain('火星加速器')
+        '火星加速器'
+
         >>> validate_domain('hacked"; DROP TABLE corrections--')
         ValidationError: Invalid domain
     """
     if not domain:
         raise ValidationError("Domain cannot be empty")
 
-    domain = domain.strip().lower()
+    domain = domain.strip()
 
     # Check again after stripping (whitespace-only input)
     if not domain:
         raise ValidationError("Domain cannot be empty")
 
-    if domain not in VALID_DOMAINS:
+    # Check length
+    if len(domain) > MAX_DOMAIN_LENGTH:
         raise ValidationError(
-            f"Invalid domain: '{domain}'. "
-            f"Valid domains: {sorted(VALID_DOMAINS)}"
+            f"Domain name too long: {len(domain)} chars (max: {MAX_DOMAIN_LENGTH})"
         )
 
+    # Check pattern (supports Chinese and other CJK characters)
+    if not re.match(DOMAIN_PATTERN, domain):
+        raise ValidationError(
+            f"Domain name contains invalid characters: {domain}. "
+            f"Allowed: alphanumeric, underscore, hyphen, Chinese/Japanese/Korean characters"
+        )
+
+    # Check for path traversal attempts
+    if '..' in domain or '/' in domain or '\\' in domain:
+        raise ValidationError(f"Domain name contains path traversal: {domain}")
+
     return domain