fix: prevent dictionary false positives + add tunnel-doctor WSL/Go findings

transcript-fixer:
- Add common_words.py safety system (blocks common Chinese words from dictionary)
- Add --audit command to scan existing dictionary for risky rules
- Add --force flag to override safety checks explicitly
- Fix substring corruption (产线数据→产线束据, 现金流→现现金流)
- Unify position-aware replacement with _already_corrected() check
- Add 69 tests covering all production false-positive scenarios

tunnel-doctor:
- Add Step 5A: Tailscale SSH proxy silent failure on WSL
- Add Step 5B: App Store vs Standalone Tailscale on macOS
- Add Go net/http NO_PROXY CIDR incompatibility warning
- Add utun interface identification (MTU 1280=Tailscale, 4064=Shadowrocket)
- Fix "Four→Five Conflict Layers" inconsistency in reference doc
- Add complete working Shadowrocket config reference

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 15:56:38 +08:00
parent d4634cb00b
commit a496c91cae
12 changed files with 1596 additions and 44 deletions
--- a/transcript-fixer/scripts/cli/__init__.py
+++ b/transcript-fixer/scripts/cli/__init__.py
@@ -9,6 +9,7 @@ This module contains command handlers and argument parsing:
 from .commands import (
    cmd_init,
    cmd_add_correction,
+    cmd_audit,
    cmd_list_corrections,
    cmd_run_correction,
    cmd_review_learned,
@@ -25,6 +26,7 @@ from .argument_parser import create_argument_parser
 __all__ = [
    'cmd_init',
    'cmd_add_correction',
+    'cmd_audit',
    'cmd_list_corrections',
    'cmd_run_correction',
    'cmd_review_learned',
--- a/transcript-fixer/scripts/cli/argument_parser.py
+++ b/transcript-fixer/scripts/cli/argument_parser.py
@@ -37,12 +37,24 @@ def create_argument_parser() -> argparse.ArgumentParser:
        dest="add_correction",
        help="Add correction"
    )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        default=False,
+        help="Force --add even when safety checks detect risks (common word, substring collision)"
+    )
    parser.add_argument(
        "--list",
        action="store_true",
        dest="list_corrections",
        help="List all corrections"
    )
+    parser.add_argument(
+        "--audit",
+        action="store_true",
+        dest="audit_dictionary",
+        help="Audit all active corrections for false positive risks (common words, short text, substring collisions)"
+    )

    # Correction workflow
    parser.add_argument(
--- a/transcript-fixer/scripts/cli/commands.py
+++ b/transcript-fixer/scripts/cli/commands.py
@@ -43,16 +43,85 @@ def cmd_init(args: argparse.Namespace) -> None:


 def cmd_add_correction(args: argparse.Namespace) -> None:
-    """Add a single correction"""
+    """Add a single correction with safety checks"""
    service = _get_service()
+    force = getattr(args, 'force', False)
    try:
-        service.add_correction(args.from_text, args.to_text, args.domain)
-        print(f"✅ Added: '{args.from_text}' → '{args.to_text}' (domain: {args.domain})")
+        service.add_correction(
+            args.from_text, args.to_text, args.domain, force=force,
+        )
+        print(f"Added: '{args.from_text}' -> '{args.to_text}' (domain: {args.domain})")
    except Exception as e:
-        print(f"❌ Error: {e}")
+        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


+def cmd_audit(args: argparse.Namespace) -> None:
+    """Audit all active corrections for false positive risks"""
+    service = _get_service()
+    domain = getattr(args, 'domain', None)
+
+    print(f"\nAuditing corrections" + (f" (domain: {domain})" if domain else " (all domains)") + "...")
+    print("=" * 70)
+
+    issues = service.audit_dictionary(domain)
+
+    if not issues:
+        corrections = service.get_corrections(domain)
+        print(f"\nAll {len(corrections)} corrections passed safety checks.")
+        return
+
+    # Categorize
+    error_count = 0
+    warning_count = 0
+    for from_text, warnings in issues.items():
+        for w in warnings:
+            if w.level == "error":
+                error_count += 1
+            else:
+                warning_count += 1
+
+    corrections = service.get_corrections(domain)
+    print(f"\nScanned {len(corrections)} corrections. "
+          f"Found issues in {len(issues)} rules:")
+    print(f"  Errors: {error_count} (should be removed or converted to context rules)")
+    print(f"  Warnings: {warning_count} (review recommended)")
+    print()
+
+    # Print details grouped by severity
+    for severity in ["error", "warning"]:
+        label = "ERRORS" if severity == "error" else "WARNINGS"
+        relevant = {
+            ft: [w for w in ws if w.level == severity]
+            for ft, ws in issues.items()
+        }
+        relevant = {ft: ws for ft, ws in relevant.items() if ws}
+
+        if not relevant:
+            continue
+
+        print(f"--- {label} ({len(relevant)} rules) ---")
+        for from_text, warnings in sorted(relevant.items()):
+            to_text = corrections.get(from_text, "?")
+            print(f"\n  '{from_text}' -> '{to_text}'")
+            for w in warnings:
+                print(f"    [{w.category}] {w.message}")
+                print(f"    Suggestion: {w.suggestion}")
+        print()
+
+    if error_count > 0:
+        print(
+            f"ACTION REQUIRED: {error_count} error(s) found. These rules are "
+            f"actively causing false positives and should be removed or "
+            f"converted to context rules."
+        )
+        print(
+            f"To remove a rule: "
+            f"sqlite3 ~/.transcript-fixer/corrections.db "
+            f"\"UPDATE corrections SET is_active=0 WHERE from_text='...';\""
+        )
+
+
 def cmd_list_corrections(args: argparse.Namespace) -> None:
    """List all corrections"""
    service = _get_service()