fix: prevent dictionary false positives + add tunnel-doctor WSL/Go findings

transcript-fixer:
- Add common_words.py safety system (blocks common Chinese words from dictionary)
- Add --audit command to scan existing dictionary for risky rules
- Add --force flag to override safety checks explicitly
- Fix substring corruption (产线数据→产线束据, 现金流→现现金流)
- Unified position-aware replacement with _already_corrected() check
- 69 tests covering all production false positive scenarios

tunnel-doctor:
- Add Step 5A: Tailscale SSH proxy silent failure on WSL
- Add Step 5B: App Store vs Standalone Tailscale on macOS
- Add Go net/http NO_PROXY CIDR incompatibility warning
- Add utun interface identification (MTU 1280=Tailscale, 4064=Shadowrocket)
- Fix "Four→Five Conflict Layers" inconsistency in reference doc
- Add complete working Shadowrocket config reference

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
daymade
2026-03-21 15:56:38 +08:00
parent d4634cb00b
commit a496c91cae
12 changed files with 1596 additions and 44 deletions

View File

@@ -12,6 +12,7 @@ from __future__ import annotations
import re
import os
import sys
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple
@@ -23,6 +24,10 @@ from .correction_repository import (
DatabaseError
)
# Import safety check for common words
sys.path.insert(0, str(Path(__file__).parent.parent))
from utils.common_words import check_correction_safety, audit_corrections, SafetyWarning
logger = logging.getLogger(__name__)
@@ -178,10 +183,15 @@ class CorrectionService:
domain: str = "general",
source: str = "manual",
confidence: float = 1.0,
notes: Optional[str] = None
notes: Optional[str] = None,
force: bool = False,
) -> int:
"""
Add a correction with full validation.
Add a correction with full validation and safety checks.
Safety checks detect common Chinese words and substring collision
risks that would cause false positives. Pass force=True to bypass
(errors become warnings printed to stderr).
Args:
from_text: Original (incorrect) text
@@ -190,12 +200,13 @@ class CorrectionService:
source: Origin of correction
confidence: Confidence score
notes: Optional notes
force: If True, downgrade safety errors to warnings
Returns:
ID of inserted correction
Raises:
ValidationError: If validation fails
ValidationError: If validation or safety check fails
"""
# Comprehensive validation
self.validate_correction_text(from_text, "from_text")
@@ -210,6 +221,34 @@ class CorrectionService:
f"from_text and to_text are identical: '{from_text}'"
)
# Safety check: detect common words and substring collisions
safety_warnings = check_correction_safety(from_text, to_text, strict=True)
if safety_warnings:
errors = [w for w in safety_warnings if w.level == "error"]
warns = [w for w in safety_warnings if w.level == "warning"]
if errors and not force:
# Block the addition
msg_parts = []
for w in errors:
msg_parts.append(f"[{w.category}] {w.message}")
msg_parts.append(f" Suggestion: {w.suggestion}")
raise ValidationError(
f"Safety check BLOCKED adding '{from_text}' -> '{to_text}':\n"
+ "\n".join(msg_parts)
+ "\n\nUse --force to override (at your own risk)."
)
# Print warnings (errors downgraded by --force, or genuine warnings)
all_to_print = errors + warns if force else warns
if all_to_print:
for w in all_to_print:
prefix = "FORCED" if w.level == "error" else "WARNING"
logger.warning(
f"[{prefix}] [{w.category}] {w.message} | {w.suggestion}"
)
# Get current user
added_by = os.getenv("USER") or os.getenv("USERNAME") or "unknown"
@@ -431,6 +470,31 @@ class CorrectionService:
return stats
# ==================== Audit Operations ====================
def audit_dictionary(
    self,
    domain: Optional[str] = None,
) -> Dict[str, List[SafetyWarning]]:
    """
    Run the safety audit over every active correction rule.

    Each rule is screened for false-positive risk: from_text being a
    common Chinese word, a very short (<= 2 character) from_text,
    from_text hiding inside longer common words, and rules whose
    from_text and to_text are both common words.

    Args:
        domain: Restrict the audit to one domain; None audits all domains.

    Returns:
        Mapping of from_text to its list of SafetyWarnings; rules with
        no findings are omitted from the result.
    """
    active_rules = self.get_corrections(domain)
    return audit_corrections(active_rules)
# ==================== Helper Methods ====================
def _detect_conflicts(

View File

@@ -14,9 +14,17 @@ Features:
from __future__ import annotations
import re
import sys
import logging
from pathlib import Path
from typing import Dict, List, Tuple
from dataclasses import dataclass
sys.path.insert(0, str(Path(__file__).parent.parent))
from utils.common_words import ALL_COMMON_WORDS
logger = logging.getLogger(__name__)
@dataclass
class Change:
@@ -96,7 +104,16 @@ class DictionaryProcessor:
return corrected, changes
def _apply_dictionary(self, text: str) -> Tuple[str, List[Change]]:
"""Apply simple dictionary replacements"""
"""
Apply dictionary replacements with substring safety checks.
Safety layers (applied in order at each match site):
1. Superset check: if to_text already exists at the match position,
skip to prevent duplication (e.g., "金流""现金流" inside "现金流").
This applies to ALL rules regardless of length.
2. Boundary check (short rules only, <=3 chars): if the match is inside
a longer common word, skip to prevent collateral damage.
"""
changes = []
corrected = text
@@ -104,32 +121,167 @@ class DictionaryProcessor:
if wrong not in corrected:
continue
# Find all occurrences
occurrences = []
start = 0
while True:
pos = corrected.find(wrong, start)
if pos == -1:
break
line_num = corrected[:pos].count('\n') + 1
occurrences.append(line_num)
start = pos + len(wrong)
# Track changes
for line_num in occurrences:
changes.append(Change(
line_number=line_num,
from_text=wrong,
to_text=correct,
rule_type="dictionary",
rule_name="corrections_dict"
))
# Apply replacement
corrected = corrected.replace(wrong, correct)
# All rules go through position-aware replacement to get
# the superset check. Short rules additionally get the
# boundary check against common words.
needs_boundary_check = len(wrong) <= 3
corrected, new_changes = self._apply_with_safety_checks(
corrected, wrong, correct, needs_boundary_check,
)
changes.extend(new_changes)
return corrected, changes
def _find_occurrences(self, text: str, target: str) -> List[int]:
"""Find all line numbers where target appears in text."""
occurrences = []
start = 0
while True:
pos = text.find(target, start)
if pos == -1:
break
line_num = text[:pos].count('\n') + 1
occurrences.append(line_num)
start = pos + len(target)
return occurrences
def _apply_with_safety_checks(
    self,
    text: str,
    wrong: str,
    correct: str,
    check_boundaries: bool,
) -> Tuple[str, List[Change]]:
    """
    Replace `wrong` with `correct` at each match site, guarded by two
    safety layers.

    Layer 1 (all rules): when `correct` contains `wrong` (the superset
    case, e.g. "金流" -> "现金流"), skip sites where the surrounding text
    already spells out `correct`, so replacement cannot duplicate text.

    Layer 2 (check_boundaries=True only): skip matches embedded inside a
    longer common word (e.g. "天差" inside "天差地别").

    Returns:
        The rewritten text and one Change record per replacement made.
    """
    recorded: List[Change] = []
    pieces: List[str] = []
    cursor = 0
    step = len(wrong)
    while cursor < len(text):
        hit = text.find(wrong, cursor)
        if hit == -1:
            # No further matches: keep the remaining tail untouched.
            pieces.append(text[cursor:])
            break
        # Decide whether this site must be skipped, and why.
        skip_reason = None
        if self._already_corrected(text, hit, wrong, correct):
            skip_reason = f"already corrected ('{correct}' present)"
        elif check_boundaries and self._is_inside_longer_word(text, hit, wrong):
            skip_reason = "part of longer word"
        if skip_reason is not None:
            # Copy the match through unchanged and move past it.
            pieces.append(text[cursor:hit + step])
            cursor = hit + step
            logger.debug(f"Skipped '{wrong}' at pos {hit}: {skip_reason}")
            continue
        # Safe to replace: record the change, then splice in `correct`.
        recorded.append(Change(
            line_number=text.count('\n', 0, hit) + 1,
            from_text=wrong,
            to_text=correct,
            rule_type="dictionary",
            rule_name="corrections_dict"
        ))
        pieces.append(text[cursor:hit])
        pieces.append(correct)
        cursor = hit + step
    return "".join(pieces), recorded
@staticmethod
def _already_corrected(
text: str, pos: int, from_text: str, to_text: str
) -> bool:
"""
Check if to_text already exists at the match position, meaning
the text is already in the corrected form.
This catches the case where from_text is a substring of to_text
(e.g., "金流" is inside "现金流"). If the surrounding text already
forms "现金流", replacing "金流" would produce "现现金流".
Returns True if the replacement should be skipped.
"""
if from_text not in to_text:
# to_text doesn't contain from_text, so no superset risk.
return False
to_len = len(to_text)
from_len = len(from_text)
# Find all positions where from_text appears inside to_text.
# For each, check if the surrounding text matches to_text.
offset = 0
while True:
idx = to_text.find(from_text, offset)
if idx == -1:
break
# If to_text were at text position (pos - idx), from_text at pos
# would be the substring starting at idx within to_text.
candidate_start = pos - idx
candidate_end = candidate_start + to_len
if (candidate_start >= 0
and candidate_end <= len(text)
and text[candidate_start:candidate_end] == to_text):
return True
offset = idx + 1
return False
@staticmethod
def _is_inside_longer_word(text: str, pos: int, match: str) -> bool:
    """
    Return True when the occurrence of `match` at `pos` sits inside a
    longer entry of ALL_COMMON_WORDS, meaning a replacement here would
    damage an unrelated word.

    Only candidate words up to 5 characters long are considered; every
    substring of the surrounding window that strictly contains the match
    is tested against the common-words set.
    """
    m_len = len(match)
    longest = 5  # longest common word we screen against
    # Window of text that any covering word of length <= `longest`
    # could occupy around the match.
    lo = max(0, pos - (longest - 1))
    hi = min(len(text), pos + m_len + (longest - 1))
    window = text[lo:hi]
    anchor = pos - lo  # match position relative to the window
    # Try every substring length strictly longer than the match, up to
    # the shorter of `longest` and the window size.
    for size in range(m_len + 1, min(longest, len(window)) + 1):
        first = max(0, anchor + m_len - size)
        last = min(anchor + 1, len(window) - size + 1)
        for begin in range(first, last):
            candidate = window[begin:begin + size]
            if candidate != match and candidate in ALL_COMMON_WORDS:
                return True
    return False
def get_summary(self, changes: List[Change]) -> Dict[str, int]:
"""Generate summary statistics"""
summary = {