fix: prevent dictionary false positives + add tunnel-doctor WSL/Go findings

transcript-fixer:
- Add common_words.py safety system (blocks common Chinese words from dictionary)
- Add --audit command to scan existing dictionary for risky rules
- Add --force flag to override safety checks explicitly
- Fix substring corruption (产线数据→产线束据, 现金流→现现金流)
- Unified position-aware replacement with _already_corrected() check
- 69 tests covering all production false positive scenarios

tunnel-doctor:
- Add Step 5A: Tailscale SSH proxy silent failure on WSL
- Add Step 5B: App Store vs Standalone Tailscale on macOS
- Add Go net/http NO_PROXY CIDR incompatibility warning
- Add utun interface identification (MTU 1280=Tailscale, 4064=Shadowrocket)
- Fix "Four→Five Conflict Layers" inconsistency in reference doc
- Add complete working Shadowrocket config reference

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
daymade
2026-03-21 15:56:38 +08:00
parent d4634cb00b
commit a496c91cae
12 changed files with 1596 additions and 44 deletions

View File

@@ -12,6 +12,7 @@ from __future__ import annotations
import re
import os
import sys
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple
@@ -23,6 +24,10 @@ from .correction_repository import (
DatabaseError
)
# Import safety check for common words
sys.path.insert(0, str(Path(__file__).parent.parent))
from utils.common_words import check_correction_safety, audit_corrections, SafetyWarning
logger = logging.getLogger(__name__)
@@ -178,10 +183,15 @@ class CorrectionService:
domain: str = "general",
source: str = "manual",
confidence: float = 1.0,
notes: Optional[str] = None
notes: Optional[str] = None,
force: bool = False,
) -> int:
"""
Add a correction with full validation.
Add a correction with full validation and safety checks.
Safety checks detect common Chinese words and substring collision
risks that would cause false positives. Pass force=True to bypass
(errors become warnings printed to stderr).
Args:
from_text: Original (incorrect) text
@@ -190,12 +200,13 @@ class CorrectionService:
source: Origin of correction
confidence: Confidence score
notes: Optional notes
force: If True, downgrade safety errors to warnings
Returns:
ID of inserted correction
Raises:
ValidationError: If validation fails
ValidationError: If validation or safety check fails
"""
# Comprehensive validation
self.validate_correction_text(from_text, "from_text")
@@ -210,6 +221,34 @@ class CorrectionService:
f"from_text and to_text are identical: '{from_text}'"
)
# Safety check: detect common words and substring collisions
safety_warnings = check_correction_safety(from_text, to_text, strict=True)
if safety_warnings:
errors = [w for w in safety_warnings if w.level == "error"]
warns = [w for w in safety_warnings if w.level == "warning"]
if errors and not force:
# Block the addition
msg_parts = []
for w in errors:
msg_parts.append(f"[{w.category}] {w.message}")
msg_parts.append(f" Suggestion: {w.suggestion}")
raise ValidationError(
f"Safety check BLOCKED adding '{from_text}' -> '{to_text}':\n"
+ "\n".join(msg_parts)
+ "\n\nUse --force to override (at your own risk)."
)
# Print warnings (errors downgraded by --force, or genuine warnings)
all_to_print = errors + warns if force else warns
if all_to_print:
for w in all_to_print:
prefix = "FORCED" if w.level == "error" else "WARNING"
logger.warning(
f"[{prefix}] [{w.category}] {w.message} | {w.suggestion}"
)
# Get current user
added_by = os.getenv("USER") or os.getenv("USERNAME") or "unknown"
@@ -431,6 +470,31 @@ class CorrectionService:
return stats
# ==================== Audit Operations ====================
def audit_dictionary(
    self,
    domain: Optional[str] = None,
) -> Dict[str, List[SafetyWarning]]:
    """
    Run the safety audit over every active correction rule.

    Each rule is screened for false-positive risk: from_text being a
    common Chinese word, a very short (<= 2 character) from_text,
    from_text hiding inside longer common words, and rules whose
    from_text and to_text are both common words.

    Args:
        domain: Restrict the audit to one domain; None audits all domains.

    Returns:
        Mapping of from_text to its list of SafetyWarnings; rules with
        no findings are omitted from the result.
    """
    active_rules = self.get_corrections(domain)
    return audit_corrections(active_rules)
# ==================== Helper Methods ====================
def _detect_conflicts(

View File

@@ -14,9 +14,17 @@ Features:
from __future__ import annotations
import re
import sys
import logging
from pathlib import Path
from typing import Dict, List, Tuple
from dataclasses import dataclass
sys.path.insert(0, str(Path(__file__).parent.parent))
from utils.common_words import ALL_COMMON_WORDS
logger = logging.getLogger(__name__)
@dataclass
class Change:
@@ -96,7 +104,16 @@ class DictionaryProcessor:
return corrected, changes
def _apply_dictionary(self, text: str) -> Tuple[str, List[Change]]:
"""Apply simple dictionary replacements"""
"""
Apply dictionary replacements with substring safety checks.
Safety layers (applied in order at each match site):
1. Superset check: if to_text already exists at the match position,
skip to prevent duplication (e.g., "金流""现金流" inside "现金流").
This applies to ALL rules regardless of length.
2. Boundary check (short rules only, <=3 chars): if the match is inside
a longer common word, skip to prevent collateral damage.
"""
changes = []
corrected = text
@@ -104,32 +121,167 @@ class DictionaryProcessor:
if wrong not in corrected:
continue
# Find all occurrences
occurrences = []
start = 0
while True:
pos = corrected.find(wrong, start)
if pos == -1:
break
line_num = corrected[:pos].count('\n') + 1
occurrences.append(line_num)
start = pos + len(wrong)
# Track changes
for line_num in occurrences:
changes.append(Change(
line_number=line_num,
from_text=wrong,
to_text=correct,
rule_type="dictionary",
rule_name="corrections_dict"
))
# Apply replacement
corrected = corrected.replace(wrong, correct)
# All rules go through position-aware replacement to get
# the superset check. Short rules additionally get the
# boundary check against common words.
needs_boundary_check = len(wrong) <= 3
corrected, new_changes = self._apply_with_safety_checks(
corrected, wrong, correct, needs_boundary_check,
)
changes.extend(new_changes)
return corrected, changes
def _find_occurrences(self, text: str, target: str) -> List[int]:
"""Find all line numbers where target appears in text."""
occurrences = []
start = 0
while True:
pos = text.find(target, start)
if pos == -1:
break
line_num = text[:pos].count('\n') + 1
occurrences.append(line_num)
start = pos + len(target)
return occurrences
def _apply_with_safety_checks(
    self,
    text: str,
    wrong: str,
    correct: str,
    check_boundaries: bool,
) -> Tuple[str, List[Change]]:
    """
    Replace `wrong` with `correct` at each match site, guarded by two
    safety layers.

    Layer 1 (all rules): when `correct` contains `wrong` (the superset
    case, e.g. "金流" -> "现金流"), skip sites where the surrounding text
    already spells out `correct`, so replacement cannot duplicate text.

    Layer 2 (check_boundaries=True only): skip matches embedded inside a
    longer common word (e.g. "天差" inside "天差地别").

    Returns:
        The rewritten text and one Change record per replacement made.
    """
    recorded: List[Change] = []
    pieces: List[str] = []
    cursor = 0
    step = len(wrong)
    while cursor < len(text):
        hit = text.find(wrong, cursor)
        if hit == -1:
            # No further matches: keep the remaining tail untouched.
            pieces.append(text[cursor:])
            break
        # Decide whether this site must be skipped, and why.
        skip_reason = None
        if self._already_corrected(text, hit, wrong, correct):
            skip_reason = f"already corrected ('{correct}' present)"
        elif check_boundaries and self._is_inside_longer_word(text, hit, wrong):
            skip_reason = "part of longer word"
        if skip_reason is not None:
            # Copy the match through unchanged and move past it.
            pieces.append(text[cursor:hit + step])
            cursor = hit + step
            logger.debug(f"Skipped '{wrong}' at pos {hit}: {skip_reason}")
            continue
        # Safe to replace: record the change, then splice in `correct`.
        recorded.append(Change(
            line_number=text.count('\n', 0, hit) + 1,
            from_text=wrong,
            to_text=correct,
            rule_type="dictionary",
            rule_name="corrections_dict"
        ))
        pieces.append(text[cursor:hit])
        pieces.append(correct)
        cursor = hit + step
    return "".join(pieces), recorded
@staticmethod
def _already_corrected(
text: str, pos: int, from_text: str, to_text: str
) -> bool:
"""
Check if to_text already exists at the match position, meaning
the text is already in the corrected form.
This catches the case where from_text is a substring of to_text
(e.g., "金流" is inside "现金流"). If the surrounding text already
forms "现金流", replacing "金流" would produce "现现金流".
Returns True if the replacement should be skipped.
"""
if from_text not in to_text:
# to_text doesn't contain from_text, so no superset risk.
return False
to_len = len(to_text)
from_len = len(from_text)
# Find all positions where from_text appears inside to_text.
# For each, check if the surrounding text matches to_text.
offset = 0
while True:
idx = to_text.find(from_text, offset)
if idx == -1:
break
# If to_text were at text position (pos - idx), from_text at pos
# would be the substring starting at idx within to_text.
candidate_start = pos - idx
candidate_end = candidate_start + to_len
if (candidate_start >= 0
and candidate_end <= len(text)
and text[candidate_start:candidate_end] == to_text):
return True
offset = idx + 1
return False
@staticmethod
def _is_inside_longer_word(text: str, pos: int, match: str) -> bool:
    """
    Return True when the occurrence of `match` at `pos` sits inside a
    longer entry of ALL_COMMON_WORDS, meaning a replacement here would
    damage an unrelated word.

    Only candidate words up to 5 characters long are considered; every
    substring of the surrounding window that strictly contains the match
    is tested against the common-words set.
    """
    m_len = len(match)
    longest = 5  # longest common word we screen against
    # Window of text that any covering word of length <= `longest`
    # could occupy around the match.
    lo = max(0, pos - (longest - 1))
    hi = min(len(text), pos + m_len + (longest - 1))
    window = text[lo:hi]
    anchor = pos - lo  # match position relative to the window
    # Try every substring length strictly longer than the match, up to
    # the shorter of `longest` and the window size.
    for size in range(m_len + 1, min(longest, len(window)) + 1):
        first = max(0, anchor + m_len - size)
        last = min(anchor + 1, len(window) - size + 1)
        for begin in range(first, last):
            candidate = window[begin:begin + size]
            if candidate != match and candidate in ALL_COMMON_WORDS:
                return True
    return False
def get_summary(self, changes: List[Change]) -> Dict[str, int]:
"""Generate summary statistics"""
summary = {