#!/usr/bin/env python3
"""
Unified Language Detection for Code Blocks

Provides confidence-based language detection for documentation scrapers.
Supports 20+ programming languages with weighted pattern matching.

Author: Skill Seekers Project
"""

import logging
import re

logger = logging.getLogger(__name__)

# Import Swift patterns from separate module (fork-friendly architecture).
# Any failure degrades to an empty mapping so this module still imports;
# Swift detection is simply disabled in that case.
try:
    from skill_seekers.cli.swift_patterns import SWIFT_PATTERNS
except ImportError as e:
    logger.warning(
        "Swift language detection patterns unavailable. Swift code detection will be disabled. Error: %s",
        e,
    )
    SWIFT_PATTERNS: dict[str, list[tuple[str, int]]] = {}
except Exception as e:
    # The patterns module exists but raised while being imported: keep the
    # traceback in the log so the root cause can be diagnosed.
    logger.error(
        "Failed to load Swift patterns due to unexpected error: %s. Swift detection disabled.",
        e,
        exc_info=True,
    )
    SWIFT_PATTERNS = {}
else:
    # Verify Swift patterns were loaded correctly. This only runs when the
    # import itself succeeded; a failed import has already been reported
    # above, and an additional "dictionary is empty" warning would point at
    # the wrong cause.
    if not SWIFT_PATTERNS:
        logger.warning(
            "Swift pattern dictionary is empty. Swift detection is disabled. "
            "This may indicate swift_patterns.py has no patterns defined."
        )
    elif "swift" not in SWIFT_PATTERNS:
        logger.error(
            "Swift patterns loaded but 'swift' key is missing. Swift detection is broken. Please file a bug report."
        )
    else:
        # The key is known to exist in this branch, so index it directly.
        logger.info(
            "Swift patterns loaded successfully: %d patterns for language detection",
            len(SWIFT_PATTERNS["swift"]),
        )

# Comprehensive language patterns with weighted confidence scoring # Weight 5: Unique identifiers (highly specific) # Weight 4: Strong indicators # Weight 3: Common patterns # Weight 2: Moderate indicators # Weight 1: Weak indicators LANGUAGE_PATTERNS: dict[str, list[tuple[str, int]]] = { # ===== PRIORITY 1: Unity C# (Critical - User's Primary Issue) ===== "csharp": [ # Unity-specific patterns (weight 4-5, CRITICAL) (r"\busing\s+UnityEngine", 5), (r"\bMonoBehaviour\b", 5), (r"\bGameObject\b", 4), (r"\bTransform\b", 4), (r"\bVector[23]\b", 3), (r"\bQuaternion\b", 3), (r"\bvoid\s+Start\s*\(\)", 4), (r"\bvoid\s+Update\s*\(\)", 4), (r"\bvoid\s+Awake\s*\(\)", 4), (r"\bvoid\s+OnEnable\s*\(\)", 3), (r"\bvoid\s+OnDisable\s*\(\)", 3), (r"\bvoid\s+FixedUpdate\s*\(\)", 4), (r"\bvoid\s+LateUpdate\s*\(\)", 4), (r"\bvoid\s+OnCollisionEnter", 4), (r"\bvoid\s+OnTriggerEnter", 4), (r"\bIEnumerator\b", 4), (r"\bStartCoroutine\s*\(", 4), (r"\byield\s+return\s+new\s+WaitForSeconds", 4), (r"\byield\s+return\s+null", 3), (r"\byield\s+return", 4), (r"\[SerializeField\]", 4), (r"\[RequireComponent", 4), (r"\[Header\(", 3), (r"\[Range\(", 3), (r"\bTime\.deltaTime\b", 4), (r"\bInput\.Get", 4), (r"\bRigidbody\b", 3), (r"\bCollider\b", 3), (r"\bRenderer\b", 3), (r"\bGetComponent<", 3), # Basic C# patterns (weight 2-4) (r"\bnamespace\s+\w+", 3), (r"\busing\s+System", 3), (r"\bConsole\.WriteLine", 4), # C#-specific output (r"\bConsole\.Write", 3), (r"\bpublic\s+class\s+\w+", 4), # Increased to match Java weight (r"\bprivate\s+class\s+\w+", 3), (r"\binternal\s+class\s+\w+", 4), # C#-specific modifier (r"\bstring\s+\w+\s*[;=]", 2), # C#-specific lowercase string (r"\bprivate\s+\w+\s+\w+\s*;", 2), # Private fields (common in both C# and Java) (r"\{\s*get;\s*set;\s*\}", 3), # Auto properties (r"\{\s*get;\s*private\s+set;\s*\}", 3), (r"\{\s*get\s*=>\s*", 2), # 
Expression properties (r"\bpublic\s+static\s+void\s+", 2), # Modern C# patterns (weight 2) (r"\bfrom\s+\w+\s+in\s+", 2), # LINQ (r"\.Where\s*\(", 2), (r"\.Select\s*\(", 2), (r"\basync\s+Task", 2), (r"\bawait\s+", 2), (r"\bvar\s+\w+\s*=", 1), ], # ===== PRIORITY 2: Frontend Languages ===== "typescript": [ # TypeScript-specific (weight 4-5) (r"\binterface\s+\w+\s*\{", 5), (r"\btype\s+\w+\s*=", 4), (r":\s*\w+\s*=", 3), # Type annotation (r":\s*\w+\[\]", 3), # Array type (r"<[\w,\s]+>", 2), # Generic type (r"\bas\s+\w+", 2), # Type assertion (r"\benum\s+\w+\s*\{", 4), (r"\bimplements\s+\w+", 3), (r"\bexport\s+interface", 4), (r"\bexport\s+type", 4), # Also has JS patterns (weight 1) (r"\bconst\s+\w+\s*=", 1), (r"\blet\s+\w+\s*=", 1), (r"=>", 1), ], "javascript": [ (r"\bfunction\s+\w+\s*\(", 3), (r"\bconst\s+\w+\s*=", 2), (r"\blet\s+\w+\s*=", 2), (r"=>", 2), # Arrow function (r"\bconsole\.log", 2), (r"\bvar\s+\w+\s*=", 1), (r"\.then\s*\(", 2), # Promise (r"\.catch\s*\(", 2), # Promise (r"\basync\s+function", 3), (r"\bawait\s+", 2), (r"require\s*\(", 2), # CommonJS (r"\bexport\s+default", 2), # ES6 (r"\bexport\s+const", 2), ], "jsx": [ # JSX patterns (weight 4-5) (r"<\w+\s+[^>]*>", 4), # JSX tag with attributes (r"<\w+\s*/>", 4), # Self-closing tag (r"className=", 3), # React className (r"onClick=", 3), # React event (r"\brender\s*\(\s*\)\s*\{", 4), # React render (r"\buseState\s*\(", 4), # React hook (r"\buseEffect\s*\(", 4), # React hook (r"\buseRef\s*\(", 3), (r"\buseCallback\s*\(", 3), (r"\buseMemo\s*\(", 3), # Also has JS patterns (r"\bconst\s+\w+\s*=", 1), (r"=>", 1), ], "tsx": [ # TSX = TypeScript + JSX (weight 5) (r"<\w+\s+[^>]*>", 3), # JSX tag (r":\s*React\.\w+", 5), # React types (r"interface\s+\w+Props", 5), # Props interface (r"\bFunctionComponent<", 4), (r"\bReact\.FC<", 4), (r"\buseState<", 4), # Typed hook (r"\buseRef<", 3), # Also has TS patterns (r"\binterface\s+\w+", 2), (r"\btype\s+\w+\s*=", 2), ], "vue": [ # Vue SFC patterns (weight 4-5) (r"