diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py index 963780d..f12448e 100755 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -32,6 +32,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector from skill_seekers.cli.llms_txt_parser import LlmsTxtParser from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader +from skill_seekers.cli.language_detector import LanguageDetector from skill_seekers.cli.constants import ( DEFAULT_RATE_LIMIT, DEFAULT_MAX_PAGES, @@ -111,6 +112,9 @@ class DocToSkillConverter: self.pages: List[Dict[str, Any]] = [] self.pages_scraped = 0 + # Language detection + self.language_detector = LanguageDetector(min_confidence=0.15) + # Thread-safe lock for parallel scraping if self.workers > 1: import threading @@ -278,81 +282,18 @@ class DocToSkillConverter: return page - def _extract_language_from_classes(self, classes): - """Extract language from class list - - Supports multiple patterns: - - language-{lang} (e.g., "language-python") - - lang-{lang} (e.g., "lang-javascript") - - brush: {lang} (e.g., "brush: java") - - bare language name (e.g., "python", "java") - - """ - # Define common programming languages - known_languages = [ - "javascript", "java", "xml", "html", "python", "bash", "cpp", "typescript", - "go", "rust", "php", "ruby", "swift", "kotlin", "csharp", "c", "sql", - "yaml", "json", "markdown", "css", "scss", "sass", "jsx", "tsx", "vue", - "shell", "powershell", "r", "scala", "dart", "perl", "lua", "elixir" - ] - - for cls in classes: - # Clean special characters (except word chars and hyphens) - cls = re.sub(r'[^\w-]', '', cls) - - if 'language-' in cls: - return cls.replace('language-', '') - - if 'lang-' in cls: - return cls.replace('lang-', '') - - # Check for brush: pattern (e.g., "brush: java") - if 'brush' in cls.lower(): - lang = 
def detect_language(self, elem: Any, code: str) -> str:
    """Detect the programming language of a code block.

    Delegates to ``self.language_detector.detect_from_html``, which yields a
    ``(language, confidence)`` pair, and returns only the language so that
    existing callers keep receiving a plain string.

    Args:
        elem: Element containing the code block; forwarded to the detector
            unchanged.
        code: Text of the code block.

    Returns:
        The detected language name, or ``'unknown'`` when the detector
        reports no language (``None`` or an empty string).
    """
    lang, confidence = self.language_detector.detect_from_html(elem, code)

    # Log low-confidence detections for debugging. Lazy %-style arguments
    # mean the message is only formatted when DEBUG logging is enabled.
    if confidence < 0.5:
        logger.debug(
            "Low confidence language detection: %s (%.2f)", lang, confidence
        )

    # The heuristic implementation this method replaces always returned a
    # string and used 'unknown' as its fallback; keep that guarantee even if
    # the detector yields None or ''.
    return lang or 'unknown'
+#!/usr/bin/env python3 +""" +Unified Language Detection for Code Blocks + +Provides confidence-based language detection for documentation scrapers. +Supports 20+ programming languages with weighted pattern matching. + +Author: Skill Seekers Project +""" + +import re +from typing import Optional, Tuple, Dict, List + + +# Comprehensive language patterns with weighted confidence scoring +# Weight 5: Unique identifiers (highly specific) +# Weight 4: Strong indicators +# Weight 3: Common patterns +# Weight 2: Moderate indicators +# Weight 1: Weak indicators + +LANGUAGE_PATTERNS: Dict[str, List[Tuple[str, int]]] = { + # ===== PRIORITY 1: Unity C# (Critical - User's Primary Issue) ===== + 'csharp': [ + # Unity-specific patterns (weight 4-5, CRITICAL) + (r'\busing\s+UnityEngine', 5), + (r'\bMonoBehaviour\b', 5), + (r'\bGameObject\b', 4), + (r'\bTransform\b', 4), + (r'\bVector[23]\b', 3), + (r'\bQuaternion\b', 3), + (r'\bvoid\s+Start\s*\(\)', 4), + (r'\bvoid\s+Update\s*\(\)', 4), + (r'\bvoid\s+Awake\s*\(\)', 4), + (r'\bvoid\s+OnEnable\s*\(\)', 3), + (r'\bvoid\s+OnDisable\s*\(\)', 3), + (r'\bvoid\s+FixedUpdate\s*\(\)', 4), + (r'\bvoid\s+LateUpdate\s*\(\)', 4), + (r'\bvoid\s+OnCollisionEnter', 4), + (r'\bvoid\s+OnTriggerEnter', 4), + (r'\bIEnumerator\b', 4), + (r'\bStartCoroutine\s*\(', 4), + (r'\byield\s+return\s+new\s+WaitForSeconds', 4), + (r'\byield\s+return\s+null', 3), + (r'\byield\s+return', 4), + (r'\[SerializeField\]', 4), + (r'\[RequireComponent', 4), + (r'\[Header\(', 3), + (r'\[Range\(', 3), + (r'\bTime\.deltaTime\b', 4), + (r'\bInput\.Get', 4), + (r'\bRigidbody\b', 3), + (r'\bCollider\b', 3), + (r'\bRenderer\b', 3), + (r'\bGetComponent<', 3), + + # Basic C# patterns (weight 2-4) + (r'\bnamespace\s+\w+', 3), + (r'\busing\s+System', 3), + (r'\bConsole\.WriteLine', 4), # C#-specific output + (r'\bConsole\.Write', 3), + (r'\bpublic\s+class\s+\w+', 4), # Increased to match Java weight + (r'\bprivate\s+class\s+\w+', 3), + (r'\binternal\s+class\s+\w+', 4), # 
C#-specific modifier + (r'\bstring\s+\w+\s*[;=]', 2), # C#-specific lowercase string + (r'\bprivate\s+\w+\s+\w+\s*;', 2), # Private fields (common in both C# and Java) + (r'\{\s*get;\s*set;\s*\}', 3), # Auto properties + (r'\{\s*get;\s*private\s+set;\s*\}', 3), + (r'\{\s*get\s*=>\s*', 2), # Expression properties + (r'\bpublic\s+static\s+void\s+', 2), + + # Modern C# patterns (weight 2) + (r'\bfrom\s+\w+\s+in\s+', 2), # LINQ + (r'\.Where\s*\(', 2), + (r'\.Select\s*\(', 2), + (r'\basync\s+Task', 2), + (r'\bawait\s+', 2), + (r'\bvar\s+\w+\s*=', 1), + ], + + # ===== PRIORITY 2: Frontend Languages ===== + 'typescript': [ + # TypeScript-specific (weight 4-5) + (r'\binterface\s+\w+\s*\{', 5), + (r'\btype\s+\w+\s*=', 4), + (r':\s*\w+\s*=', 3), # Type annotation + (r':\s*\w+\[\]', 3), # Array type + (r'<[\w,\s]+>', 2), # Generic type + (r'\bas\s+\w+', 2), # Type assertion + (r'\benum\s+\w+\s*\{', 4), + (r'\bimplements\s+\w+', 3), + (r'\bexport\s+interface', 4), + (r'\bexport\s+type', 4), + + # Also has JS patterns (weight 1) + (r'\bconst\s+\w+\s*=', 1), + (r'\blet\s+\w+\s*=', 1), + (r'=>', 1), + ], + + 'javascript': [ + (r'\bfunction\s+\w+\s*\(', 3), + (r'\bconst\s+\w+\s*=', 2), + (r'\blet\s+\w+\s*=', 2), + (r'=>', 2), # Arrow function + (r'\bconsole\.log', 2), + (r'\bvar\s+\w+\s*=', 1), + (r'\.then\s*\(', 2), # Promise + (r'\.catch\s*\(', 2), # Promise + (r'\basync\s+function', 3), + (r'\bawait\s+', 2), + (r'require\s*\(', 2), # CommonJS + (r'\bexport\s+default', 2), # ES6 + (r'\bexport\s+const', 2), + ], + + 'jsx': [ + # JSX patterns (weight 4-5) + (r'<\w+\s+[^>]*>', 4), # JSX tag with attributes + (r'<\w+\s*/>', 4), # Self-closing tag + (r'className=', 3), # React className + (r'onClick=', 3), # React event + (r'\brender\s*\(\s*\)\s*\{', 4), # React render + (r'\buseState\s*\(', 4), # React hook + (r'\buseEffect\s*\(', 4), # React hook + (r'\buseRef\s*\(', 3), + (r'\buseCallback\s*\(', 3), + (r'\buseMemo\s*\(', 3), + + # Also has JS patterns + (r'\bconst\s+\w+\s*=', 1), 
+ (r'=>', 1), + ], + + 'tsx': [ + # TSX = TypeScript + JSX (weight 5) + (r'<\w+\s+[^>]*>', 3), # JSX tag + (r':\s*React\.\w+', 5), # React types + (r'interface\s+\w+Props', 5), # Props interface + (r'\bFunctionComponent<', 4), + (r'\bReact\.FC<', 4), + (r'\buseState<', 4), # Typed hook + (r'\buseRef<', 3), + + # Also has TS patterns + (r'\binterface\s+\w+', 2), + (r'\btype\s+\w+\s*=', 2), + ], + + 'vue': [ + # Vue SFC patterns (weight 4-5) + (r'