From e28aaa1a5e05841ea935b1d708eecb7de7c476d8 Mon Sep 17 00:00:00 2001
From: Ricardo JL Rufino
Date: Wed, 29 Oct 2025 16:17:51 -0300
Subject: [PATCH] feat: Add support for brush: and bare class language detection

- Support <pre class="brush: java"> pattern (SyntaxHighlighter)
- Support bare class names like <pre class="python">
- Add _extract_language_from_classes() helper method
- Apply detection logic to both code and parent pre elements
- Add 3 comprehensive test cases

Improves language detection for 25+ programming languages across
various documentation site formats.

Co-authored-by: Ricardo JL Rufino 
---
 cli/doc_scraper.py             | 61 +++++++++++++++++++++++++++-------
 tests/test_scraper_features.py | 21 ++++++++++++
 2 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/cli/doc_scraper.py b/cli/doc_scraper.py
index c6974bf..4702bec 100755
--- a/cli/doc_scraper.py
+++ b/cli/doc_scraper.py
@@ -267,25 +267,62 @@ class DocToSkillConverter:
                 page['links'].append(href)
         
         return page
-    
-    def detect_language(self, elem: Any, code: str) -> str:
-        """Detect programming language from code block"""
-        # Check class attribute
-        classes = elem.get('class', [])
+
+    def _extract_language_from_classes(self, classes):
+        """Return the language named in a class list, or None if none matches.
+
+        ``classes`` is a list of class tokens; a raw class string is split on
+        whitespace first. Each token is checked for these forms, in order:
+        - language-{lang} (e.g., "language-python")
+        - lang-{lang} (e.g., "lang-javascript")
+        - brush: {lang} (e.g., "brush: java")
+        - bare language name (e.g., "python", "java")
+        """
+        # Names accepted for the brush/bare forms; a set gives O(1) lookups
+        known_languages = {
+            "javascript", "java", "xml", "html", "python", "bash", "cpp", "typescript",
+            "go", "rust", "php", "ruby", "swift", "kotlin", "csharp", "c", "sql",
+            "yaml", "json", "markdown", "css", "scss", "sass", "jsx", "tsx", "vue",
+            "shell", "powershell", "r", "scala", "dart", "perl", "lua", "elixir"
+        }
+
+        # A plain string would otherwise be iterated character by character
+        if isinstance(classes, str):
+            classes = classes.split()
+
         for cls in classes:
+            # Prefix forms keep the raw token so "language-c++" stays "c++"
             if 'language-' in cls:
                 return cls.replace('language-', '')
+
             if 'lang-' in cls:
                 return cls.replace('lang-', '')
-        
+
+            # Drop punctuation (":" / ";") and fold case for brush/bare forms
+            token = re.sub(r'[^\w-]', '', cls).lower()
+            if 'brush' in token:
+                # "brush:java" has collapsed to "brushjava"; strip the marker
+                token = token.replace('brush', '')
+            if token in known_languages:
+                return token
+
+        return None
+
+    def detect_language(self, elem: Any, code: str) -> str:
+        """Detect a code block's language: element classes first, then a
+        parent <pre>'s classes, then content heuristics ('unknown' if none)."""
+        # Check element classes
+        lang = self._extract_language_from_classes(elem.get('class', []))
+        if lang:
+            return lang
+
         # Check parent pre element
         parent = elem.parent
         if parent and parent.name == 'pre':
-            classes = parent.get('class', [])
-            for cls in classes:
-                if 'language-' in cls:
-                    return cls.replace('language-', '')
-        
+            lang = self._extract_language_from_classes(parent.get('class', []))
+            if lang:
+                return lang
+
         # Heuristic detection
         if 'import ' in code and 'from ' in code:
             return 'python'
@@ -297,7 +334,7 @@ class DocToSkillConverter:
             return 'python'
         if '#include' in code or 'int main' in code:
             return 'cpp'
-        
+
         return 'unknown'
     
     def extract_patterns(self, main: Any, code_samples: List[Dict[str, Any]]) -> List[Dict[str, str]]:
diff --git a/tests/test_scraper_features.py b/tests/test_scraper_features.py
index 59c4bc4..eb6d295 100644
--- a/tests/test_scraper_features.py
+++ b/tests/test_scraper_features.py
@@ -174,6 +174,27 @@ class TestLanguageDetection(unittest.TestCase):
         lang = self.converter.detect_language(elem, code)
         self.assertEqual(lang, 'unknown')
 
+    def test_detect_brush_pattern_in_pre(self):
+        """Test brush: pattern (SyntaxHighlighter style) on the parent pre element"""
+        html = '<pre class="brush: python"><code>x</code></pre>'
+        elem = BeautifulSoup(html, 'html.parser').find('code')
+        lang = self.converter.detect_language(elem, 'x')
+        self.assertEqual(lang, 'python', 'Should detect python from brush: python pattern')
+
+    def test_detect_bare_class_in_pre(self):
+        """Test bare class name on the parent pre element"""
+        html = '<pre class="python"><code>x</code></pre>'
+        elem = BeautifulSoup(html, 'html.parser').find('code')
+        lang = self.converter.detect_language(elem, 'x')
+        self.assertEqual(lang, 'python', 'Should detect python from bare class name')
+
+    def test_detect_bare_class_in_code(self):
+        """Test bare class name on the code element itself"""
+        html = '<code class="python">x</code>'
+        elem = BeautifulSoup(html, 'html.parser').find('code')
+        lang = self.converter.detect_language(elem, 'x')
+        self.assertEqual(lang, 'python', 'Should detect python from bare class name')
+
 
 class TestPatternExtraction(unittest.TestCase):
     """Test pattern extraction from documentation"""