feat: Add support for brush: and bare class language detection
- Support <pre class="brush: java"> pattern (SyntaxHighlighter)
- Support bare class names like <pre class="python">
- Add _extract_language_from_classes() helper method
- Apply detection logic to both code and parent pre elements
- Add 3 comprehensive test cases

Improves language detection for 25+ programming languages across various documentation site formats.

Co-authored-by: Ricardo JL Rufino <ricardo@edu3.com.br>
This commit is contained in:
committed by
GitHub
parent
318d4e89f1
commit
e28aaa1a5e
@@ -267,25 +267,62 @@ class DocToSkillConverter:
|
|||||||
page['links'].append(href)
|
page['links'].append(href)
|
||||||
|
|
||||||
return page
|
return page
|
||||||
|
|
||||||
def detect_language(self, elem: Any, code: str) -> str:
|
def _extract_language_from_classes(self, classes):
|
||||||
"""Detect programming language from code block"""
|
"""Extract language from class list
|
||||||
# Check class attribute
|
|
||||||
classes = elem.get('class', [])
|
Supports multiple patterns:
|
||||||
|
- language-{lang} (e.g., "language-python")
|
||||||
|
- lang-{lang} (e.g., "lang-javascript")
|
||||||
|
- brush: {lang} (e.g., "brush: java")
|
||||||
|
- bare language name (e.g., "python", "java")
|
||||||
|
|
||||||
|
"""
|
||||||
|
# Define common programming languages
|
||||||
|
known_languages = [
|
||||||
|
"javascript", "java", "xml", "html", "python", "bash", "cpp", "typescript",
|
||||||
|
"go", "rust", "php", "ruby", "swift", "kotlin", "csharp", "c", "sql",
|
||||||
|
"yaml", "json", "markdown", "css", "scss", "sass", "jsx", "tsx", "vue",
|
||||||
|
"shell", "powershell", "r", "scala", "dart", "perl", "lua", "elixir"
|
||||||
|
]
|
||||||
|
|
||||||
for cls in classes:
|
for cls in classes:
|
||||||
|
# Clean special characters (except word chars and hyphens)
|
||||||
|
cls = re.sub(r'[^\w-]', '', cls)
|
||||||
|
|
||||||
if 'language-' in cls:
|
if 'language-' in cls:
|
||||||
return cls.replace('language-', '')
|
return cls.replace('language-', '')
|
||||||
|
|
||||||
if 'lang-' in cls:
|
if 'lang-' in cls:
|
||||||
return cls.replace('lang-', '')
|
return cls.replace('lang-', '')
|
||||||
|
|
||||||
|
# Check for brush: pattern (e.g., "brush: java")
|
||||||
|
if 'brush' in cls.lower():
|
||||||
|
lang = cls.lower().replace('brush', '').strip()
|
||||||
|
if lang in known_languages:
|
||||||
|
return lang
|
||||||
|
|
||||||
|
# Check for bare language name
|
||||||
|
if cls in known_languages:
|
||||||
|
return cls
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
def detect_language(self, elem, code):
|
||||||
|
"""Detect programming language from code block"""
|
||||||
|
|
||||||
|
# Check element classes
|
||||||
|
lang = self._extract_language_from_classes(elem.get('class', []))
|
||||||
|
if lang:
|
||||||
|
return lang
|
||||||
|
|
||||||
# Check parent pre element
|
# Check parent pre element
|
||||||
parent = elem.parent
|
parent = elem.parent
|
||||||
if parent and parent.name == 'pre':
|
if parent and parent.name == 'pre':
|
||||||
classes = parent.get('class', [])
|
lang = self._extract_language_from_classes(parent.get('class', []))
|
||||||
for cls in classes:
|
if lang:
|
||||||
if 'language-' in cls:
|
return lang
|
||||||
return cls.replace('language-', '')
|
|
||||||
|
|
||||||
# Heuristic detection
|
# Heuristic detection
|
||||||
if 'import ' in code and 'from ' in code:
|
if 'import ' in code and 'from ' in code:
|
||||||
return 'python'
|
return 'python'
|
||||||
@@ -297,7 +334,7 @@ class DocToSkillConverter:
|
|||||||
return 'python'
|
return 'python'
|
||||||
if '#include' in code or 'int main' in code:
|
if '#include' in code or 'int main' in code:
|
||||||
return 'cpp'
|
return 'cpp'
|
||||||
|
|
||||||
return 'unknown'
|
return 'unknown'
|
||||||
|
|
||||||
def extract_patterns(self, main: Any, code_samples: List[Dict[str, Any]]) -> List[Dict[str, str]]:
|
def extract_patterns(self, main: Any, code_samples: List[Dict[str, Any]]) -> List[Dict[str, str]]:
|
||||||
|
|||||||
@@ -174,6 +174,27 @@ class TestLanguageDetection(unittest.TestCase):
|
|||||||
lang = self.converter.detect_language(elem, code)
|
lang = self.converter.detect_language(elem, code)
|
||||||
self.assertEqual(lang, 'unknown')
|
self.assertEqual(lang, 'unknown')
|
||||||
|
|
||||||
|
def test_detect_brush_pattern_in_pre(self):
|
||||||
|
"""Test brush: pattern in pre element"""
|
||||||
|
html = '<pre class="brush: python"><code>x</code></pre>'
|
||||||
|
elem = BeautifulSoup(html, 'html.parser').find('code')
|
||||||
|
lang = self.converter.detect_language(elem, 'x')
|
||||||
|
self.assertEqual(lang, 'python', 'Should detect python from brush: python pattern')
|
||||||
|
|
||||||
|
def test_detect_bare_class_in_pre(self):
|
||||||
|
"""Test bare class name in pre element"""
|
||||||
|
html = '<pre class="python"><code>x</code></pre>'
|
||||||
|
elem = BeautifulSoup(html, 'html.parser').find('code')
|
||||||
|
lang = self.converter.detect_language(elem, 'x')
|
||||||
|
self.assertEqual(lang, 'python', 'Should detect python from bare class name')
|
||||||
|
|
||||||
|
def test_detect_bare_class_in_code(self):
|
||||||
|
"""Test bare class name in code element"""
|
||||||
|
html = '<code class="python">x</code>'
|
||||||
|
elem = BeautifulSoup(html, 'html.parser').find('code')
|
||||||
|
lang = self.converter.detect_language(elem, 'x')
|
||||||
|
self.assertEqual(lang, 'python', 'Should detect python from bare class name')
|
||||||
|
|
||||||
|
|
||||||
class TestPatternExtraction(unittest.TestCase):
|
class TestPatternExtraction(unittest.TestCase):
|
||||||
"""Test pattern extraction from documentation"""
|
"""Test pattern extraction from documentation"""
|
||||||
|
|||||||
Reference in New Issue
Block a user