#!/usr/bin/env python3
"""
Comprehensive Test Suite for LanguageDetector
Tests confidence-based language detection for 20+ programming languages.
Includes Unity C# patterns, CSS class detection, and edge cases.
Run with: pytest tests/test_language_detector.py -v
"""
import pytest
from bs4 import BeautifulSoup
from skill_seekers.cli.language_detector import LanguageDetector
class TestCSSClassDetection:
"""Test language detection from CSS classes"""
def test_language_prefix(self):
"""Test language- prefix pattern"""
detector = LanguageDetector()
classes = ["language-python", "highlight"]
assert detector.extract_language_from_classes(classes) == "python"
classes = ["language-javascript"]
assert detector.extract_language_from_classes(classes) == "javascript"
def test_lang_prefix(self):
"""Test lang- prefix pattern"""
detector = LanguageDetector()
classes = ["lang-java", "code"]
assert detector.extract_language_from_classes(classes) == "java"
classes = ["lang-typescript"]
assert detector.extract_language_from_classes(classes) == "typescript"
def test_brush_pattern(self):
"""Test brush: pattern"""
detector = LanguageDetector()
classes = ["brush: php"]
assert detector.extract_language_from_classes(classes) == "php"
classes = ["brush: csharp"]
assert detector.extract_language_from_classes(classes) == "csharp"
def test_bare_class_name(self):
"""Test bare language name as class"""
detector = LanguageDetector()
classes = ["python", "highlight"]
assert detector.extract_language_from_classes(classes) == "python"
classes = ["rust"]
assert detector.extract_language_from_classes(classes) == "rust"
def test_unknown_language(self):
"""Test unknown language class"""
detector = LanguageDetector()
classes = ["language-foobar"]
assert detector.extract_language_from_classes(classes) is None
classes = ["highlight", "code"]
assert detector.extract_language_from_classes(classes) is None
def test_empty_classes(self):
"""Test empty class list"""
detector = LanguageDetector()
assert detector.extract_language_from_classes([]) is None
assert detector.extract_language_from_classes(None) is None
def test_detect_from_html_with_css_class(self):
"""Test HTML element with CSS class"""
detector = LanguageDetector()
# Create mock element
html = 'print("hello")'
soup = BeautifulSoup(html, "html.parser")
elem = soup.find("code")
lang, confidence = detector.detect_from_html(elem, 'print("hello")')
assert lang == "python"
assert confidence == 1.0 # CSS class = high confidence
def test_detect_from_html_with_parent_class(self):
"""Test parent
element with CSS class"""
detector = LanguageDetector()
# Parent has class, child doesn't
html = 'System.out.println("hello");
'
soup = BeautifulSoup(html, "html.parser")
elem = soup.find("code")
lang, confidence = detector.detect_from_html(elem, 'System.out.println("hello");')
assert lang == "java"
assert confidence == 1.0
class TestUnityCSharpDetection:
"""Test Unity C# specific patterns (CRITICAL - User's Primary Issue)"""
def test_unity_monobehaviour_detection(self):
"""Test Unity MonoBehaviour class detection"""
detector = LanguageDetector()
code = """
using UnityEngine;
public class Player : MonoBehaviour
{
[SerializeField]
private float speed = 5.0f;
void Start() { }
void Update() { }
}
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "csharp"
assert confidence >= 0.9 # High confidence (Unity patterns)
def test_unity_lifecycle_methods(self):
"""Test Unity lifecycle method detection"""
detector = LanguageDetector()
code = """
void Awake() { }
void Start() { }
void Update() { }
void FixedUpdate() { }
void LateUpdate() { }
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "csharp"
assert confidence >= 0.5
def test_unity_coroutine_detection(self):
"""Test Unity coroutine detection"""
detector = LanguageDetector()
code = """
IEnumerator Wait()
{
yield return new WaitForSeconds(1);
}
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "csharp"
assert confidence >= 0.4
def test_unity_serializefield_attribute(self):
"""Test Unity attribute detection"""
detector = LanguageDetector()
code = """
[SerializeField]
private GameObject player;
[RequireComponent(typeof(Rigidbody))]
public class Test : MonoBehaviour { }
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "csharp"
assert confidence >= 0.7
def test_unity_types(self):
"""Test Unity type detection (GameObject, Transform, etc.)"""
detector = LanguageDetector()
code = """
GameObject obj = new GameObject();
Transform transform = obj.transform;
Vector3 position = transform.position;
Rigidbody rb = obj.GetComponent();
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "csharp"
assert confidence >= 0.3
def test_unity_namespace(self):
"""Test Unity namespace detection"""
detector = LanguageDetector()
code = "using UnityEngine;"
lang, confidence = detector.detect_from_code(code)
# Short code, but very specific Unity pattern (19 chars)
# Now detects due to lowered min length threshold (10 chars)
assert lang == "csharp"
assert confidence >= 0.5
# Longer version
code = """
using UnityEngine;
using System.Collections;
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "csharp"
assert confidence >= 0.5
def test_generic_csharp_vs_unity(self):
"""Test generic C# doesn't false-positive as Unity"""
detector = LanguageDetector()
# Generic C# code
code = """
using System;
public class Program
{
static void Main(string[] args)
{
Console.WriteLine("Hello");
}
}
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "csharp"
# Confidence should be high (contains multiple C# patterns)
# No Unity-specific patterns, but Console.WriteLine is strong indicator
assert 0.7 <= confidence <= 1.0
def test_unity_minimal_code(self):
"""Test minimal Unity code (edge case)"""
detector = LanguageDetector()
code = "void Update() { Time.deltaTime; }"
lang, confidence = detector.detect_from_code(code)
assert lang == "csharp"
assert confidence >= 0.3 # Low but detected
def test_unity_input_system(self):
"""Test Unity Input system detection"""
detector = LanguageDetector()
code = """
float horizontal = Input.GetAxis("Horizontal");
if (Input.GetKeyDown(KeyCode.Space)) { }
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "csharp"
assert confidence >= 0.4
def test_unity_full_script(self):
"""Test complete Unity script (high confidence expected)"""
detector = LanguageDetector()
code = """
using UnityEngine;
using System.Collections;
public class PlayerController : MonoBehaviour
{
[SerializeField]
private float speed = 5.0f;
[SerializeField]
private Rigidbody rb;
void Awake()
{
rb = GetComponent();
}
void Update()
{
float moveH = Input.GetAxis("Horizontal");
float moveV = Input.GetAxis("Vertical");
Vector3 movement = new Vector3(moveH, 0, moveV);
rb.AddForce(movement * speed);
}
IEnumerator DashCoroutine()
{
speed *= 2;
yield return new WaitForSeconds(0.5f);
speed /= 2;
}
}
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "csharp"
assert confidence >= 0.9 # Very high confidence (many Unity patterns)
class TestLanguageDetection:
"""Test detection for major programming languages"""
def test_python_detection(self):
"""Test Python code detection"""
detector = LanguageDetector()
code = """
def calculate(x, y):
result = x + y
return result
class MyClass:
def __init__(self):
self.value = 0
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "python"
assert confidence >= 0.5
def test_javascript_detection(self):
"""Test JavaScript code detection"""
detector = LanguageDetector()
code = """
const add = (a, b) => a + b;
function calculate() {
let result = 0;
console.log(result);
return result;
}
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "javascript"
assert confidence >= 0.5
def test_typescript_detection(self):
"""Test TypeScript code detection"""
detector = LanguageDetector()
code = """
interface User {
name: string;
age: number;
}
type ID = string | number;
function getUser(): User {
return { name: "John", age: 30 };
}
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "typescript"
assert confidence >= 0.7
def test_java_detection(self):
"""Test Java code detection"""
detector = LanguageDetector()
code = """
public class Hello {
public static void main(String[] args) {
System.out.println("Hello World");
}
}
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "java"
assert confidence >= 0.6
def test_go_detection(self):
"""Test Go code detection"""
detector = LanguageDetector()
code = """
package main
import "fmt"
func main() {
message := "Hello, World"
fmt.Println(message)
}
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "go"
assert confidence >= 0.6
def test_rust_detection(self):
"""Test Rust code detection"""
detector = LanguageDetector()
code = """
fn main() {
let mut x = 5;
println!("The value is: {}", x);
match x {
1 => println!("One"),
_ => println!("Other"),
}
}
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "rust"
assert confidence >= 0.6
def test_php_detection(self):
"""Test PHP code detection"""
detector = LanguageDetector()
code = """
name;
}
}
?>
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "php"
assert confidence >= 0.7
def test_jsx_detection(self):
"""Test JSX code detection"""
detector = LanguageDetector()
code = """
const Button = () => {
const [count, setCount] = useState(0);
return (
);
};
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "jsx"
assert confidence >= 0.5
def test_vue_detection(self):
"""Test Vue SFC detection"""
detector = LanguageDetector()
code = """
{{ message }}
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "vue"
assert confidence >= 0.7
def test_sql_detection(self):
"""Test SQL code detection"""
detector = LanguageDetector()
code = """
SELECT users.name, orders.total
FROM users
JOIN orders ON users.id = orders.user_id
WHERE orders.status = 'completed'
ORDER BY orders.total DESC;
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "sql"
assert confidence >= 0.6
class TestEdgeCases:
"""Test edge cases and error handling"""
def test_short_code_snippet(self):
"""Test code snippet too short for detection"""
detector = LanguageDetector()
code = "x = 5"
lang, confidence = detector.detect_from_code(code)
assert lang == "unknown"
assert confidence == 0.0
def test_empty_code(self):
"""Test empty code string"""
detector = LanguageDetector()
lang, confidence = detector.detect_from_code("")
assert lang == "unknown"
assert confidence == 0.0
def test_whitespace_only(self):
"""Test whitespace-only code"""
detector = LanguageDetector()
code = " \n \n "
lang, confidence = detector.detect_from_code(code)
assert lang == "unknown"
assert confidence == 0.0
def test_comments_only(self):
"""Test code with only comments"""
detector = LanguageDetector()
code = """
// This is a comment
// Another comment
/* More comments */
"""
lang, confidence = detector.detect_from_code(code)
# Should return unknown or very low confidence
assert confidence < 0.5
def test_mixed_languages(self):
"""Test code with multiple language patterns"""
detector = LanguageDetector()
# HTML with embedded JavaScript
code = """
"""
lang, confidence = detector.detect_from_code(code)
# Should detect strongest pattern
# Both html and javascript patterns present
assert lang in ["html", "javascript"]
def test_confidence_threshold(self):
"""Test minimum confidence threshold"""
# Create detector with high threshold
detector = LanguageDetector(min_confidence=0.7)
# Code with weak patterns (low confidence)
code = "var x = 5; const y = 10;"
lang, confidence = detector.detect_from_code(code)
# If confidence < 0.7, should return unknown
if confidence < 0.7:
assert lang == "unknown"
def test_html_with_embedded_css(self):
"""Test HTML with embedded CSS"""
detector = LanguageDetector()
code = """
"""
lang, confidence = detector.detect_from_code(code)
assert lang in ["html", "css"]
def test_case_insensitive_patterns(self):
"""Test that patterns are case-insensitive"""
detector = LanguageDetector()
# SQL with different cases
code = """
select users.name
FROM users
where users.status = 'active'
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "sql"
def test_r_language_detection(self):
"""Test R language detection (edge case: single letter)"""
detector = LanguageDetector()
code = """
library(ggplot2)
data <- read.csv("data.csv")
summary(data)
ggplot(data, aes(x = x, y = y)) +
geom_point()
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "r"
assert confidence >= 0.5
def test_julia_detection(self):
"""Test Julia language detection"""
detector = LanguageDetector()
code = """
function calculate(x, y)
result = x + y
return result
end
using Statistics
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "julia"
assert confidence >= 0.3
def test_gdscript_detection(self):
"""Test GDScript (Godot) detection"""
detector = LanguageDetector()
code = """
extends Node2D
var speed = 100
func _ready():
pass
func _process(delta):
position.x += speed * delta
"""
lang, confidence = detector.detect_from_code(code)
assert lang == "gdscript"
assert confidence >= 0.5
def test_multiple_confidence_scores(self):
"""Test that multiple languages can have scores"""
detector = LanguageDetector()
# Code that matches both C# and Java patterns
code = """
public class Test {
public static void main() {
System.out.println("hello");
}
}
"""
lang, confidence = detector.detect_from_code(code)
# Should detect the one with highest confidence
assert lang in ["csharp", "java"]
assert confidence > 0.0
class TestIntegration:
"""Integration tests with doc_scraper patterns"""
def test_detect_from_html_fallback_to_patterns(self):
"""Test fallback from CSS classes to pattern matching"""
detector = LanguageDetector()
# Element without CSS classes
html = "def test(): pass"
soup = BeautifulSoup(html, "html.parser")
elem = soup.find("code")
lang, confidence = detector.detect_from_html(elem, "def test(): pass")
# Should fallback to pattern matching
# Now detects due to lowered min length threshold (10 chars)
assert lang == "python"
assert confidence >= 0.2
def test_backward_compatibility_with_doc_scraper(self):
"""Test that detector can be used as drop-in replacement"""
detector = LanguageDetector()
# Simulate doc_scraper.py usage
html = 'import os\nprint("hello")'
soup = BeautifulSoup(html, "html.parser")
elem = soup.find("code")
code = elem.get_text()
# This is how doc_scraper.py would call it
lang, confidence = detector.detect_from_html(elem, code)
# Should work exactly as before (returning string)
assert isinstance(lang, str)
assert isinstance(confidence, float)
assert lang == "python"
assert 0.0 <= confidence <= 1.0
if __name__ == "__main__":
pytest.main([__file__, "-v"])