Files
skill-seekers-reference/tests/test_pdf_extractor.py

636 lines
21 KiB
Python

#!/usr/bin/env python3
"""
Tests for PDF Extractor (cli/pdf_extractor_poc.py)
Tests cover:
- Language detection with confidence scoring
- Code block detection (font, indent, pattern)
- Syntax validation
- Quality scoring
- Chapter detection
- Page chunking
- Code block merging
"""
import sys
import unittest
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "cli"))
try:
import fitz # noqa: F401 PyMuPDF
PYMUPDF_AVAILABLE = True
except ImportError:
PYMUPDF_AVAILABLE = False
class TestLanguageDetection(unittest.TestCase):
"""Test language detection with confidence scoring"""
def setUp(self):
if not PYMUPDF_AVAILABLE:
self.skipTest("PyMuPDF not installed")
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
self.PDFExtractor = PDFExtractor
def test_detect_python_with_confidence(self):
"""Test Python detection returns language and confidence"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
# Initialize language_detector manually (since __init__ not called)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
code = "def hello():\n print('world')\n return True"
language, confidence = extractor.detect_language_from_code(code)
self.assertEqual(language, "python")
self.assertGreater(confidence, 0.4) # Should have reasonable confidence
self.assertLessEqual(confidence, 1.0)
def test_detect_javascript_with_confidence(self):
"""Test JavaScript detection"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
# Initialize language_detector manually (since __init__ not called)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
code = "const handleClick = () => {\n console.log('clicked');\n};"
language, confidence = extractor.detect_language_from_code(code)
self.assertEqual(language, "javascript")
self.assertGreater(confidence, 0.5)
def test_detect_cpp_with_confidence(self):
"""Test C++ detection"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
# Initialize language_detector manually (since __init__ not called)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
code = '#include <iostream>\nint main() {\n std::cout << "Hello";\n}'
language, confidence = extractor.detect_language_from_code(code)
self.assertEqual(language, "cpp")
self.assertGreater(confidence, 0.5)
def test_detect_unknown_low_confidence(self):
"""Test unknown language returns low confidence"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
# Initialize language_detector manually (since __init__ not called)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
code = "this is not code at all just plain text"
language, confidence = extractor.detect_language_from_code(code)
self.assertEqual(language, "unknown")
self.assertLess(confidence, 0.3) # Should be low confidence
def test_confidence_range(self):
"""Test confidence is always between 0 and 1"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
# Initialize language_detector manually (since __init__ not called)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
test_codes = [
"def foo(): pass",
"const x = 10;",
"#include <stdio.h>",
"random text here",
"",
]
for code in test_codes:
_, confidence = extractor.detect_language_from_code(code)
self.assertGreaterEqual(confidence, 0.0)
self.assertLessEqual(confidence, 1.0)
def test_detect_scss_with_confidence(self):
"""Test SCSS detection"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
code = """
$primary-color: #3498db;
@mixin border-radius($radius) {
border-radius: $radius;
}
.button {
color: $primary-color;
@include border-radius(5px);
&:hover {
background: darken($primary-color, 10%);
}
}
"""
language, confidence = extractor.detect_language_from_code(code)
self.assertEqual(language, "scss")
self.assertGreater(confidence, 0.8)
def test_detect_dart_with_confidence(self):
"""Test Dart detection"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
code = """
import 'package:flutter/material.dart';
class MyApp extends StatelessWidget {
@override
Widget build(BuildContext context) {
return MaterialApp(
home: Text('Hello'),
);
}
}
"""
language, confidence = extractor.detect_language_from_code(code)
self.assertEqual(language, "dart")
self.assertGreater(confidence, 0.6)
def test_detect_scala_with_confidence(self):
"""Test Scala detection"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
code = """
case class Person(name: String, age: Int)
object Main extends App {
val person = Person("Alice", 30)
person match {
case Person(n, a) if a >= 18 => println(s"Adult: $n")
case _ => println("Minor")
}
}
"""
language, confidence = extractor.detect_language_from_code(code)
self.assertEqual(language, "scala")
self.assertGreater(confidence, 0.7)
def test_detect_sass_with_confidence(self):
"""Test SASS detection"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
code = """
$primary-color: #3498db
=border-radius($radius)
border-radius: $radius
.button
color: $primary-color
+border-radius(5px)
&:hover
background: darken($primary-color, 10%)
"""
language, confidence = extractor.detect_language_from_code(code)
self.assertEqual(language, "sass")
self.assertGreater(confidence, 0.8)
def test_detect_elixir_with_confidence(self):
"""Test Elixir detection"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
code = """
defmodule MyApp.User do
def greet(name) do
"Hello, #{name}"
end
defp calculate_age(birth_year) do
2024 - birth_year
end
def process(data) do
data
|> String.trim()
|> String.downcase()
|> String.split(",")
end
end
"""
language, confidence = extractor.detect_language_from_code(code)
self.assertEqual(language, "elixir")
self.assertGreater(confidence, 0.8)
def test_detect_lua_with_confidence(self):
"""Test Lua detection"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
code = """
local function calculate_sum(numbers)
local total = 0
for i = 1, #numbers do
total = total + numbers[i]
end
return total
end
local items = {1, 2, 3, 4, 5}
local result = calculate_sum(items)
print("Sum: " .. result)
"""
language, confidence = extractor.detect_language_from_code(code)
self.assertEqual(language, "lua")
self.assertGreater(confidence, 0.7)
def test_detect_perl_with_confidence(self):
"""Test Perl detection"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
code = r"""
#!/usr/bin/perl
use strict;
use warnings;
sub process_line {
my $line = shift;
chomp($line);
if ($line =~ /^(\w+)=(\w+)$/) {
my ($name, $value) = ($1, $2);
return "$name has value $value";
}
return undef;
}
my @lines = ("foo=10", "bar=20");
foreach my $line (@lines) {
my $result = process_line($line);
print $result if defined $result;
}
"""
language, confidence = extractor.detect_language_from_code(code)
self.assertEqual(language, "perl")
self.assertGreater(confidence, 0.8)
class TestSyntaxValidation(unittest.TestCase):
"""Test syntax validation for different languages"""
def setUp(self):
if not PYMUPDF_AVAILABLE:
self.skipTest("PyMuPDF not installed")
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
self.PDFExtractor = PDFExtractor
def test_validate_python_valid(self):
"""Test valid Python syntax"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
code = "def hello():\n print('world')\n return True"
is_valid, issues = extractor.validate_code_syntax(code, "python")
self.assertTrue(is_valid)
self.assertEqual(len(issues), 0)
def test_validate_python_invalid_indentation(self):
"""Test invalid Python indentation"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
code = "def hello():\n print('world')\n\tprint('mixed')" # Mixed tabs and spaces
is_valid, issues = extractor.validate_code_syntax(code, "python")
self.assertFalse(is_valid)
self.assertGreater(len(issues), 0)
def test_validate_python_unbalanced_brackets(self):
"""Test unbalanced brackets"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
code = "x = [[[1, 2, 3" # Severely unbalanced brackets
is_valid, issues = extractor.validate_code_syntax(code, "python")
self.assertFalse(is_valid)
self.assertGreater(len(issues), 0)
def test_validate_javascript_valid(self):
"""Test valid JavaScript syntax"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
code = "const x = () => { return 42; };"
is_valid, issues = extractor.validate_code_syntax(code, "javascript")
self.assertTrue(is_valid)
self.assertEqual(len(issues), 0)
def test_validate_natural_language_fails(self):
"""Test natural language fails validation"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
code = "This is just a regular sentence with the and for and with and that and have and from words."
is_valid, issues = extractor.validate_code_syntax(code, "python")
self.assertFalse(is_valid)
self.assertIn("May be natural language", " ".join(issues))
class TestQualityScoring(unittest.TestCase):
"""Test code quality scoring (0-10 scale)"""
def setUp(self):
if not PYMUPDF_AVAILABLE:
self.skipTest("PyMuPDF not installed")
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
self.PDFExtractor = PDFExtractor
def test_quality_score_range(self):
"""Test quality score is between 0 and 10"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
code = "def hello():\n print('world')"
quality = extractor.score_code_quality(code, "python", 0.8)
self.assertGreaterEqual(quality, 0.0)
self.assertLessEqual(quality, 10.0)
def test_high_quality_code(self):
"""Test high-quality code gets good score"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
code = """def calculate_sum(numbers):
'''Calculate sum of numbers'''
total = 0
for num in numbers:
total += num
return total"""
quality = extractor.score_code_quality(code, "python", 0.9)
self.assertGreater(quality, 6.0) # Should be good quality
def test_low_quality_code(self):
"""Test low-quality code gets low score"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
code = "x" # Too short, no structure
quality = extractor.score_code_quality(code, "unknown", 0.1)
self.assertLess(quality, 6.0) # Should be low quality
def test_quality_factors(self):
"""Test that quality considers multiple factors"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
# Good: proper structure, indentation, confidence
good_code = "def foo():\n return bar()"
good_quality = extractor.score_code_quality(good_code, "python", 0.9)
# Bad: no structure, low confidence
bad_code = "some text"
bad_quality = extractor.score_code_quality(bad_code, "unknown", 0.1)
self.assertGreater(good_quality, bad_quality)
class TestChapterDetection(unittest.TestCase):
"""Test chapter/section detection"""
def setUp(self):
if not PYMUPDF_AVAILABLE:
self.skipTest("PyMuPDF not installed")
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
self.PDFExtractor = PDFExtractor
def test_detect_chapter_with_number(self):
"""Test chapter detection with number"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
page_data = {
"text": "Chapter 1: Introduction to Python\nThis is the first chapter.",
"headings": [],
}
is_chapter, title = extractor.detect_chapter_start(page_data)
self.assertTrue(is_chapter)
self.assertIsNotNone(title)
def test_detect_chapter_uppercase(self):
"""Test chapter detection with uppercase"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
page_data = {
"text": "Chapter 1\nThis is the introduction", # Pattern requires Chapter + digit
"headings": [],
}
is_chapter, title = extractor.detect_chapter_start(page_data)
self.assertTrue(is_chapter)
def test_detect_section_heading(self):
"""Test section heading detection"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
page_data = {"text": "2. Getting Started\nThis is a section.", "headings": []}
is_chapter, title = extractor.detect_chapter_start(page_data)
self.assertTrue(is_chapter)
def test_not_chapter(self):
"""Test normal text is not detected as chapter"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
page_data = {
"text": "This is just normal paragraph text without any chapter markers.",
"headings": [],
}
is_chapter, title = extractor.detect_chapter_start(page_data)
self.assertFalse(is_chapter)
class TestCodeBlockMerging(unittest.TestCase):
"""Test code block merging across pages"""
def setUp(self):
if not PYMUPDF_AVAILABLE:
self.skipTest("PyMuPDF not installed")
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
self.PDFExtractor = PDFExtractor
def test_merge_continued_blocks(self):
"""Test merging code blocks split across pages"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
extractor.verbose = False # Initialize verbose attribute
pages = [
{
"page_number": 1,
"code_samples": [
{
"code": "def hello():",
"language": "python",
"detection_method": "pattern",
}
],
"code_blocks_count": 1,
},
{
"page_number": 2,
"code_samples": [
{
"code": ' print("world")',
"language": "python",
"detection_method": "pattern",
}
],
"code_blocks_count": 1,
},
]
merged = extractor.merge_continued_code_blocks(pages)
# Should have merged the two blocks
self.assertIn("def hello():", merged[0]["code_samples"][0]["code"])
self.assertIn('print("world")', merged[0]["code_samples"][0]["code"])
def test_no_merge_different_languages(self):
"""Test blocks with different languages are not merged"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
pages = [
{
"page_number": 1,
"code_samples": [
{
"code": "def foo():",
"language": "python",
"detection_method": "pattern",
}
],
"code_blocks_count": 1,
},
{
"page_number": 2,
"code_samples": [
{
"code": "const x = 10;",
"language": "javascript",
"detection_method": "pattern",
}
],
"code_blocks_count": 1,
},
]
merged = extractor.merge_continued_code_blocks(pages)
# Should NOT merge different languages
self.assertEqual(len(merged[0]["code_samples"]), 1)
self.assertEqual(len(merged[1]["code_samples"]), 1)
class TestCodeDetectionMethods(unittest.TestCase):
"""Test different code detection methods"""
def setUp(self):
if not PYMUPDF_AVAILABLE:
self.skipTest("PyMuPDF not installed")
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
self.PDFExtractor = PDFExtractor
def test_pattern_based_detection(self):
"""Test pattern-based code detection"""
_extractor = self.PDFExtractor.__new__(self.PDFExtractor)
# Should detect function definitions
text = "Here is an example:\ndef calculate(x, y):\n return x + y"
# Pattern-based detection should find this
# (implementation details depend on pdf_extractor_poc.py)
self.assertIn("def ", text)
self.assertIn("return", text)
def test_indent_based_detection(self):
"""Test indent-based code detection"""
_extractor = self.PDFExtractor.__new__(self.PDFExtractor)
# Code with consistent indentation
indented_text = """ def foo():
return bar()"""
# Should detect as code due to indentation
self.assertTrue(indented_text.startswith(" " * 4))
class TestQualityFiltering(unittest.TestCase):
"""Test quality-based filtering"""
def setUp(self):
if not PYMUPDF_AVAILABLE:
self.skipTest("PyMuPDF not installed")
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
self.PDFExtractor = PDFExtractor
def test_filter_by_min_quality(self):
"""Test filtering code blocks by minimum quality"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
extractor.min_quality = 5.0
# High quality block
high_quality = {
"code": "def calculate():\n return 42",
"language": "python",
"quality": 8.0,
}
# Low quality block
low_quality = {"code": "x", "language": "unknown", "quality": 2.0}
# Only high quality should pass
self.assertGreaterEqual(high_quality["quality"], extractor.min_quality)
self.assertLess(low_quality["quality"], extractor.min_quality)
if __name__ == "__main__":
unittest.main()