run ruff

2026-01-17 17:29:21 +00:00
parent c89f059712
commit 5ed767ff9a
144 changed files with 14142 additions and 16488 deletions
--- a/tests/test_pdf_extractor.py
+++ b/tests/test_pdf_extractor.py
@@ -12,8 +12,8 @@ Tests cover:
 - Code block merging
 """

-import unittest
 import sys
+import unittest
 from pathlib import Path

 # Add parent directory to path for imports
@@ -21,6 +21,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent / "cli"))

 try:
    import fitz  # PyMuPDF
+
    PYMUPDF_AVAILABLE = True
 except ImportError:
    PYMUPDF_AVAILABLE = False
@@ -33,6 +34,7 @@ class TestLanguageDetection(unittest.TestCase):
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
+
        self.PDFExtractor = PDFExtractor

    def test_detect_python_with_confidence(self):
@@ -40,6 +42,7 @@ class TestLanguageDetection(unittest.TestCase):
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        # Initialize language_detector manually (since __init__ not called)
        from skill_seekers.cli.language_detector import LanguageDetector
+
        extractor.language_detector = LanguageDetector(min_confidence=0.15)

        code = "def hello():\n    print('world')\n    return True"
@@ -55,6 +58,7 @@ class TestLanguageDetection(unittest.TestCase):
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        # Initialize language_detector manually (since __init__ not called)
        from skill_seekers.cli.language_detector import LanguageDetector
+
        extractor.language_detector = LanguageDetector(min_confidence=0.15)

        code = "const handleClick = () => {\n  console.log('clicked');\n};"
@@ -69,9 +73,10 @@ class TestLanguageDetection(unittest.TestCase):
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        # Initialize language_detector manually (since __init__ not called)
        from skill_seekers.cli.language_detector import LanguageDetector
+
        extractor.language_detector = LanguageDetector(min_confidence=0.15)

-        code = "#include <iostream>\nint main() {\n  std::cout << \"Hello\";\n}"
+        code = '#include <iostream>\nint main() {\n  std::cout << "Hello";\n}'

        language, confidence = extractor.detect_language_from_code(code)

@@ -83,6 +88,7 @@ class TestLanguageDetection(unittest.TestCase):
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        # Initialize language_detector manually (since __init__ not called)
        from skill_seekers.cli.language_detector import LanguageDetector
+
        extractor.language_detector = LanguageDetector(min_confidence=0.15)

        code = "this is not code at all just plain text"
@@ -97,15 +103,10 @@ class TestLanguageDetection(unittest.TestCase):
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        # Initialize language_detector manually (since __init__ not called)
        from skill_seekers.cli.language_detector import LanguageDetector
+
        extractor.language_detector = LanguageDetector(min_confidence=0.15)

-        test_codes = [
-            "def foo(): pass",
-            "const x = 10;",
-            "#include <stdio.h>",
-            "random text here",
-            ""
-        ]
+        test_codes = ["def foo(): pass", "const x = 10;", "#include <stdio.h>", "random text here", ""]

        for code in test_codes:
            _, confidence = extractor.detect_language_from_code(code)
@@ -120,6 +121,7 @@ class TestSyntaxValidation(unittest.TestCase):
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
+
        self.PDFExtractor = PDFExtractor

    def test_validate_python_valid(self):
@@ -170,7 +172,7 @@ class TestSyntaxValidation(unittest.TestCase):
        is_valid, issues = extractor.validate_code_syntax(code, "python")

        self.assertFalse(is_valid)
-        self.assertIn('May be natural language', ' '.join(issues))
+        self.assertIn("May be natural language", " ".join(issues))


 class TestQualityScoring(unittest.TestCase):
@@ -180,6 +182,7 @@ class TestQualityScoring(unittest.TestCase):
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
+
        self.PDFExtractor = PDFExtractor

    def test_quality_score_range(self):
@@ -237,15 +240,13 @@ class TestChapterDetection(unittest.TestCase):
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
+
        self.PDFExtractor = PDFExtractor

    def test_detect_chapter_with_number(self):
        """Test chapter detection with number"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
-        page_data = {
-            'text': 'Chapter 1: Introduction to Python\nThis is the first chapter.',
-            'headings': []
-        }
+        page_data = {"text": "Chapter 1: Introduction to Python\nThis is the first chapter.", "headings": []}

        is_chapter, title = extractor.detect_chapter_start(page_data)

@@ -256,8 +257,8 @@ class TestChapterDetection(unittest.TestCase):
        """Test chapter detection with uppercase"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        page_data = {
-            'text': 'Chapter 1\nThis is the introduction',  # Pattern requires Chapter + digit
-            'headings': []
+            "text": "Chapter 1\nThis is the introduction",  # Pattern requires Chapter + digit
+            "headings": [],
        }

        is_chapter, title = extractor.detect_chapter_start(page_data)
@@ -267,10 +268,7 @@ class TestChapterDetection(unittest.TestCase):
    def test_detect_section_heading(self):
        """Test section heading detection"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
-        page_data = {
-            'text': '2. Getting Started\nThis is a section.',
-            'headings': []
-        }
+        page_data = {"text": "2. Getting Started\nThis is a section.", "headings": []}

        is_chapter, title = extractor.detect_chapter_start(page_data)

@@ -279,10 +277,7 @@ class TestChapterDetection(unittest.TestCase):
    def test_not_chapter(self):
        """Test normal text is not detected as chapter"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
-        page_data = {
-            'text': 'This is just normal paragraph text without any chapter markers.',
-            'headings': []
-        }
+        page_data = {"text": "This is just normal paragraph text without any chapter markers.", "headings": []}

        is_chapter, title = extractor.detect_chapter_start(page_data)

@@ -296,6 +291,7 @@ class TestCodeBlockMerging(unittest.TestCase):
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
+
        self.PDFExtractor = PDFExtractor

    def test_merge_continued_blocks(self):
@@ -305,26 +301,22 @@ class TestCodeBlockMerging(unittest.TestCase):

        pages = [
            {
-                'page_number': 1,
-                'code_samples': [
-                    {'code': 'def hello():', 'language': 'python', 'detection_method': 'pattern'}
-                ],
-                'code_blocks_count': 1
+                "page_number": 1,
+                "code_samples": [{"code": "def hello():", "language": "python", "detection_method": "pattern"}],
+                "code_blocks_count": 1,
            },
            {
-                'page_number': 2,
-                'code_samples': [
-                    {'code': '    print("world")', 'language': 'python', 'detection_method': 'pattern'}
-                ],
-                'code_blocks_count': 1
-            }
+                "page_number": 2,
+                "code_samples": [{"code": '    print("world")', "language": "python", "detection_method": "pattern"}],
+                "code_blocks_count": 1,
+            },
        ]

        merged = extractor.merge_continued_code_blocks(pages)

        # Should have merged the two blocks
-        self.assertIn('def hello():', merged[0]['code_samples'][0]['code'])
-        self.assertIn('print("world")', merged[0]['code_samples'][0]['code'])
+        self.assertIn("def hello():", merged[0]["code_samples"][0]["code"])
+        self.assertIn('print("world")', merged[0]["code_samples"][0]["code"])

    def test_no_merge_different_languages(self):
        """Test blocks with different languages are not merged"""
@@ -332,26 +324,22 @@ class TestCodeBlockMerging(unittest.TestCase):

        pages = [
            {
-                'page_number': 1,
-                'code_samples': [
-                    {'code': 'def foo():', 'language': 'python', 'detection_method': 'pattern'}
-                ],
-                'code_blocks_count': 1
+                "page_number": 1,
+                "code_samples": [{"code": "def foo():", "language": "python", "detection_method": "pattern"}],
+                "code_blocks_count": 1,
            },
            {
-                'page_number': 2,
-                'code_samples': [
-                    {'code': 'const x = 10;', 'language': 'javascript', 'detection_method': 'pattern'}
-                ],
-                'code_blocks_count': 1
-            }
+                "page_number": 2,
+                "code_samples": [{"code": "const x = 10;", "language": "javascript", "detection_method": "pattern"}],
+                "code_blocks_count": 1,
+            },
        ]

        merged = extractor.merge_continued_code_blocks(pages)

        # Should NOT merge different languages
-        self.assertEqual(len(merged[0]['code_samples']), 1)
-        self.assertEqual(len(merged[1]['code_samples']), 1)
+        self.assertEqual(len(merged[0]["code_samples"]), 1)
+        self.assertEqual(len(merged[1]["code_samples"]), 1)


 class TestCodeDetectionMethods(unittest.TestCase):
@@ -361,6 +349,7 @@ class TestCodeDetectionMethods(unittest.TestCase):
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
+
        self.PDFExtractor = PDFExtractor

    def test_pattern_based_detection(self):
@@ -394,6 +383,7 @@ class TestQualityFiltering(unittest.TestCase):
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
+
        self.PDFExtractor = PDFExtractor

    def test_filter_by_min_quality(self):
@@ -402,23 +392,15 @@ class TestQualityFiltering(unittest.TestCase):
        extractor.min_quality = 5.0

        # High quality block
-        high_quality = {
-            'code': 'def calculate():\n    return 42',
-            'language': 'python',
-            'quality': 8.0
-        }
+        high_quality = {"code": "def calculate():\n    return 42", "language": "python", "quality": 8.0}

        # Low quality block
-        low_quality = {
-            'code': 'x',
-            'language': 'unknown',
-            'quality': 2.0
-        }
+        low_quality = {"code": "x", "language": "unknown", "quality": 2.0}

        # Only high quality should pass
-        self.assertGreaterEqual(high_quality['quality'], extractor.min_quality)
-        self.assertLess(low_quality['quality'], extractor.min_quality)
+        self.assertGreaterEqual(high_quality["quality"], extractor.min_quality)
+        self.assertLess(low_quality["quality"], extractor.min_quality)


-if __name__ == '__main__':
+if __name__ == "__main__":
    unittest.main()