From 427ea176c67ddf0cb6ef2b61d6daf617d6c58076 Mon Sep 17 00:00:00 2001 From: pawu Date: Sun, 1 Feb 2026 15:15:30 +0530 Subject: [PATCH 1/2] feat: Add Dart, Scala, SCSS, SASS, Elixir, Lua, Perl language detection resolves #165 --- src/skill_seekers/cli/language_detector.py | 83 +++++++++ tests/test_pdf_extractor.py | 201 ++++++++++++++++++++- 2 files changed, 282 insertions(+), 2 deletions(-) diff --git a/src/skill_seekers/cli/language_detector.py b/src/skill_seekers/cli/language_detector.py index ff1b1cf..4510d85 100644 --- a/src/skill_seekers/cli/language_detector.py +++ b/src/skill_seekers/cli/language_detector.py @@ -310,6 +310,67 @@ LANGUAGE_PATTERNS: dict[str, list[tuple[str, int]]] = { (r"\b_ready\s*\(", 4), (r"\b_process\s*\(", 4), ], + "dart": [ + (r"\bimport\s+['\"]package:", 5), + (r"\bclass\s+\w+\s+extends\s+StatelessWidget", 5), + (r"\bclass\s+\w+\s+extends\s+StatefulWidget", 5), + (r"@override\b", 4), + (r"\bWidget\s+build\s*\(", 5), + (r"\bimport\s+['\"]dart:", 5), + (r"\bfinal\s+\w+\s+\w+;", 4), + (r"=>\s*\w+\(", 4), + (r"\basync\s*\{", 3), + (r"\bawait\s+", 3), + (r"\bsetState\s*\(", 4), + (r"\bvoid\s+main\s*\(", 3), + ], + "scala": [ + (r"\bcase\s+class\s+\w+", 5), + (r"\btrait\s+\w+", 5), + (r"\bdef\s+\w+.*:\s*\w+", 5), + (r"\bimport\s+scala\.", 4), + (r"\bmatch\s*\{", 4), + (r"\bval\s+\w+.*:\s*\w+", 4), + (r"\bobject\s+\w+", 5), + (r"=>", 3), + (r"\bdef\s+\w+\[\w+\]", 4), + (r"\bextends\s+\w+", 2), + ], + "elixir": [ + (r"\bdefmodule\s+[A-Z]", 5), + (r"\bdef\s+\w+\s+do\b", 5), + (r"\bdefp\s+\w+", 5), + (r"\|>", 5), + (r"\buse\s+[A-Z]", 4), + (r"\balias\s+[A-Z]", 4), + (r"#\{", 4), + (r"@[\w_]+", 3), + (r"\bcase\s+\w+\s+do\b", 3), + ], + "lua": [ + (r"\blocal\s+\w+\s*=", 5), + (r"\.\.\.(?!\.)", 5), + (r"\brepeat\b.*\buntil\b", 5), + (r"~=", 4), + (r"\belseif\b", 4), + (r"\bthen\b", 3), + (r"\bfunction\s+\w+\s*\(", 3), + (r"\bend\b", 2), + ], + "perl": [ + (r"\bmy\s+\$\w+", 5), + (r"\buse\s+strict", 5), + (r"\buse\s+warnings", 5), + (r"\bsub\s+\w+\s*\{", 5), + (r"\bchomp\s*\(", 5), + (r"@\w+\s*=", 5), + (r"%\w+\s*=", 5), + (r"\$\w+\s*=~\s*/", 4), + (r"\$[0-9]+", 4), + (r"->", 3), + ], + + # ===== Markup/Config Languages ===== "html": [ (r"", 5), @@ -327,6 +388,28 @@ LANGUAGE_PATTERNS: dict[str, list[tuple[str, int]]] = { (r"#[\w-]+\s*\{", 2), (r"@import", 2), ], + "scss": [ + (r"\$[\w-]+\s*:", 5), + (r"@mixin\s+[\w-]+", 5), + (r"@include\s+[\w-]+", 5), + (r"@extend\s+", 4), + (r"@function\s+[\w-]+", 4), + (r"&[:\.]", 4), + (r"#\{", 4), + (r"@import\s+['\"]", 3), + (r"@if\s+", 5), + (r"@for\s+", 5), + (r"@each\s+", 5), + ], + "sass": [ + (r"\$[\w-]+\s*:", 5), + (r"=[\w-]+", 5), + (r"\+[\w-]+", 5), + (r"@for\s+.+\s+through\s+", 5), + (r"@mixin\s+[\w-]+", 4), + (r"@if\s+", 4), + (r"^\s{2,}[\w-]+:", 3), + ], "json": [ (r"^\s*\{", 3), (r"^\s*\[", 3), diff --git a/tests/test_pdf_extractor.py b/tests/test_pdf_extractor.py index c0e321e..4cd23a5 100644 --- a/tests/test_pdf_extractor.py +++ b/tests/test_pdf_extractor.py @@ -119,6 +119,195 @@ class TestLanguageDetection(unittest.TestCase): self.assertGreaterEqual(confidence, 0.0) self.assertLessEqual(confidence, 1.0) + def test_detect_scss_with_confidence(self): + """Test SCSS detection""" + extractor = self.PDFExtractor.__new__(self.PDFExtractor) + from skill_seekers.cli.language_detector import LanguageDetector + + extractor.language_detector = LanguageDetector(min_confidence=0.15) + + code = """ + $primary-color: #3498db; + + @mixin border-radius($radius) { + border-radius: $radius; + } + + .button { + color: $primary-color; + @include border-radius(5px); + + &:hover { + background: darken($primary-color, 10%); + } + } + """ + + language, confidence = extractor.detect_language_from_code(code) + self.assertEqual(language, "scss") + self.assertGreater(confidence, 0.8) + + def test_detect_dart_with_confidence(self): + """Test Dart detection""" + extractor = self.PDFExtractor.__new__(self.PDFExtractor) + from skill_seekers.cli.language_detector import LanguageDetector + + extractor.language_detector = LanguageDetector(min_confidence=0.15) + + code = """ + import 'package:flutter/material.dart'; + + class MyApp extends StatelessWidget { + @override + Widget build(BuildContext context) { + return MaterialApp( + home: Text('Hello'), + ); + } + } + """ + + language, confidence = extractor.detect_language_from_code(code) + self.assertEqual(language, "dart") + self.assertGreater(confidence, 0.6) + + def test_detect_scala_with_confidence(self): + """Test Scala detection""" + extractor = self.PDFExtractor.__new__(self.PDFExtractor) + from skill_seekers.cli.language_detector import LanguageDetector + + extractor.language_detector = LanguageDetector(min_confidence=0.15) + + code = """ + case class Person(name: String, age: Int) + + object Main extends App { + val person = Person("Alice", 30) + person match { + case Person(n, a) if a >= 18 => println(s"Adult: $n") + case _ => println("Minor") + } + } + """ + + language, confidence = extractor.detect_language_from_code(code) + self.assertEqual(language, "scala") + self.assertGreater(confidence, 0.7) + + def test_detect_sass_with_confidence(self): + """Test SASS detection""" + extractor = self.PDFExtractor.__new__(self.PDFExtractor) + from skill_seekers.cli.language_detector import LanguageDetector + + extractor.language_detector = LanguageDetector(min_confidence=0.15) + + code = """ + $primary-color: #3498db + + =border-radius($radius) + border-radius: $radius + + .button + color: $primary-color + +border-radius(5px) + + &:hover + background: darken($primary-color, 10%) + """ + + language, confidence = extractor.detect_language_from_code(code) + self.assertEqual(language, "sass") + self.assertGreater(confidence, 0.8) + + def test_detect_elixir_with_confidence(self): + """Test Elixir detection""" + extractor = self.PDFExtractor.__new__(self.PDFExtractor) + from skill_seekers.cli.language_detector import LanguageDetector + + extractor.language_detector = LanguageDetector(min_confidence=0.15) + + code = """ + defmodule MyApp.User do + def greet(name) do + "Hello, #{name}" + end + + defp calculate_age(birth_year) do + 2024 - birth_year + end + + def process(data) do + data + |> String.trim() + |> String.downcase() + |> String.split(",") + end + end + """ + + language, confidence = extractor.detect_language_from_code(code) + self.assertEqual(language, "elixir") + self.assertGreater(confidence, 0.8) + + def test_detect_lua_with_confidence(self): + """Test Lua detection""" + extractor = self.PDFExtractor.__new__(self.PDFExtractor) + from skill_seekers.cli.language_detector import LanguageDetector + + extractor.language_detector = LanguageDetector(min_confidence=0.15) + + code = """ + local function calculate_sum(numbers) + local total = 0 + for i = 1, #numbers do + total = total + numbers[i] + end + return total + end + + local items = {1, 2, 3, 4, 5} + local result = calculate_sum(items) + print("Sum: " .. result) + """ + + language, confidence = extractor.detect_language_from_code(code) + self.assertEqual(language, "lua") + self.assertGreater(confidence, 0.7) + + def test_detect_perl_with_confidence(self): + """Test Perl detection""" + extractor = self.PDFExtractor.__new__(self.PDFExtractor) + from skill_seekers.cli.language_detector import LanguageDetector + + extractor.language_detector = LanguageDetector(min_confidence=0.15) + + code = """ + #!/usr/bin/perl + use strict; + use warnings; + + sub process_line { + my $line = shift; + chomp($line); + + if ($line =~ /^(\w+)=(\w+)$/) { + my ($name, $value) = ($1, $2); + return "$name has value $value"; + } + return undef; + } + + my @lines = ("foo=10", "bar=20"); + foreach my $line (@lines) { + my $result = process_line($line); + print $result if defined $result; + } + """ + + language, confidence = extractor.detect_language_from_code(code) + self.assertEqual(language, "perl") + self.assertGreater(confidence, 0.8) + class TestSyntaxValidation(unittest.TestCase): """Test syntax validation for different languages""" @@ -315,7 +504,11 @@ class TestCodeBlockMerging(unittest.TestCase): { "page_number": 1, "code_samples": [ - {"code": "def hello():", "language": "python", "detection_method": "pattern"} + { + "code": "def hello():", + "language": "python", + "detection_method": "pattern", + } ], "code_blocks_count": 1, }, @@ -346,7 +539,11 @@ class TestCodeBlockMerging(unittest.TestCase): { "page_number": 1, "code_samples": [ - {"code": "def foo():", "language": "python", "detection_method": "pattern"} + { + "code": "def foo():", + "language": "python", + "detection_method": "pattern", + } ], "code_blocks_count": 1, }, From 3204c73c012c6ce35e496b62706a62766b8cdec2 Mon Sep 17 00:00:00 2001 From: pawu Date: Mon, 2 Feb 2026 01:08:59 +0530 Subject: [PATCH 2/2] fix: Resolves CI test failures and linting errors --- src/skill_seekers/cli/language_detector.py | 4 +- tests/test_pdf_extractor.py | 68 +++++++++++----------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/skill_seekers/cli/language_detector.py b/src/skill_seekers/cli/language_detector.py index 4510d85..5694d35 100644 --- a/src/skill_seekers/cli/language_detector.py +++ b/src/skill_seekers/cli/language_detector.py @@ -327,10 +327,10 @@ LANGUAGE_PATTERNS: dict[str, list[tuple[str, int]]] = { "scala": [ (r"\bcase\s+class\s+\w+", 5), (r"\btrait\s+\w+", 5), - (r"\bdef\s+\w+.*:\s*\w+", 5), + (r"\bdef\s+\w+[^:]*:\s*\w+\s*=", 5), (r"\bimport\s+scala\.", 4), (r"\bmatch\s*\{", 4), - (r"\bval\s+\w+.*:\s*\w+", 4), + (r"\bval\s+\w+\s*:\s*\w+\s*=", 4), (r"\bobject\s+\w+", 5), (r"=>", 3), (r"\bdef\s+\w+\[\w+\]", 4), diff --git a/tests/test_pdf_extractor.py b/tests/test_pdf_extractor.py index 4cd23a5..3b240e9 100644 --- a/tests/test_pdf_extractor.py +++ b/tests/test_pdf_extractor.py @@ -128,15 +128,15 @@ class TestLanguageDetection(unittest.TestCase): code = """ $primary-color: #3498db; - + @mixin border-radius($radius) { border-radius: $radius; } - + .button { color: $primary-color; @include border-radius(5px); - + &:hover { background: darken($primary-color, 10%); } @@ -146,17 +146,17 @@ class TestLanguageDetection(unittest.TestCase): language, confidence = extractor.detect_language_from_code(code) self.assertEqual(language, "scss") self.assertGreater(confidence, 0.8) - + def test_detect_dart_with_confidence(self): """Test Dart detection""" extractor = self.PDFExtractor.__new__(self.PDFExtractor) from skill_seekers.cli.language_detector import LanguageDetector - + extractor.language_detector = LanguageDetector(min_confidence=0.15) - + code = """ import 'package:flutter/material.dart'; - + class MyApp extends StatelessWidget { @override Widget build(BuildContext context) { @@ -166,7 +166,7 @@ class TestLanguageDetection(unittest.TestCase): } } """ - + language, confidence = extractor.detect_language_from_code(code) self.assertEqual(language, "dart") self.assertGreater(confidence, 0.6) @@ -175,12 +175,12 @@ class TestLanguageDetection(unittest.TestCase): """Test Scala detection""" extractor = self.PDFExtractor.__new__(self.PDFExtractor) from skill_seekers.cli.language_detector import LanguageDetector - + extractor.language_detector = LanguageDetector(min_confidence=0.15) - + code = """ case class Person(name: String, age: Int) - + object Main extends App { val person = Person("Alice", 30) person match { @@ -189,7 +189,7 @@ class TestLanguageDetection(unittest.TestCase): } } """ - + language, confidence = extractor.detect_language_from_code(code) self.assertEqual(language, "scala") self.assertGreater(confidence, 0.7) @@ -198,23 +198,23 @@ class TestLanguageDetection(unittest.TestCase): """Test SASS detection""" extractor = self.PDFExtractor.__new__(self.PDFExtractor) from skill_seekers.cli.language_detector import LanguageDetector - + extractor.language_detector = LanguageDetector(min_confidence=0.15) - + code = """ $primary-color: #3498db - + =border-radius($radius) border-radius: $radius - + .button color: $primary-color +border-radius(5px) - + &:hover background: darken($primary-color, 10%) """ - + language, confidence = extractor.detect_language_from_code(code) self.assertEqual(language, "sass") self.assertGreater(confidence, 0.8) @@ -223,19 +223,19 @@ class TestLanguageDetection(unittest.TestCase): """Test Elixir detection""" extractor = self.PDFExtractor.__new__(self.PDFExtractor) from skill_seekers.cli.language_detector import LanguageDetector - + extractor.language_detector = LanguageDetector(min_confidence=0.15) - + code = """ defmodule MyApp.User do def greet(name) do "Hello, #{name}" end - + defp calculate_age(birth_year) do 2024 - birth_year end - + def process(data) do data |> String.trim() @@ -244,7 +244,7 @@ class TestLanguageDetection(unittest.TestCase): end end """ - + language, confidence = extractor.detect_language_from_code(code) self.assertEqual(language, "elixir") self.assertGreater(confidence, 0.8) @@ -253,9 +253,9 @@ class TestLanguageDetection(unittest.TestCase): """Test Lua detection""" extractor = self.PDFExtractor.__new__(self.PDFExtractor) from skill_seekers.cli.language_detector import LanguageDetector - + extractor.language_detector = LanguageDetector(min_confidence=0.15) - + code = """ local function calculate_sum(numbers) local total = 0 @@ -264,12 +264,12 @@ class TestLanguageDetection(unittest.TestCase): end return total end - + local items = {1, 2, 3, 4, 5} local result = calculate_sum(items) print("Sum: " .. result) """ - + language, confidence = extractor.detect_language_from_code(code) self.assertEqual(language, "lua") self.assertGreater(confidence, 0.7) @@ -278,32 +278,32 @@ class TestLanguageDetection(unittest.TestCase): """Test Perl detection""" extractor = self.PDFExtractor.__new__(self.PDFExtractor) from skill_seekers.cli.language_detector import LanguageDetector - + extractor.language_detector = LanguageDetector(min_confidence=0.15) - - code = """ + + code = r""" #!/usr/bin/perl use strict; use warnings; - + sub process_line { my $line = shift; chomp($line); - + if ($line =~ /^(\w+)=(\w+)$/) { my ($name, $value) = ($1, $2); return "$name has value $value"; } return undef; } - + my @lines = ("foo=10", "bar=20"); foreach my $line (@lines) { my $result = process_line($line); print $result if defined $result; } """ - + language, confidence = extractor.detect_language_from_code(code) self.assertEqual(language, "perl") self.assertGreater(confidence, 0.8)