feat: add Kotlin language support for codebase analysis (#287)

Adds full C3.x pipeline support for Kotlin (.kt, .kts):
- Language detection patterns (37 weighted patterns for data/sealed classes, coroutines, companion objects, KMP, etc.)
- Regex-based parser in code_analyzer.py (classes, objects, functions, extension functions, suspend functions)
- Dependency extraction for Kotlin import statements (with alias support)
- Design pattern adaptations (object→Singleton, companion→Factory, sealed→Strategy, data→Builder, Flow→Observer)
- Test example extraction for JUnit 4/5, Kotest, MockK, Spek
- Config detection for build.gradle.kts / settings.gradle.kts
- Extension maps registered in codebase_scraper, unified_codebase_analyzer, github_scraper, generate_router

Also fixes pre-existing parser count tests (35→36 for doctor command added in previous commit).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-03-28 23:25:12 +03:00
parent ea4fed0be4
commit 6fded977dd
16 changed files with 1994 additions and 901 deletions

View File

@@ -133,6 +133,8 @@ class CodeAnalyzer:
return self._analyze_rust(content, file_path)
elif language == "Java":
return self._analyze_java(content, file_path)
elif language == "Kotlin":
return self._analyze_kotlin(content, file_path)
elif language == "Ruby":
return self._analyze_ruby(content, file_path)
elif language == "PHP":
@@ -1242,6 +1244,259 @@ class CodeAnalyzer:
return comments
def _analyze_kotlin(self, content: str, _file_path: str) -> dict[str, Any]:
    """
    Analyze a Kotlin file using regex patterns.

    Handles Kotlin-specific constructs:
    - Classes (regular, data, sealed, abstract, open, inner, enum, annotation)
    - Named object declarations (Kotlin singletons, including named companions)
    - Top-level functions (regular, suspend, inline, extension, infix, operator)
    - Imports (reduced to their first two package segments)

    Regex patterns based on Kotlin language specification:
    https://kotlinlang.org/spec/

    Args:
        content: Full text of the Kotlin source file.
        _file_path: Path of the file; not used by this analyzer.

    Returns:
        A dict with the keys ``classes``, ``functions``, ``comments`` and
        ``imports``. Class entries carry ``name``, ``base_classes``,
        ``methods``, ``docstring`` and ``line_number``; function entries carry
        ``name``, ``parameters``, ``return_type``, ``docstring``,
        ``line_number``, ``is_async``, ``is_method`` and ``decorators``.

    Limitations:
        Braces inside string literals or comments are counted when locating
        the end of a class body, and interfaces are not treated as classes.
    """
    # Line index for this file; presumably consumed by self._offset_to_line.
    self._newline_offsets = build_line_index(content)
    classes = []
    functions = []
    # (body_start, body_end) offsets of every class/object body found, used
    # below to keep member functions out of the top-level function list.
    body_spans = []
    brace_re = re.compile(r"[{}]")

    def find_block_end(body_start: int) -> int:
        """Return the offset of the brace closing the block opened just
        before ``body_start``, or ``body_start`` when it is never closed."""
        depth = 1
        for brace in brace_re.finditer(content, body_start):
            depth += 1 if brace.group() == "{" else -1
            if depth == 0:
                return brace.start()
        return body_start

    def parse_supertypes(supertypes_str) -> list:
        """Split a supertype list on top-level commas and drop constructor
        calls, so ``Base(a, b), Map<K, V>`` yields ``Base`` and ``Map<K, V>``."""
        if not supertypes_str:
            return []
        pieces = []
        current = []
        depth = 0
        for char in supertypes_str:
            if char in "<(":
                depth += 1
            elif char in ">)":
                depth = max(depth - 1, 0)
            if char == "," and depth == 0:
                pieces.append("".join(current))
                current = []
            else:
                current.append(char)
        pieces.append("".join(current))
        names = []
        for piece in pieces:
            # Remove constructor args: SuperClass(args) -> SuperClass
            name = re.sub(r"\(.*\)", "", piece, flags=re.DOTALL).strip()
            if name:
                names.append(name)
        return names

    def collect_declarations(pattern: str) -> None:
        """Append one class entry per match of ``pattern`` (group 1 = name,
        group 2 = optional supertype list, match ends just after ``{``)."""
        for match in re.finditer(pattern, content):
            body_start = match.end()
            body_end = find_block_end(body_start)
            if body_end > body_start:
                methods = self._extract_kotlin_methods(content[body_start:body_end])
            else:
                methods = []
            body_spans.append((body_start, body_end))
            classes.append(
                {
                    "name": match.group(1),
                    "base_classes": parse_supertypes(match.group(2)),
                    "methods": methods,
                    "docstring": None,
                    "line_number": self._offset_to_line(match.start()),
                }
            )

    # Extract class definitions (data class, sealed class, abstract class, open class, enum class, annotation class, inner class, regular class)
    class_pattern = (
        r"(?:(?:public|private|protected|internal)\s+)?"
        r"(?:(?:data|sealed|abstract|open|inner|enum|annotation)\s+)*"
        r"class\s+(\w+)"
        r"(?:\s*<[^>]+>)?"  # Generic type parameters
        r"(?:\s*(?:private|protected|internal)?\s*(?:constructor\s*)?\([^)]*\))?"  # Primary constructor (with optional visibility)
        r"(?:\s*:\s*([\w\s,.<>()]+?))?"  # Superclass/interfaces
        r"\s*\{"
    )
    collect_declarations(class_pattern)

    # Extract object declarations (Kotlin singletons)
    object_pattern = r"(?:(?:public|private|protected|internal)\s+)?object\s+(\w+)(?:\s*:\s*([\w\s,.<>()]+?))?\s*\{"
    collect_declarations(object_pattern)

    # Extract top-level functions
    # Matches: [modifiers] fun [Type.]name([params]): ReturnType
    func_pattern = (
        r"(?P<modifiers>"
        r"(?:(?:public|private|protected|internal)\s+)?"
        r"(?:(?:suspend|inline|infix|operator|tailrec|external)\s+)*"
        r")"
        r"fun\s+"
        r"(?:<[^>]+>\s+)?"  # Generic type parameters (e.g., <reified T>)
        r"(?:[\w<>?*,\s]+\.)?"  # Extension receiver type (e.g., List<T>.)
        r"(?P<name>\w+)\s*"
        r"\((?P<params>[^)]*)\)"
        # The return type may not span lines; otherwise a body-less
        # declaration would swallow the following declaration.
        r"(?:\s*:\s*(?P<return_type>[\w<>.,?* \t]+))?"
    )
    for match in re.finditer(func_pattern, content):
        return_type = match.group("return_type")
        if return_type:
            return_type = return_type.strip()

        position = match.start()
        # Skip members: anything inside a class/object body found above ...
        if any(start <= position < end for start, end in body_spans):
            continue
        # ... and, as a fallback for bodies the class patterns do not
        # recognize, anything indented deeper than one level.
        line_start = content.rfind("\n", 0, position) + 1
        if position - line_start > 4:
            continue

        # The match begins at the modifiers, so `suspend` has to be looked
        # up inside the match rather than in the text preceding it.
        is_suspend = "suspend" in match.group("modifiers").split()
        params = self._parse_kotlin_parameters(match.group("params"))
        functions.append(
            {
                "name": match.group("name"),
                "parameters": params,
                "return_type": return_type,
                "docstring": None,
                "line_number": self._offset_to_line(position),
                "is_async": is_suspend,
                "is_method": False,
                "decorators": [],
            }
        )

    # Extract comments (// and /* */ and /** KDoc */)
    comments = self._extract_java_comments(content)  # Same syntax as Java

    # Extract imports. Only import directives at the start of a line (or
    # after `;`) are considered, so the word "import" inside comments or
    # strings is ignored.
    import_pattern = re.compile(
        r"(?:^|;)[ \t]*import[ \t]+([A-Za-z_]\w*(?:\.[A-Za-z_]\w*)*)",
        re.MULTILINE,
    )
    imports = set()
    for match in import_pattern.finditer(content):
        parts = match.group(1).split(".")
        if len(parts) >= 2:
            imports.add(".".join(parts[:2]))

    return {
        "classes": classes,
        "functions": functions,
        "comments": comments,
        # Sorted so the result is deterministic across runs.
        "imports": sorted(imports),
    }
def _extract_kotlin_methods(self, class_body: str) -> list[dict]:
    """Extract Kotlin method signatures from a class or object body.

    Args:
        class_body: Text between the opening and closing brace of a class
            or object declaration.

    Returns:
        One dict per ``fun`` declaration found, with the keys ``name``,
        ``parameters``, ``return_type``, ``docstring``, ``line_number``,
        ``is_async``, ``is_method`` and ``decorators``. ``is_async`` is True
        for ``suspend`` functions. ``line_number`` is always None because
        the body's position within the file is not known here.
    """
    methods = []
    method_pattern = (
        # Modifiers may appear in any order (e.g. `override suspend` or
        # `suspend override`), so they are matched as one repeated group.
        r"(?P<modifiers>(?:(?:public|private|protected|internal|override"
        r"|suspend|inline|infix|operator|open|abstract|final|tailrec|external)\s+)*)"
        r"fun\s+"
        r"(?:<[^>]+>\s*)?"
        r"(?:\w+\.)?"  # Extension receiver
        r"(?P<name>\w+)\s*"
        r"\((?P<params>[^)]*)\)"
        # The return type may not span lines; otherwise a body-less
        # declaration (abstract/interface style) would swallow the `fun`
        # on the next line and that method would be lost.
        r"(?:\s*:\s*(?P<return_type>[\w<>.,?* \t]+))?"
    )
    for match in re.finditer(method_pattern, class_body):
        return_type = match.group("return_type")
        if return_type:
            return_type = return_type.strip()
        params = self._parse_kotlin_parameters(match.group("params"))
        methods.append(
            {
                "name": match.group("name"),
                "parameters": params,
                "return_type": return_type,
                "docstring": None,
                "line_number": None,
                "is_async": "suspend" in match.group("modifiers").split(),
                "is_method": True,
                "decorators": [],
            }
        )
    return methods
def _parse_kotlin_parameters(self, params_str: str) -> list[dict]:
"""Parse Kotlin parameter string (name: Type = default)."""
params = []
if not params_str.strip():
return params
param_list = [p.strip() for p in params_str.split(",")]
for param in param_list:
if not param:
continue
default = None
if "=" in param:
param, default = param.split("=", 1)
param = param.strip()
default = default.strip()
# Kotlin format: [vararg] name: Type
param = re.sub(r"^\s*(?:vararg|noinline|crossinline)\s+", "", param)
if ":" in param:
name_part, type_part = param.split(":", 1)
param_name = name_part.strip()
param_type = type_part.strip()
else:
param_name = param.strip()
param_type = None
params.append(
{
"name": param_name,
"type_hint": param_type,
"default": default,
}
)
return params
def _analyze_ruby(self, content: str, _file_path: str) -> dict[str, Any]:
"""
Analyze Ruby file using regex patterns.

View File

@@ -73,6 +73,8 @@ LANGUAGE_EXTENSIONS = {
".go": "Go",
".rs": "Rust",
".java": "Java",
".kt": "Kotlin",
".kts": "Kotlin",
".rb": "Ruby",
".php": "PHP",
}

View File

@@ -77,6 +77,7 @@ class ConfigFile:
"ini",
"python",
"javascript",
"kotlin-gradle",
"dockerfile",
"docker-compose",
]
@@ -215,6 +216,14 @@ class ConfigFileDetector:
"webpack.config.js",
],
},
"kotlin-gradle": {
"patterns": ["*.gradle.kts"],
"names": [
"build.gradle.kts",
"settings.gradle.kts",
"gradle.properties",
],
},
"dockerfile": {
"patterns": ["Dockerfile*"],
"names": ["Dockerfile", "Dockerfile.dev", "Dockerfile.prod"],
@@ -358,7 +367,13 @@ class ConfigFileDetector:
return "ci_cd_configuration"
# Package configs
if filename in ["package.json", "pyproject.toml", "cargo.toml"]:
if filename in [
"package.json",
"pyproject.toml",
"cargo.toml",
"build.gradle.kts",
"settings.gradle.kts",
]:
return "package_configuration"
# TypeScript/JavaScript configs

View File

@@ -139,6 +139,8 @@ class DependencyAnalyzer:
deps = self._extract_rust_imports(content, file_path)
elif language == "Java":
deps = self._extract_java_imports(content, file_path)
elif language == "Kotlin":
deps = self._extract_kotlin_imports(content, file_path)
elif language == "Ruby":
deps = self._extract_ruby_imports(content, file_path)
elif language == "PHP":
@@ -595,6 +597,38 @@ class DependencyAnalyzer:
return deps
def _extract_kotlin_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
    """
    Extract Kotlin import statements.

    Handles:
    - import kotlin.collections.List
    - import kotlinx.coroutines.*   (recorded as ``kotlinx.coroutines.*``)
    - import com.example.Foo as Bar (alias imports; the alias is ignored)

    Only import directives at the start of a line (or after ``;``) are
    recognized, so the word "import" inside comments or string literals
    does not produce a dependency.

    Regex patterns based on Kotlin language specification:
    https://kotlinlang.org/spec/packages-and-imports.html

    Args:
        content: Full text of the Kotlin source file.
        file_path: Path recorded as ``source_file`` on every dependency.

    Returns:
        One DependencyInfo per import directive, in file order.
    """
    # Match: import package.Class [as Alias]
    # The path is matched segment by segment so that a trailing `.*` is
    # captured as the wildcard instead of leaving a dangling dot.
    import_pattern = re.compile(
        r"(?:^|;)[ \t]*import[ \t]+"
        r"([A-Za-z_]\w*(?:\.[A-Za-z_]\w*)*(?:\.\*)?)",
        re.MULTILINE,
    )
    return [
        DependencyInfo(
            source_file=file_path,
            imported_module=match.group(1),
            import_type="import",
            is_relative=False,
            # Offset of the import path; it is on the same line as `import`.
            line_number=self._offset_to_line(match.start(1)),
        )
        for match in import_pattern.finditer(content)
    ]
def _extract_ruby_imports(self, content: str, file_path: str) -> list[DependencyInfo]:
"""
Extract Ruby require/require_relative/load statements.

View File

@@ -249,6 +249,7 @@ class RouterGenerator:
"Go": f"Go 1.20+, requires {self.router_name} package",
"Rust": f"Rust 1.70+, requires {self.router_name} package",
"Java": f"Java 17+, requires {self.router_name} package",
"Kotlin": f"Kotlin 1.9+, JDK 17+, requires {self.router_name} package",
}
if language in compatibility_map:
compatibility = compatibility_map[language]

View File

@@ -729,6 +729,8 @@ class GitHubScraper:
"Python": [".py"],
"JavaScript": [".js", ".jsx"],
"TypeScript": [".ts", ".tsx"],
"Kotlin": [".kt", ".kts"],
"Java": [".java"],
"C": [".c", ".h"],
"C++": [".cpp", ".hpp", ".cc", ".hh", ".cxx"],
}

View File

@@ -202,6 +202,49 @@ LANGUAGE_PATTERNS: dict[str, list[tuple[str, int]]] = {
(r"\bimport\s+java\.", 2),
(r"\bextends\s+\w+", 2),
],
"kotlin": [
# Kotlin keywords, declarations and imports (weight 3-5)
(r"\bfun\s+\w+\s*\(", 4), # Kotlin function declaration
(r"\bval\s+\w+\s*:", 3), # Immutable variable with type
(r"\bvar\s+\w+\s*:", 3), # Mutable variable with type
(r"\bdata\s+class\s+\w+", 5), # Data class — Kotlin-unique
(r"\bsealed\s+class\s+\w+", 5), # Sealed class — Kotlin-unique
(r"\bsealed\s+interface\s+\w+", 5), # Sealed interface — Kotlin-unique
(r"\bobject\s+\w+\s*:", 5), # Object declaration — Kotlin singleton
(r"\bobject\s+\w+\s*\{", 5), # Object declaration — Kotlin singleton
(r"\bcompanion\s+object\b", 5), # Companion object — Kotlin-unique
(r"\bsuspend\s+fun\b", 5), # Coroutine suspend function
(r"\bwhen\s*\(", 4), # when expression (like switch but richer)
(r"\bwhen\s*\{", 4), # when without argument
(r"\binline\s+fun\b", 5), # Inline function — Kotlin-specific
(r"\breified\b", 5), # Reified type parameter — Kotlin-unique
(r"\binit\s*\{", 4), # Init block
(r"\bimport\s+kotlin\.", 5), # Kotlin stdlib import
(r"\bimport\s+kotlinx?\.", 5), # Kotlin/KotlinX imports
(r"\bimport\s+android\.", 4), # Android imports (common in Kotlin)
(r"\bimport\s+androidx\.", 4), # AndroidX imports
# Kotlin idioms (weight 3-4)
(r"\bby\s+lazy\b", 4), # Lazy delegation — Kotlin idiom
(r"\blistOf\s*\(", 3), # Kotlin stdlib
(r"\bmapOf\s*\(", 3), # Kotlin stdlib
(r"\bsetOf\s*\(", 3), # Kotlin stdlib
(r"\blet\s*\{", 3), # Scope function
(r"\bapply\s*\{", 3), # Scope function
(r"\balso\s*\{", 3), # Scope function
(r"\brun\s*\{", 2), # Scope function (weak — common word)
(r"\?\.", 2), # Safe call operator
(r"\?:", 2), # Elvis operator
(r"!!", 2), # Non-null assertion
# Kotlin multiplatform
(r"\bexpect\s+(?:fun|class|val|var)\b", 5), # KMP expect declaration
(r"\bactual\s+(?:fun|class|val|var)\b", 5), # KMP actual declaration
# Coroutines
(r"\blaunch\s*\{", 4), # Coroutine launch
(r"\basync\s*\{", 3), # Coroutine async
(r"\bwithContext\s*\(", 4), # Coroutine context switch
(r"\bCoroutineScope\b", 4), # Coroutine scope
(r"\bFlow<", 4), # Kotlin Flow
],
"go": [
(r"\bfunc\s+\w+\s*\(", 3),
(r"\bpackage\s+\w+", 4),

View File

@@ -1580,6 +1580,43 @@ class LanguageAdapter:
elif pattern.pattern_type == "TemplateMethod" and "abstract" in evidence_str:
pattern.confidence = min(pattern.confidence + 0.1, 1.0)
# Kotlin adaptations
elif language == "Kotlin":
# Singleton: object declaration is the idiomatic Kotlin singleton
if pattern.pattern_type == "Singleton":
if "object" in evidence_str or "companion" in evidence_str:
pattern.confidence = min(pattern.confidence + 0.15, 1.0)
pattern.evidence.append("Kotlin object declaration (singleton)")
# Factory: companion object with create/of methods
elif pattern.pattern_type == "Factory":
if "companion" in evidence_str:
pattern.confidence = min(pattern.confidence + 0.1, 1.0)
pattern.evidence.append("Kotlin companion object factory")
# Strategy: sealed class/interface with when expression
elif pattern.pattern_type == "Strategy":
if "sealed" in evidence_str:
pattern.confidence = min(pattern.confidence + 0.15, 1.0)
pattern.evidence.append("Kotlin sealed class/interface strategy")
# Builder: data class copy() or DSL builder pattern
elif pattern.pattern_type == "Builder":
if "data" in evidence_str or "apply" in evidence_str:
pattern.confidence = min(pattern.confidence + 0.1, 1.0)
pattern.evidence.append("Kotlin data class / DSL builder")
# Observer: Flow/StateFlow is the coroutine-based observer
elif pattern.pattern_type == "Observer":
if "flow" in evidence_str or "stateflow" in evidence_str:
pattern.confidence = min(pattern.confidence + 0.1, 1.0)
pattern.evidence.append("Kotlin Flow/StateFlow observer")
# Decorator: extension functions serve as lightweight decorators
elif pattern.pattern_type == "Decorator" and "extension" in evidence_str:
pattern.confidence = min(pattern.confidence + 0.05, 1.0)
pattern.evidence.append("Kotlin extension function decorator")
# Go adaptations
elif language == "Go":
# Singleton: sync.Once is idiomatic

View File

@@ -678,6 +678,18 @@ class GenericTestAnalyzer:
"assertion": r"assert(?:Equals|True|False|NotNull)\(([^)]+)\)",
"test_function": r"@Test\s+public\s+void\s+(\w+)\(\)",
},
"kotlin": {
# Object instantiation: val x = Foo(args) or val x: Type = Foo(args)
"instantiation": r"(?:val|var)\s+(\w+)(?:\s*:\s*[\w<>.,\s?]+)?\s*=\s*(\w+)\(([^)]*)\)",
# JUnit assertions + Kotest matchers
"assertion": r"(?:assert(?:Equals|True|False|NotNull|That)\(([^)]+)\)|(\w+)\s+should(?:Be|Equal|Match|Have|Contain|Throw)\b)",
# JUnit @Test, Kotest test functions, Spek describe/it
"test_function": r"(?:@Test\s+fun\s+(\w+)\s*\(|fun\s+[\"']([^\"']+)[\"']\s*\(|(?:test|it|should)\s*\(\s*[\"']([^\"']+)[\"'])",
# MockK mocking patterns
"mock": r"(?:mockk<([\w<>]+)>\s*\(|every\s*\{\s*(\w+)\.(\w+)|verify\s*\{)",
# Coroutine test patterns
"coroutine_test": r"(?:runTest\s*\{|runBlocking\s*\{|testCoroutineDispatcher)",
},
"csharp": {
# Object instantiation patterns (var, explicit type, generic)
"instantiation": r"(?:var|[\w<>]+)\s+(\w+)\s*=\s*new\s+([\w<>]+)\(([^)]*)\)",
@@ -929,6 +941,9 @@ class TestExampleExtractor:
"*_test.go",
"*_test.rs",
"Test*.java",
"*Test.kt",
"Test*.kt",
"*Spec.kt", # Kotest/Spek naming convention
"Test*.cs",
"*Test.php",
"*_spec.rb",
@@ -944,6 +959,8 @@ class TestExampleExtractor:
".go": "Go",
".rs": "Rust",
".java": "Java",
".kt": "Kotlin",
".kts": "Kotlin",
".cs": "C#",
".php": "PHP",
".rb": "Ruby",

View File

@@ -559,6 +559,8 @@ class UnifiedCodebaseAnalyzer:
".go": "Go",
".rs": "Rust",
".java": "Java",
".kt": "Kotlin",
".kts": "Kotlin",
".rb": "Ruby",
".php": "PHP",
}