perf: optimize with caching, pre-compiled regex, O(1) lookups, and bisect line indexing (#309)

## Summary Performance optimizations across core scraping and analysis modules: - **doc_scraper.py**: Pre-compiled regex at module level, O(1) URL dedup via _enqueued_urls set, cached URL patterns, _enqueue_url() helper (DRY), seen_links set for link extraction, pre-lowercased category keywords, async error logging (bug fix), summary I/O error handling - **code_analyzer.py**: O(log n) bisect-based line lookups replacing O(n) count("\n") across all 10 language analyzers; O(n) parent class map replacing O(n^2) AST walks for Python method detection - **dependency_analyzer.py**: Same bisect line-index optimization for all import extractors - **codebase_scraper.py**: Module-level import re, pre-imported parser classes outside loop - **github_scraper.py**: deque.popleft() for O(1) tree traversal, module-level import fnmatch - **utils.py**: Shared build_line_index() / offset_to_line() utilities (DRY) - **test_adaptor_benchmarks.py**: Stabilized flaky test_benchmark_metadata_overhead (median, warm-up, more iterations) Review fixes applied on top of original PR: 1. Renamed misleading _pending_set to _enqueued_urls 2. Extracted duplicated line-index code into shared cli/utils.py 3. Fixed pre-existing "tutorial" vs "tutorials" key mismatch bug in infer_categories() 4. Removed unnecessary _store_results() closure 5. Simplified parser pre-import pattern
2026-03-14 13:35:39 -07:00
parent 0ca271cdcb
commit 89f5e6fe5f
5 changed files with 191 additions and 140 deletions
--- a/src/skill_seekers/cli/dependency_analyzer.py
+++ b/src/skill_seekers/cli/dependency_analyzer.py
@@ -40,6 +40,7 @@ Credits:
 """

 import ast
+import bisect
 import logging
 import re
 from dataclasses import dataclass, field
@@ -95,6 +96,16 @@ class DependencyAnalyzer:
        self.graph = nx.DiGraph()  # Directed graph for dependencies
        self.file_dependencies: dict[str, list[DependencyInfo]] = {}
        self.file_nodes: dict[str, FileNode] = {}
+        self._newline_offsets: list[int] = []
+
+    @staticmethod
+    def _build_line_index(content: str) -> list[int]:
+        """Build a sorted list of newline positions for O(log n) line lookups."""
+        return [i for i, ch in enumerate(content) if ch == "\n"]
+
+    def _offset_to_line(self, offset: int) -> int:
+        """Convert a character offset to a 1-based line number using bisect."""
+        return bisect.bisect_left(self._newline_offsets, offset) + 1

    def analyze_file(self, file_path: str, content: str, language: str) -> list[DependencyInfo]:
        """
@@ -109,6 +120,9 @@ class DependencyAnalyzer:
        Returns:
            List of DependencyInfo objects
        """
+        # Build line index once for O(log n) lookups in all extractors
+        self._newline_offsets = self._build_line_index(content)
+
        if language == "Python":
            deps = self._extract_python_imports(content, file_path)
        elif language == "GDScript":
@@ -216,7 +230,7 @@ class DependencyAnalyzer:
        preload_pattern = r'(?:const|var)\s+\w+\s*=\s*preload\("(.+?)"\)'
        for match in re.finditer(preload_pattern, content):
            resource_path = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            # Convert res:// paths to relative
            if resource_path.startswith("res://"):
@@ -236,7 +250,7 @@ class DependencyAnalyzer:
        load_pattern = r'(?:const|var)\s+\w+\s*=\s*load\("(.+?)"\)'
        for match in re.finditer(load_pattern, content):
            resource_path = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            if resource_path.startswith("res://"):
                resource_path = resource_path[6:]
@@ -255,7 +269,7 @@ class DependencyAnalyzer:
        extends_path_pattern = r'extends\s+"(.+?)"'
        for match in re.finditer(extends_path_pattern, content):
            resource_path = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            if resource_path.startswith("res://"):
                resource_path = resource_path[6:]
@@ -275,7 +289,7 @@ class DependencyAnalyzer:
        extends_class_pattern = r"extends\s+([A-Z]\w+)"
        for match in re.finditer(extends_class_pattern, content):
            class_name = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            # Skip built-in Godot classes (Node, Resource, etc.)
            if class_name not in (
@@ -334,7 +348,7 @@ class DependencyAnalyzer:
        import_pattern = r"import\s+(?:[\w\s{},*]+\s+from\s+)?['\"]([^'\"]+)['\"]"
        for match in re.finditer(import_pattern, content):
            module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
            is_relative = module.startswith(".") or module.startswith("/")

            deps.append(
@@ -351,7 +365,7 @@ class DependencyAnalyzer:
        require_pattern = r"require\s*\(['\"]([^'\"]+)['\"]\)"
        for match in re.finditer(require_pattern, content):
            module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
            is_relative = module.startswith(".") or module.startswith("/")

            deps.append(
@@ -380,7 +394,7 @@ class DependencyAnalyzer:
        include_pattern = r'#include\s+[<"]([^>"]+)[>"]'
        for match in re.finditer(include_pattern, content):
            header = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            # Headers with "" are usually local, <> are system headers
            is_relative = '"' in match.group(0)
@@ -417,7 +431,7 @@ class DependencyAnalyzer:
        for match in re.finditer(using_pattern, content):
            alias = match.group(1)  # Optional alias
            namespace = match.group(2)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            # Skip 'using' statements for IDisposable (using var x = ...)
            if "=" in match.group(0) and not alias:
@@ -454,7 +468,7 @@ class DependencyAnalyzer:
        for match in re.finditer(single_import_pattern, content):
            match.group(1)  # Optional alias
            package = match.group(2)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            # Check if relative (starts with ./ or ../)
            is_relative = package.startswith("./")
@@ -516,7 +530,7 @@ class DependencyAnalyzer:
        use_pattern = r"use\s+([\w:{}]+(?:\s*,\s*[\w:{}]+)*|[\w:]+::\{[^}]+\})\s*;"
        for match in re.finditer(use_pattern, content):
            module_path = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            # Determine if relative
            is_relative = module_path.startswith(("self::", "super::"))
@@ -571,7 +585,7 @@ class DependencyAnalyzer:
        import_pattern = r"import\s+(?:static\s+)?([A-Za-z_][\w.]*(?:\.\*)?)\s*;"
        for match in re.finditer(import_pattern, content):
            import_path = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            deps.append(
                DependencyInfo(
@@ -603,7 +617,7 @@ class DependencyAnalyzer:
        require_pattern = r"require\s+['\"]([^'\"]+)['\"]"
        for match in re.finditer(require_pattern, content):
            module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            deps.append(
                DependencyInfo(
@@ -619,7 +633,7 @@ class DependencyAnalyzer:
        require_relative_pattern = r"require_relative\s+['\"]([^'\"]+)['\"]"
        for match in re.finditer(require_relative_pattern, content):
            module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            deps.append(
                DependencyInfo(
@@ -635,7 +649,7 @@ class DependencyAnalyzer:
        load_pattern = r"load\s+['\"]([^'\"]+)['\"]"
        for match in re.finditer(load_pattern, content):
            module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            deps.append(
                DependencyInfo(
@@ -669,7 +683,7 @@ class DependencyAnalyzer:
        require_pattern = r"(?:require|include)(?:_once)?\s+['\"]([^'\"]+)['\"]"
        for match in re.finditer(require_pattern, content):
            module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            # Determine import type
            import_type = "require" if "require" in match.group(0) else "include"
@@ -691,7 +705,7 @@ class DependencyAnalyzer:
        use_pattern = r"use\s+([A-Za-z_][\w\\]*)\s*(?:as\s+\w+)?\s*;"
        for match in re.finditer(use_pattern, content):
            namespace = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            deps.append(
                DependencyInfo(
@@ -908,7 +922,7 @@ class DependencyAnalyzer:
                    source_file=file_path,
                    imported_module=resource_path,
                    import_type="ext_resource",
-                    line_number=content[: match.start()].count("\n") + 1,
+                    line_number=self._offset_to_line(match.start()),
                )
            )

@@ -924,7 +938,7 @@ class DependencyAnalyzer:
                    source_file=file_path,
                    imported_module=resource_path,
                    import_type="preload",
-                    line_number=content[: match.start()].count("\n") + 1,
+                    line_number=self._offset_to_line(match.start()),
                )
            )