perf: optimize with caching, pre-compiled regex, O(1) lookups, and bisect line indexing (#309)

## Summary

Performance optimizations across core scraping and analysis modules:

- **doc_scraper.py**: Pre-compiled regex at module level, O(1) URL dedup via _enqueued_urls set, cached URL patterns, _enqueue_url() helper (DRY), seen_links set for link extraction, pre-lowercased category keywords, async error logging (bug fix), summary I/O error handling
- **code_analyzer.py**: O(log n) bisect-based line lookups replacing O(n) count("\n") across all 10 language analyzers; O(n) parent class map replacing O(n^2) AST walks for Python method detection
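- **dependency_analyzer.py**: Same bisect line-index optimization for all import extractors: analyze_file() builds the newline index once per file, and each extractor calls _offset_to_line(match.start()) instead of re-scanning with content[: match.start()].count("\n") + 1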
- **codebase_scraper.py**: Module-level import re, pre-imported parser classes outside loop
- **github_scraper.py**: deque.popleft() for O(1) tree traversal, module-level import fnmatch
- **utils.py**: Shared build_line_index() / offset_to_line() utilities (DRY)
- **test_adaptor_benchmarks.py**: Stabilized flaky test_benchmark_metadata_overhead (median, warm-up, more iterations)

Review fixes applied on top of original PR:
1. Renamed misleading _pending_set to _enqueued_urls
2. Extracted duplicated line-index code into shared cli/utils.py
3. Fixed pre-existing "tutorial" vs "tutorials" key mismatch bug in infer_categories()
4. Removed unnecessary _store_results() closure
5. Simplified parser pre-import pattern
This commit is contained in:
copperlang2007
2026-03-14 13:35:39 -07:00
committed by GitHub
parent 0ca271cdcb
commit 89f5e6fe5f
5 changed files with 191 additions and 140 deletions

View File

@@ -40,6 +40,7 @@ Credits:
"""
import ast
import bisect
import logging
import re
from dataclasses import dataclass, field
@@ -95,6 +96,16 @@ class DependencyAnalyzer:
self.graph = nx.DiGraph() # Directed graph for dependencies
self.file_dependencies: dict[str, list[DependencyInfo]] = {}
self.file_nodes: dict[str, FileNode] = {}
self._newline_offsets: list[int] = []
@staticmethod
def _build_line_index(content: str) -> list[int]:
"""Build a sorted list of newline positions for O(log n) line lookups."""
return [i for i, ch in enumerate(content) if ch == "\n"]
def _offset_to_line(self, offset: int) -> int:
"""Convert a character offset to a 1-based line number using bisect."""
return bisect.bisect_left(self._newline_offsets, offset) + 1
def analyze_file(self, file_path: str, content: str, language: str) -> list[DependencyInfo]:
"""
@@ -109,6 +120,9 @@ class DependencyAnalyzer:
Returns:
List of DependencyInfo objects
"""
# Build line index once for O(log n) lookups in all extractors
self._newline_offsets = self._build_line_index(content)
if language == "Python":
deps = self._extract_python_imports(content, file_path)
elif language == "GDScript":
@@ -216,7 +230,7 @@ class DependencyAnalyzer:
preload_pattern = r'(?:const|var)\s+\w+\s*=\s*preload\("(.+?)"\)'
for match in re.finditer(preload_pattern, content):
resource_path = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
# Convert res:// paths to relative
if resource_path.startswith("res://"):
@@ -236,7 +250,7 @@ class DependencyAnalyzer:
load_pattern = r'(?:const|var)\s+\w+\s*=\s*load\("(.+?)"\)'
for match in re.finditer(load_pattern, content):
resource_path = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
if resource_path.startswith("res://"):
resource_path = resource_path[6:]
@@ -255,7 +269,7 @@ class DependencyAnalyzer:
extends_path_pattern = r'extends\s+"(.+?)"'
for match in re.finditer(extends_path_pattern, content):
resource_path = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
if resource_path.startswith("res://"):
resource_path = resource_path[6:]
@@ -275,7 +289,7 @@ class DependencyAnalyzer:
extends_class_pattern = r"extends\s+([A-Z]\w+)"
for match in re.finditer(extends_class_pattern, content):
class_name = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
# Skip built-in Godot classes (Node, Resource, etc.)
if class_name not in (
@@ -334,7 +348,7 @@ class DependencyAnalyzer:
import_pattern = r"import\s+(?:[\w\s{},*]+\s+from\s+)?['\"]([^'\"]+)['\"]"
for match in re.finditer(import_pattern, content):
module = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
is_relative = module.startswith(".") or module.startswith("/")
deps.append(
@@ -351,7 +365,7 @@ class DependencyAnalyzer:
require_pattern = r"require\s*\(['\"]([^'\"]+)['\"]\)"
for match in re.finditer(require_pattern, content):
module = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
is_relative = module.startswith(".") or module.startswith("/")
deps.append(
@@ -380,7 +394,7 @@ class DependencyAnalyzer:
include_pattern = r'#include\s+[<"]([^>"]+)[>"]'
for match in re.finditer(include_pattern, content):
header = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
# Headers with "" are usually local, <> are system headers
is_relative = '"' in match.group(0)
@@ -417,7 +431,7 @@ class DependencyAnalyzer:
for match in re.finditer(using_pattern, content):
alias = match.group(1) # Optional alias
namespace = match.group(2)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
# Skip 'using' statements for IDisposable (using var x = ...)
if "=" in match.group(0) and not alias:
@@ -454,7 +468,7 @@ class DependencyAnalyzer:
for match in re.finditer(single_import_pattern, content):
match.group(1) # Optional alias
package = match.group(2)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
# Check if relative (starts with ./ or ../)
is_relative = package.startswith("./")
@@ -516,7 +530,7 @@ class DependencyAnalyzer:
use_pattern = r"use\s+([\w:{}]+(?:\s*,\s*[\w:{}]+)*|[\w:]+::\{[^}]+\})\s*;"
for match in re.finditer(use_pattern, content):
module_path = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
# Determine if relative
is_relative = module_path.startswith(("self::", "super::"))
@@ -571,7 +585,7 @@ class DependencyAnalyzer:
import_pattern = r"import\s+(?:static\s+)?([A-Za-z_][\w.]*(?:\.\*)?)\s*;"
for match in re.finditer(import_pattern, content):
import_path = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
deps.append(
DependencyInfo(
@@ -603,7 +617,7 @@ class DependencyAnalyzer:
require_pattern = r"require\s+['\"]([^'\"]+)['\"]"
for match in re.finditer(require_pattern, content):
module = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
deps.append(
DependencyInfo(
@@ -619,7 +633,7 @@ class DependencyAnalyzer:
require_relative_pattern = r"require_relative\s+['\"]([^'\"]+)['\"]"
for match in re.finditer(require_relative_pattern, content):
module = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
deps.append(
DependencyInfo(
@@ -635,7 +649,7 @@ class DependencyAnalyzer:
load_pattern = r"load\s+['\"]([^'\"]+)['\"]"
for match in re.finditer(load_pattern, content):
module = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
deps.append(
DependencyInfo(
@@ -669,7 +683,7 @@ class DependencyAnalyzer:
require_pattern = r"(?:require|include)(?:_once)?\s+['\"]([^'\"]+)['\"]"
for match in re.finditer(require_pattern, content):
module = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
# Determine import type
import_type = "require" if "require" in match.group(0) else "include"
@@ -691,7 +705,7 @@ class DependencyAnalyzer:
use_pattern = r"use\s+([A-Za-z_][\w\\]*)\s*(?:as\s+\w+)?\s*;"
for match in re.finditer(use_pattern, content):
namespace = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
deps.append(
DependencyInfo(
@@ -908,7 +922,7 @@ class DependencyAnalyzer:
source_file=file_path,
imported_module=resource_path,
import_type="ext_resource",
line_number=content[: match.start()].count("\n") + 1,
line_number=self._offset_to_line(match.start()),
)
)
@@ -924,7 +938,7 @@ class DependencyAnalyzer:
source_file=file_path,
imported_module=resource_path,
import_type="preload",
line_number=content[: match.start()].count("\n") + 1,
line_number=self._offset_to_line(match.start()),
)
)