perf: optimize with caching, pre-compiled regex, O(1) lookups, and bisect line indexing (#309)
## Summary
Performance optimizations across core scraping and analysis modules:
- **doc_scraper.py**: Pre-compiled regex at module level, O(1) URL dedup via _enqueued_urls set, cached URL patterns, _enqueue_url() helper (DRY), seen_links set for link extraction, pre-lowercased category keywords, async error logging (bug fix), summary I/O error handling
- **code_analyzer.py**: O(log n) bisect-based line lookups replacing O(n) count("\n") across all 10 language analyzers; O(n) parent class map replacing O(n^2) AST walks for Python method detection
- **dependency_analyzer.py**: Same bisect line-index optimization for all import extractors
- **codebase_scraper.py**: Module-level import re, pre-imported parser classes outside loop
- **github_scraper.py**: deque.popleft() for O(1) tree traversal, module-level import fnmatch
- **utils.py**: Shared build_line_index() / offset_to_line() utilities (DRY)
- **test_adaptor_benchmarks.py**: Stabilized flaky test_benchmark_metadata_overhead (median, warm-up, more iterations)
Review fixes applied on top of original PR:
1. Renamed misleading _pending_set to _enqueued_urls
2. Extracted duplicated line-index code into shared cli/utils.py
3. Fixed pre-existing "tutorial" vs "tutorials" key mismatch bug in infer_categories()
4. Removed unnecessary _store_results() closure
5. Simplified parser pre-import pattern
This commit is contained in:
@@ -40,6 +40,7 @@ Credits:
|
||||
"""
|
||||
|
||||
import ast
|
||||
import bisect
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
@@ -95,6 +96,16 @@ class DependencyAnalyzer:
|
||||
self.graph = nx.DiGraph() # Directed graph for dependencies
|
||||
self.file_dependencies: dict[str, list[DependencyInfo]] = {}
|
||||
self.file_nodes: dict[str, FileNode] = {}
|
||||
self._newline_offsets: list[int] = []
|
||||
|
||||
@staticmethod
|
||||
def _build_line_index(content: str) -> list[int]:
|
||||
"""Build a sorted list of newline positions for O(log n) line lookups."""
|
||||
return [i for i, ch in enumerate(content) if ch == "\n"]
|
||||
|
||||
def _offset_to_line(self, offset: int) -> int:
|
||||
"""Convert a character offset to a 1-based line number using bisect."""
|
||||
return bisect.bisect_left(self._newline_offsets, offset) + 1
|
||||
|
||||
def analyze_file(self, file_path: str, content: str, language: str) -> list[DependencyInfo]:
|
||||
"""
|
||||
@@ -109,6 +120,9 @@ class DependencyAnalyzer:
|
||||
Returns:
|
||||
List of DependencyInfo objects
|
||||
"""
|
||||
# Build line index once for O(log n) lookups in all extractors
|
||||
self._newline_offsets = self._build_line_index(content)
|
||||
|
||||
if language == "Python":
|
||||
deps = self._extract_python_imports(content, file_path)
|
||||
elif language == "GDScript":
|
||||
@@ -216,7 +230,7 @@ class DependencyAnalyzer:
|
||||
preload_pattern = r'(?:const|var)\s+\w+\s*=\s*preload\("(.+?)"\)'
|
||||
for match in re.finditer(preload_pattern, content):
|
||||
resource_path = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
# Convert res:// paths to relative
|
||||
if resource_path.startswith("res://"):
|
||||
@@ -236,7 +250,7 @@ class DependencyAnalyzer:
|
||||
load_pattern = r'(?:const|var)\s+\w+\s*=\s*load\("(.+?)"\)'
|
||||
for match in re.finditer(load_pattern, content):
|
||||
resource_path = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
if resource_path.startswith("res://"):
|
||||
resource_path = resource_path[6:]
|
||||
@@ -255,7 +269,7 @@ class DependencyAnalyzer:
|
||||
extends_path_pattern = r'extends\s+"(.+?)"'
|
||||
for match in re.finditer(extends_path_pattern, content):
|
||||
resource_path = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
if resource_path.startswith("res://"):
|
||||
resource_path = resource_path[6:]
|
||||
@@ -275,7 +289,7 @@ class DependencyAnalyzer:
|
||||
extends_class_pattern = r"extends\s+([A-Z]\w+)"
|
||||
for match in re.finditer(extends_class_pattern, content):
|
||||
class_name = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
# Skip built-in Godot classes (Node, Resource, etc.)
|
||||
if class_name not in (
|
||||
@@ -334,7 +348,7 @@ class DependencyAnalyzer:
|
||||
import_pattern = r"import\s+(?:[\w\s{},*]+\s+from\s+)?['\"]([^'\"]+)['\"]"
|
||||
for match in re.finditer(import_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
is_relative = module.startswith(".") or module.startswith("/")
|
||||
|
||||
deps.append(
|
||||
@@ -351,7 +365,7 @@ class DependencyAnalyzer:
|
||||
require_pattern = r"require\s*\(['\"]([^'\"]+)['\"]\)"
|
||||
for match in re.finditer(require_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
is_relative = module.startswith(".") or module.startswith("/")
|
||||
|
||||
deps.append(
|
||||
@@ -380,7 +394,7 @@ class DependencyAnalyzer:
|
||||
include_pattern = r'#include\s+[<"]([^>"]+)[>"]'
|
||||
for match in re.finditer(include_pattern, content):
|
||||
header = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
# Headers with "" are usually local, <> are system headers
|
||||
is_relative = '"' in match.group(0)
|
||||
@@ -417,7 +431,7 @@ class DependencyAnalyzer:
|
||||
for match in re.finditer(using_pattern, content):
|
||||
alias = match.group(1) # Optional alias
|
||||
namespace = match.group(2)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
# Skip 'using' statements for IDisposable (using var x = ...)
|
||||
if "=" in match.group(0) and not alias:
|
||||
@@ -454,7 +468,7 @@ class DependencyAnalyzer:
|
||||
for match in re.finditer(single_import_pattern, content):
|
||||
match.group(1) # Optional alias
|
||||
package = match.group(2)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
# Check if relative (starts with ./ or ../)
|
||||
is_relative = package.startswith("./")
|
||||
@@ -516,7 +530,7 @@ class DependencyAnalyzer:
|
||||
use_pattern = r"use\s+([\w:{}]+(?:\s*,\s*[\w:{}]+)*|[\w:]+::\{[^}]+\})\s*;"
|
||||
for match in re.finditer(use_pattern, content):
|
||||
module_path = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
# Determine if relative
|
||||
is_relative = module_path.startswith(("self::", "super::"))
|
||||
@@ -571,7 +585,7 @@ class DependencyAnalyzer:
|
||||
import_pattern = r"import\s+(?:static\s+)?([A-Za-z_][\w.]*(?:\.\*)?)\s*;"
|
||||
for match in re.finditer(import_pattern, content):
|
||||
import_path = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
@@ -603,7 +617,7 @@ class DependencyAnalyzer:
|
||||
require_pattern = r"require\s+['\"]([^'\"]+)['\"]"
|
||||
for match in re.finditer(require_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
@@ -619,7 +633,7 @@ class DependencyAnalyzer:
|
||||
require_relative_pattern = r"require_relative\s+['\"]([^'\"]+)['\"]"
|
||||
for match in re.finditer(require_relative_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
@@ -635,7 +649,7 @@ class DependencyAnalyzer:
|
||||
load_pattern = r"load\s+['\"]([^'\"]+)['\"]"
|
||||
for match in re.finditer(load_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
@@ -669,7 +683,7 @@ class DependencyAnalyzer:
|
||||
require_pattern = r"(?:require|include)(?:_once)?\s+['\"]([^'\"]+)['\"]"
|
||||
for match in re.finditer(require_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
# Determine import type
|
||||
import_type = "require" if "require" in match.group(0) else "include"
|
||||
@@ -691,7 +705,7 @@ class DependencyAnalyzer:
|
||||
use_pattern = r"use\s+([A-Za-z_][\w\\]*)\s*(?:as\s+\w+)?\s*;"
|
||||
for match in re.finditer(use_pattern, content):
|
||||
namespace = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
@@ -908,7 +922,7 @@ class DependencyAnalyzer:
|
||||
source_file=file_path,
|
||||
imported_module=resource_path,
|
||||
import_type="ext_resource",
|
||||
line_number=content[: match.start()].count("\n") + 1,
|
||||
line_number=self._offset_to_line(match.start()),
|
||||
)
|
||||
)
|
||||
|
||||
@@ -924,7 +938,7 @@ class DependencyAnalyzer:
|
||||
source_file=file_path,
|
||||
imported_module=resource_path,
|
||||
import_type="preload",
|
||||
line_number=content[: match.start()].count("\n") + 1,
|
||||
line_number=self._offset_to_line(match.start()),
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user