perf: optimize with caching, pre-compiled regex, O(1) lookups, and bisect line indexing (#309)

## Summary

Performance optimizations across core scraping and analysis modules:

- **doc_scraper.py**: Pre-compiled regex at module level, O(1) URL dedup via _enqueued_urls set, cached URL patterns, _enqueue_url() helper (DRY), seen_links set for link extraction, pre-lowercased category keywords, async error logging (bug fix), summary I/O error handling
- **code_analyzer.py**: O(log n) bisect-based line lookups replacing O(n) count("\n") across all 10 language analyzers; O(n) parent class map replacing O(n^2) AST walks for Python method detection
- **dependency_analyzer.py**: Same bisect line-index optimization for all import extractors
- **codebase_scraper.py**: Module-level import re, pre-imported parser classes outside loop
- **github_scraper.py**: deque.popleft() for O(1) tree traversal, module-level import fnmatch
- **utils.py**: Shared build_line_index() / offset_to_line() utilities (DRY)
- **test_adaptor_benchmarks.py**: Stabilized flaky test_benchmark_metadata_overhead (median, warm-up, more iterations)

Review fixes applied on top of original PR:
1. Renamed misleading _pending_set to _enqueued_urls
2. Extracted duplicated line-index code into shared cli/utils.py
3. Fixed pre-existing "tutorial" vs "tutorials" key mismatch bug in infer_categories()
4. Removed unnecessary _store_results() closure
5. Simplified parser pre-import pattern
This commit is contained in:
copperlang2007
2026-03-14 13:35:39 -07:00
committed by GitHub
parent 0ca271cdcb
commit 89f5e6fe5f
5 changed files with 191 additions and 140 deletions

View File

@@ -23,6 +23,7 @@ consider using dedicated parsers (tree-sitter, language-specific AST libraries).
"""
import ast
import bisect
import contextlib
import logging
import re
@@ -84,6 +85,16 @@ class CodeAnalyzer:
depth: Analysis depth ('surface', 'deep', 'full')
"""
self.depth = depth
self._newline_offsets: list[int] = []
@staticmethod
def _build_line_index(content: str) -> list[int]:
"""Build a sorted list of newline positions for O(log n) line lookups."""
return [i for i, ch in enumerate(content) if ch == "\n"]
def _offset_to_line(self, offset: int) -> int:
"""Convert a character offset to a 1-based line number using bisect."""
return bisect.bisect_left(self._newline_offsets, offset) + 1
def analyze_file(self, file_path: str, content: str, language: str) -> dict[str, Any]:
"""
@@ -149,35 +160,26 @@ class CodeAnalyzer:
functions = []
imports = []
# Build parent map once (O(n)) instead of walking tree per node (O(n²))
class_children: set[int] = set()
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef) and isinstance(node.body, list):
for child in node.body:
class_children.add(id(child))
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef):
class_sig = self._extract_python_class(node)
classes.append(asdict(class_sig))
elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
# Only top-level functions (not methods)
# Fix AST parser to check isinstance(parent.body, list) before 'in' operator
is_method = False
try:
is_method = any(
isinstance(parent, ast.ClassDef)
for parent in ast.walk(tree)
if hasattr(parent, "body")
and isinstance(parent.body, list)
and node in parent.body
)
except (TypeError, AttributeError):
# If body is not iterable or check fails, assume it's a top-level function
is_method = False
if not is_method:
# Only top-level functions (not methods) - O(1) lookup via pre-built set
if id(node) not in class_children:
func_sig = self._extract_python_function(node)
functions.append(asdict(func_sig))
elif isinstance(node, ast.Import):
# Extract: import foo, bar
for alias in node.names:
imports.append(alias.name)
elif isinstance(node, ast.ImportFrom):
# Extract: from foo import bar
module = node.module or ""
imports.append(module)
@@ -188,7 +190,7 @@ class CodeAnalyzer:
"classes": classes,
"functions": functions,
"comments": comments,
"imports": imports, # Include imports for framework detection
"imports": imports,
}
def _extract_python_class(self, node: ast.ClassDef) -> ClassSignature:
@@ -285,6 +287,7 @@ class CodeAnalyzer:
Note: This is a simplified approach. For production, consider using
a proper JS/TS parser like esprima or ts-morph.
"""
self._newline_offsets = self._build_line_index(content)
classes = []
functions = []
@@ -310,7 +313,7 @@ class CodeAnalyzer:
"base_classes": [base_class] if base_class else [],
"methods": methods,
"docstring": None,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
}
)
@@ -329,7 +332,7 @@ class CodeAnalyzer:
"parameters": params,
"return_type": None, # JS doesn't have type annotations (unless TS)
"docstring": None,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
"is_async": is_async,
"is_method": False,
"decorators": [],
@@ -351,7 +354,7 @@ class CodeAnalyzer:
"parameters": params,
"return_type": None,
"docstring": None,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
"is_async": is_async,
"is_method": False,
"decorators": [],
@@ -460,6 +463,7 @@ class CodeAnalyzer:
Note: This is a simplified approach focusing on header files.
For production, consider using libclang or similar.
"""
self._newline_offsets = self._build_line_index(content)
classes = []
functions = []
@@ -475,7 +479,7 @@ class CodeAnalyzer:
"base_classes": [base_class] if base_class else [],
"methods": [], # Simplified - would need to parse class body
"docstring": None,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
}
)
@@ -498,7 +502,7 @@ class CodeAnalyzer:
"parameters": params,
"return_type": return_type,
"docstring": None,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
"is_async": False,
"is_method": False,
"decorators": [],
@@ -577,14 +581,14 @@ class CodeAnalyzer:
# Extract single-line comments (//)
for match in re.finditer(r"//(.+)$", content, re.MULTILINE):
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
comment_text = match.group(1).strip()
comments.append({"line": line_num, "text": comment_text, "type": "inline"})
# Extract multi-line comments (/* */)
for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL):
start_line = content[: match.start()].count("\n") + 1
start_line = self._offset_to_line(match.start())
comment_text = match.group(1).strip()
comments.append({"line": start_line, "text": comment_text, "type": "block"})
@@ -610,6 +614,7 @@ class CodeAnalyzer:
Regex patterns inspired by C# language specification:
https://learn.microsoft.com/en-us/dotnet/csharp/language-reference/
"""
self._newline_offsets = self._build_line_index(content)
classes = []
functions = []
@@ -651,7 +656,7 @@ class CodeAnalyzer:
"base_classes": base_classes,
"methods": methods,
"docstring": None, # Would need to extract XML doc comments
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
}
)
@@ -676,7 +681,7 @@ class CodeAnalyzer:
"parameters": params,
"return_type": return_type,
"docstring": None,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
"is_async": is_async,
"is_method": False,
"decorators": [],
@@ -791,7 +796,7 @@ class CodeAnalyzer:
# Single-line comments (//)
for match in re.finditer(r"//(.+)$", content, re.MULTILINE):
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
comment_text = match.group(1).strip()
# Distinguish XML doc comments (///)
@@ -803,7 +808,7 @@ class CodeAnalyzer:
# Multi-line comments (/* */)
for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL):
start_line = content[: match.start()].count("\n") + 1
start_line = self._offset_to_line(match.start())
comment_text = match.group(1).strip()
comments.append({"line": start_line, "text": comment_text, "type": "block"})
@@ -820,6 +825,7 @@ class CodeAnalyzer:
Regex patterns based on Go language specification:
https://go.dev/ref/spec
"""
self._newline_offsets = self._build_line_index(content)
classes = [] # Go doesn't have classes, but we'll extract structs
functions = []
@@ -834,7 +840,7 @@ class CodeAnalyzer:
"base_classes": [], # Go uses embedding, not inheritance
"methods": [], # Methods extracted separately
"docstring": None,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
}
)
@@ -867,7 +873,7 @@ class CodeAnalyzer:
"parameters": params,
"return_type": return_type,
"docstring": None,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
"is_async": False, # Go uses goroutines differently
"is_method": is_method,
"decorators": [],
@@ -929,6 +935,7 @@ class CodeAnalyzer:
Regex patterns based on Rust language reference:
https://doc.rust-lang.org/reference/
"""
self._newline_offsets = self._build_line_index(content)
classes = [] # Rust uses structs/enums/traits
functions = []
@@ -943,7 +950,7 @@ class CodeAnalyzer:
"base_classes": [], # Rust uses traits, not inheritance
"methods": [],
"docstring": None,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
}
)
@@ -964,7 +971,7 @@ class CodeAnalyzer:
"parameters": params,
"return_type": return_type,
"docstring": None,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
"is_async": is_async,
"is_method": False,
"decorators": [],
@@ -1016,7 +1023,7 @@ class CodeAnalyzer:
# Single-line comments (//)
for match in re.finditer(r"//(.+)$", content, re.MULTILINE):
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
comment_text = match.group(1).strip()
# Distinguish doc comments (/// or //!)
@@ -1030,7 +1037,7 @@ class CodeAnalyzer:
# Multi-line comments (/* */)
for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL):
start_line = content[: match.start()].count("\n") + 1
start_line = self._offset_to_line(match.start())
comment_text = match.group(1).strip()
comments.append({"line": start_line, "text": comment_text, "type": "block"})
@@ -1047,6 +1054,7 @@ class CodeAnalyzer:
Regex patterns based on Java language specification:
https://docs.oracle.com/javase/specs/
"""
self._newline_offsets = self._build_line_index(content)
classes = []
functions = []
@@ -1089,7 +1097,7 @@ class CodeAnalyzer:
"base_classes": base_classes,
"methods": methods,
"docstring": None,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
}
)
@@ -1112,7 +1120,7 @@ class CodeAnalyzer:
"parameters": params,
"return_type": return_type,
"docstring": None,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
"is_async": False,
"is_method": False,
"decorators": [],
@@ -1221,14 +1229,14 @@ class CodeAnalyzer:
# Single-line comments (//)
for match in re.finditer(r"//(.+)$", content, re.MULTILINE):
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
comment_text = match.group(1).strip()
comments.append({"line": line_num, "text": comment_text, "type": "inline"})
# Multi-line and JavaDoc comments (/* */ and /** */)
for match in re.finditer(r"/\*\*?(.+?)\*/", content, re.DOTALL):
start_line = content[: match.start()].count("\n") + 1
start_line = self._offset_to_line(match.start())
comment_text = match.group(1).strip()
# Distinguish JavaDoc (starts with **)
@@ -1248,6 +1256,7 @@ class CodeAnalyzer:
Regex patterns based on Ruby language documentation:
https://ruby-doc.org/
"""
self._newline_offsets = self._build_line_index(content)
classes = []
functions = []
@@ -1265,7 +1274,7 @@ class CodeAnalyzer:
"base_classes": base_classes,
"methods": [], # Would need to parse class body
"docstring": None,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
}
)
@@ -1284,7 +1293,7 @@ class CodeAnalyzer:
"parameters": params,
"return_type": None, # Ruby has no type annotations (usually)
"docstring": None,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
"is_async": False,
"is_method": False,
"decorators": [],
@@ -1365,6 +1374,7 @@ class CodeAnalyzer:
Regex patterns based on PHP language reference:
https://www.php.net/manual/en/langref.php
"""
self._newline_offsets = self._build_line_index(content)
classes = []
functions = []
@@ -1406,7 +1416,7 @@ class CodeAnalyzer:
"base_classes": base_classes,
"methods": methods,
"docstring": None,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
}
)
@@ -1425,7 +1435,7 @@ class CodeAnalyzer:
"parameters": params,
"return_type": return_type,
"docstring": None,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
"is_async": False,
"is_method": False,
"decorators": [],
@@ -1526,14 +1536,14 @@ class CodeAnalyzer:
# Single-line comments (// and #)
for match in re.finditer(r"(?://|#)(.+)$", content, re.MULTILINE):
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
comment_text = match.group(1).strip()
comments.append({"line": line_num, "text": comment_text, "type": "inline"})
# Multi-line and PHPDoc comments (/* */ and /** */)
for match in re.finditer(r"/\*\*?(.+?)\*/", content, re.DOTALL):
start_line = content[: match.start()].count("\n") + 1
start_line = self._offset_to_line(match.start())
comment_text = match.group(1).strip()
# Distinguish PHPDoc (starts with **)
@@ -1708,6 +1718,7 @@ class CodeAnalyzer:
- @export var speed: float = 100.0
- @onready var sprite = $Sprite2D
"""
self._newline_offsets = self._build_line_index(content)
classes = []
functions = []
signals = []
@@ -1764,7 +1775,7 @@ class CodeAnalyzer:
"name": func_name,
"parameters": param_list,
"return_type": return_type,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
}
)
@@ -1774,7 +1785,7 @@ class CodeAnalyzer:
for match in re.finditer(r"signal\s+(\w+)(?:\(([^)]*)\))?", content):
signal_name, params = match.groups()
line_number = content[: match.start()].count("\n") + 1
line_number = self._offset_to_line(match.start())
# Extract documentation comment above signal (## or #)
doc_comment = None
@@ -1800,7 +1811,7 @@ class CodeAnalyzer:
{
"signal": signal_path,
"handler": handler.strip(),
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
}
)
@@ -1811,7 +1822,7 @@ class CodeAnalyzer:
{
"signal": signal_path,
"arguments": args.strip() if args else "",
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
}
)
@@ -1827,7 +1838,7 @@ class CodeAnalyzer:
"type": var_type,
"default": default,
"export_hint": hint,
"line_number": content[: match.start()].count("\n") + 1,
"line_number": self._offset_to_line(match.start()),
}
)

View File

@@ -28,6 +28,7 @@ import argparse
import json
import logging
import os
import re
import sys
from pathlib import Path
from typing import Any
@@ -380,8 +381,6 @@ def extract_markdown_structure(content: str) -> dict[str, Any]:
Returns:
Dictionary with extracted structure
"""
import re
structure = {
"title": None,
"headers": [],
@@ -526,8 +525,6 @@ def extract_rst_structure(content: str) -> dict[str, Any]:
logger.warning(f"Enhanced RST parser failed: {e}, using basic parser")
# Legacy basic extraction (fallback)
import re
structure = {
"title": None,
"headers": [],
@@ -679,6 +676,17 @@ def process_markdown_docs(
processed_docs = []
categories = {}
# Pre-import parsers once outside the loop
_rst_parser_cls = None
_md_parser_cls = None
try:
from skill_seekers.cli.parsers.extractors import RstParser, MarkdownParser
_rst_parser_cls = RstParser
_md_parser_cls = MarkdownParser
except ImportError:
logger.debug("Unified parsers not available, using legacy parsers")
for md_path in md_files:
try:
content = md_path.read_text(encoding="utf-8", errors="ignore")
@@ -701,7 +709,10 @@ def process_markdown_docs(
parsed_doc = None
try:
from skill_seekers.cli.parsers.extractors import RstParser, MarkdownParser
RstParser = _rst_parser_cls
MarkdownParser = _md_parser_cls
if RstParser is None or MarkdownParser is None:
raise ImportError("Parsers not available")
# Use appropriate unified parser based on file extension
if md_path.suffix.lower() in RST_EXTENSIONS:
@@ -957,8 +968,6 @@ Return JSON with format:
# Parse response and merge enhancements
try:
import re
json_match = re.search(r"\{.*\}", response.content[0].text, re.DOTALL)
if json_match:
enhancements = json.loads(json_match.group())
@@ -1022,8 +1031,6 @@ Output JSON only:
os.unlink(prompt_file)
if result.returncode == 0 and result.stdout:
import re
json_match = re.search(r"\{.*\}", result.stdout, re.DOTALL)
if json_match:
enhancements = json.loads(json_match.group())

View File

@@ -40,6 +40,7 @@ Credits:
"""
import ast
import bisect
import logging
import re
from dataclasses import dataclass, field
@@ -95,6 +96,16 @@ class DependencyAnalyzer:
self.graph = nx.DiGraph() # Directed graph for dependencies
self.file_dependencies: dict[str, list[DependencyInfo]] = {}
self.file_nodes: dict[str, FileNode] = {}
self._newline_offsets: list[int] = []
@staticmethod
def _build_line_index(content: str) -> list[int]:
"""Build a sorted list of newline positions for O(log n) line lookups."""
return [i for i, ch in enumerate(content) if ch == "\n"]
def _offset_to_line(self, offset: int) -> int:
"""Convert a character offset to a 1-based line number using bisect."""
return bisect.bisect_left(self._newline_offsets, offset) + 1
def analyze_file(self, file_path: str, content: str, language: str) -> list[DependencyInfo]:
"""
@@ -109,6 +120,9 @@ class DependencyAnalyzer:
Returns:
List of DependencyInfo objects
"""
# Build line index once for O(log n) lookups in all extractors
self._newline_offsets = self._build_line_index(content)
if language == "Python":
deps = self._extract_python_imports(content, file_path)
elif language == "GDScript":
@@ -216,7 +230,7 @@ class DependencyAnalyzer:
preload_pattern = r'(?:const|var)\s+\w+\s*=\s*preload\("(.+?)"\)'
for match in re.finditer(preload_pattern, content):
resource_path = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
# Convert res:// paths to relative
if resource_path.startswith("res://"):
@@ -236,7 +250,7 @@ class DependencyAnalyzer:
load_pattern = r'(?:const|var)\s+\w+\s*=\s*load\("(.+?)"\)'
for match in re.finditer(load_pattern, content):
resource_path = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
if resource_path.startswith("res://"):
resource_path = resource_path[6:]
@@ -255,7 +269,7 @@ class DependencyAnalyzer:
extends_path_pattern = r'extends\s+"(.+?)"'
for match in re.finditer(extends_path_pattern, content):
resource_path = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
if resource_path.startswith("res://"):
resource_path = resource_path[6:]
@@ -275,7 +289,7 @@ class DependencyAnalyzer:
extends_class_pattern = r"extends\s+([A-Z]\w+)"
for match in re.finditer(extends_class_pattern, content):
class_name = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
# Skip built-in Godot classes (Node, Resource, etc.)
if class_name not in (
@@ -334,7 +348,7 @@ class DependencyAnalyzer:
import_pattern = r"import\s+(?:[\w\s{},*]+\s+from\s+)?['\"]([^'\"]+)['\"]"
for match in re.finditer(import_pattern, content):
module = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
is_relative = module.startswith(".") or module.startswith("/")
deps.append(
@@ -351,7 +365,7 @@ class DependencyAnalyzer:
require_pattern = r"require\s*\(['\"]([^'\"]+)['\"]\)"
for match in re.finditer(require_pattern, content):
module = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
is_relative = module.startswith(".") or module.startswith("/")
deps.append(
@@ -380,7 +394,7 @@ class DependencyAnalyzer:
include_pattern = r'#include\s+[<"]([^>"]+)[>"]'
for match in re.finditer(include_pattern, content):
header = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
# Headers with "" are usually local, <> are system headers
is_relative = '"' in match.group(0)
@@ -417,7 +431,7 @@ class DependencyAnalyzer:
for match in re.finditer(using_pattern, content):
alias = match.group(1) # Optional alias
namespace = match.group(2)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
# Skip 'using' statements for IDisposable (using var x = ...)
if "=" in match.group(0) and not alias:
@@ -454,7 +468,7 @@ class DependencyAnalyzer:
for match in re.finditer(single_import_pattern, content):
match.group(1) # Optional alias
package = match.group(2)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
# Check if relative (starts with ./ or ../)
is_relative = package.startswith("./")
@@ -516,7 +530,7 @@ class DependencyAnalyzer:
use_pattern = r"use\s+([\w:{}]+(?:\s*,\s*[\w:{}]+)*|[\w:]+::\{[^}]+\})\s*;"
for match in re.finditer(use_pattern, content):
module_path = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
# Determine if relative
is_relative = module_path.startswith(("self::", "super::"))
@@ -571,7 +585,7 @@ class DependencyAnalyzer:
import_pattern = r"import\s+(?:static\s+)?([A-Za-z_][\w.]*(?:\.\*)?)\s*;"
for match in re.finditer(import_pattern, content):
import_path = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
deps.append(
DependencyInfo(
@@ -603,7 +617,7 @@ class DependencyAnalyzer:
require_pattern = r"require\s+['\"]([^'\"]+)['\"]"
for match in re.finditer(require_pattern, content):
module = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
deps.append(
DependencyInfo(
@@ -619,7 +633,7 @@ class DependencyAnalyzer:
require_relative_pattern = r"require_relative\s+['\"]([^'\"]+)['\"]"
for match in re.finditer(require_relative_pattern, content):
module = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
deps.append(
DependencyInfo(
@@ -635,7 +649,7 @@ class DependencyAnalyzer:
load_pattern = r"load\s+['\"]([^'\"]+)['\"]"
for match in re.finditer(load_pattern, content):
module = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
deps.append(
DependencyInfo(
@@ -669,7 +683,7 @@ class DependencyAnalyzer:
require_pattern = r"(?:require|include)(?:_once)?\s+['\"]([^'\"]+)['\"]"
for match in re.finditer(require_pattern, content):
module = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
# Determine import type
import_type = "require" if "require" in match.group(0) else "include"
@@ -691,7 +705,7 @@ class DependencyAnalyzer:
use_pattern = r"use\s+([A-Za-z_][\w\\]*)\s*(?:as\s+\w+)?\s*;"
for match in re.finditer(use_pattern, content):
namespace = match.group(1)
line_num = content[: match.start()].count("\n") + 1
line_num = self._offset_to_line(match.start())
deps.append(
DependencyInfo(
@@ -908,7 +922,7 @@ class DependencyAnalyzer:
source_file=file_path,
imported_module=resource_path,
import_type="ext_resource",
line_number=content[: match.start()].count("\n") + 1,
line_number=self._offset_to_line(match.start()),
)
)
@@ -924,7 +938,7 @@ class DependencyAnalyzer:
source_file=file_path,
imported_module=resource_path,
import_type="preload",
line_number=content[: match.start()].count("\n") + 1,
line_number=self._offset_to_line(match.start()),
)
)

View File

@@ -64,6 +64,11 @@ FALLBACK_MAIN_SELECTORS = [
"#main-content",
]
# Pre-compiled regex patterns for frequently called methods
_WHITESPACE_RE = re.compile(r"\s+")
_SAFE_TITLE_RE = re.compile(r"[^\w\s-]")
_SAFE_TITLE_SEP_RE = re.compile(r"[-\s]+")
def infer_description_from_docs(
base_url: str, first_page_content: str | None = None, name: str = ""
@@ -188,12 +193,18 @@ class DocToSkillConverter:
# Support multiple starting URLs
start_urls = config.get("start_urls", [self.base_url])
self.pending_urls = deque(start_urls)
self._pending_set: set[str] = set(start_urls) # Shadow set for O(1) membership checks
self.pages: list[dict[str, Any]] = []
self.pages_scraped = 0
# Language detection
self.language_detector = LanguageDetector(min_confidence=0.15)
# Pre-cache URL patterns for faster is_valid_url checks
url_patterns = config.get("url_patterns", {})
self._include_patterns: list[str] = url_patterns.get("include", [])
self._exclude_patterns: list[str] = url_patterns.get("exclude", [])
# Thread-safe lock for parallel scraping
if self.workers > 1:
import threading
@@ -211,6 +222,12 @@ class DocToSkillConverter:
if resume and not dry_run:
self.load_checkpoint()
def _enqueue_url(self, url: str) -> None:
"""Add a URL to the pending queue if not already visited or pending (O(1))."""
if url not in self.visited_urls and url not in self._pending_set:
self._pending_set.add(url)
self.pending_urls.append(url)
def is_valid_url(self, url: str) -> bool:
"""Check if URL should be scraped based on patterns.
@@ -223,14 +240,10 @@ class DocToSkillConverter:
if not url.startswith(self.base_url):
return False
# Include patterns
includes = self.config.get("url_patterns", {}).get("include", [])
if includes and not any(pattern in url for pattern in includes):
if self._include_patterns and not any(pattern in url for pattern in self._include_patterns):
return False
# Exclude patterns
excludes = self.config.get("url_patterns", {}).get("exclude", [])
return not any(pattern in url for pattern in excludes)
return not any(pattern in url for pattern in self._exclude_patterns)
def save_checkpoint(self) -> None:
"""Save progress checkpoint"""
@@ -264,7 +277,9 @@ class DocToSkillConverter:
checkpoint_data = json.load(f)
self.visited_urls = set(checkpoint_data["visited_urls"])
self.pending_urls = deque(checkpoint_data["pending_urls"])
pending = checkpoint_data["pending_urls"]
self.pending_urls = deque(pending)
self._pending_set = set(pending)
self.pages_scraped = checkpoint_data["pages_scraped"]
logger.info("✅ Resumed from checkpoint")
@@ -337,11 +352,13 @@ class DocToSkillConverter:
# Extract links from entire page (always, even if main content not found).
# This allows discovery of navigation links outside the main content area.
seen_links: set[str] = set()
for link in soup.find_all("a", href=True):
href = urljoin(url, link["href"])
# Strip anchor fragments to avoid treating #anchors as separate pages
href = href.split("#")[0]
if self.is_valid_url(href) and href not in page["links"]:
if href not in seen_links and self.is_valid_url(href):
seen_links.add(href)
page["links"].append(href)
# Find main content using shared fallback logic
@@ -413,8 +430,6 @@ class DocToSkillConverter:
Only .md links are extracted to avoid client-side rendered HTML pages.
Anchor fragments (#section) are stripped from links.
"""
import re
# Detect if content is actually HTML (some .md URLs return HTML)
if content.strip().startswith("<!DOCTYPE") or content.strip().startswith("<html"):
return self._extract_html_as_markdown(content, url)
@@ -649,8 +664,7 @@ class DocToSkillConverter:
def clean_text(self, text: str) -> str:
"""Clean text content"""
text = re.sub(r"\s+", " ", text)
return text.strip()
return _WHITESPACE_RE.sub(" ", text).strip()
def save_page(self, page: dict[str, Any]) -> None:
"""Save page data (skip pages with empty content)"""
@@ -660,8 +674,8 @@ class DocToSkillConverter:
return
url_hash = hashlib.md5(page["url"].encode()).hexdigest()[:10]
safe_title = re.sub(r"[^\w\s-]", "", page["title"])[:50]
safe_title = re.sub(r"[-\s]+", "_", safe_title)
safe_title = _SAFE_TITLE_RE.sub("", page["title"])[:50]
safe_title = _SAFE_TITLE_SEP_RE.sub("_", safe_title)
filename = f"{safe_title}_{url_hash}.json"
filepath = os.path.join(self.data_dir, "pages", filename)
@@ -695,27 +709,19 @@ class DocToSkillConverter:
soup = BeautifulSoup(response.content, "html.parser")
page = self.extract_content(soup, url)
# Thread-safe operations (lock required)
if self.workers > 1:
with self.lock:
logger.info(" %s", url)
self.save_page(page)
self.pages.append(page)
# Add new URLs
for link in page["links"]:
if link not in self.visited_urls and link not in self.pending_urls:
self.pending_urls.append(link)
else:
# Single-threaded mode (no lock needed)
# Store results (thread-safe when workers > 1)
def _store_results():
logger.info(" %s", url)
self.save_page(page)
self.pages.append(page)
# Add new URLs
for link in page["links"]:
if link not in self.visited_urls and link not in self.pending_urls:
self.pending_urls.append(link)
self._enqueue_url(link)
if self.workers > 1:
with self.lock:
_store_results()
else:
_store_results()
# Rate limiting
rate_limit = self.config.get("rate_limit", DEFAULT_RATE_LIMIT)
@@ -766,8 +772,7 @@ class DocToSkillConverter:
# Add new URLs
for link in page["links"]:
if link not in self.visited_urls and link not in self.pending_urls:
self.pending_urls.append(link)
self._enqueue_url(link)
# Rate limiting
rate_limit = self.config.get("rate_limit", DEFAULT_RATE_LIMIT)
@@ -924,8 +929,8 @@ class DocToSkillConverter:
# Filter URLs based on url_patterns config
for url in md_urls:
if self.is_valid_url(url) and url not in self.visited_urls:
self.pending_urls.append(url)
if self.is_valid_url(url):
self._enqueue_url(url)
logger.info(
" 📋 %d URLs added to crawl queue after filtering",
@@ -1010,8 +1015,8 @@ class DocToSkillConverter:
# Filter URLs based on url_patterns config
for url in md_urls:
if self.is_valid_url(url) and url not in self.visited_urls:
self.pending_urls.append(url)
if self.is_valid_url(url):
self._enqueue_url(url)
logger.info(
" 📋 %d URLs added to crawl queue after filtering",
@@ -1115,8 +1120,8 @@ class DocToSkillConverter:
for link in soup.find_all("a", href=True):
href = urljoin(url, link["href"])
href = href.split("#")[0]
if self.is_valid_url(href) and href not in self.visited_urls:
self.pending_urls.append(href)
if self.is_valid_url(href):
self._enqueue_url(href)
except Exception as e:
# Failed to extract links in fast mode, continue anyway
logger.warning("⚠️ Warning: Could not extract links from %s: %s", url, e)
@@ -1299,8 +1304,8 @@ class DocToSkillConverter:
for link in soup.find_all("a", href=True):
href = urljoin(url, link["href"])
href = href.split("#")[0]
if self.is_valid_url(href) and href not in self.visited_urls:
self.pending_urls.append(href)
if self.is_valid_url(href):
self._enqueue_url(href)
except Exception as e:
logger.warning(
"⚠️ Warning: Could not extract links from %s: %s", url, e
@@ -1313,7 +1318,12 @@ class DocToSkillConverter:
# Wait for batch to complete before continuing
if tasks:
await asyncio.gather(*tasks, return_exceptions=True)
results = await asyncio.gather(*tasks, return_exceptions=True)
for result in results:
if isinstance(result, Exception):
logger.error(
" ✗ Async task failed: %s: %s", type(result).__name__, result
)
tasks = []
self.pages_scraped = len(self.visited_urls)
@@ -1331,7 +1341,10 @@ class DocToSkillConverter:
# Wait for any remaining tasks
if tasks:
await asyncio.gather(*tasks, return_exceptions=True)
results = await asyncio.gather(*tasks, return_exceptions=True)
for result in results:
if isinstance(result, Exception):
logger.error(" ✗ Async task failed: %s: %s", type(result).__name__, result)
if self.dry_run:
logger.info("\n✅ Dry run complete: would scrape ~%d pages", len(self.visited_urls))
@@ -1356,8 +1369,11 @@ class DocToSkillConverter:
"pages": [{"title": p["title"], "url": p["url"]} for p in self.pages],
}
with open(f"{self.data_dir}/summary.json", "w", encoding="utf-8") as f:
json.dump(summary, f, indent=2, ensure_ascii=False)
try:
with open(f"{self.data_dir}/summary.json", "w", encoding="utf-8") as f:
json.dump(summary, f, indent=2, ensure_ascii=False)
except OSError as e:
logger.error(" ✗ Failed to save summary: %s", e)
def load_scraped_data(self) -> list[dict[str, Any]]:
"""Load previously scraped data"""
@@ -1395,6 +1411,11 @@ class DocToSkillConverter:
categories: dict[str, list[dict[str, Any]]] = {cat: [] for cat in category_defs}
categories["other"] = []
# Pre-lowercase keywords once instead of per-page per-keyword
lowered_defs = {
cat: [kw.lower() for kw in keywords] for cat, keywords in category_defs.items()
}
for page in pages:
url = page["url"].lower()
title = page["title"].lower()
@@ -1404,11 +1425,10 @@ class DocToSkillConverter:
categorized = False
# Match against keywords
for cat, keywords in category_defs.items():
# Match against pre-lowercased keywords
for cat, keywords in lowered_defs.items():
score = 0
for keyword in keywords:
keyword = keyword.lower()
if keyword in url:
score += 3
if keyword in title:
@@ -1450,15 +1470,12 @@ class DocToSkillConverter:
if count >= 3: # At least 3 pages
categories[seg] = [seg]
# Add common defaults
if "tutorial" not in categories and any(
"tutorial" in url for url in [p["url"] for p in pages]
):
# Add common defaults (use pre-built URL list to avoid repeated comprehensions)
all_urls = [p["url"] for p in pages]
if "tutorial" not in categories and any("tutorial" in url for url in all_urls):
categories["tutorials"] = ["tutorial", "guide", "getting-started"]
if "api" not in categories and any(
"api" in url or "reference" in url for url in [p["url"] for p in pages]
):
if "api" not in categories and any("api" in url or "reference" in url for url in all_urls):
categories["api"] = ["api", "reference", "class"]
return categories

View File

@@ -15,6 +15,7 @@ Usage:
"""
import argparse
import fnmatch
import json
import logging
import os
@@ -664,11 +665,13 @@ class GitHubScraper:
def _extract_file_tree_github(self):
"""Extract file tree from GitHub API (rate-limited)."""
try:
contents = self.repo.get_contents("")
from collections import deque
contents = deque(self.repo.get_contents(""))
file_tree = []
while contents:
file_content = contents.pop(0)
file_content = contents.popleft()
file_info = {
"path": file_content.path,
@@ -741,11 +744,10 @@ class GitHubScraper:
continue
# Check if file matches patterns (if specified)
if self.file_patterns:
import fnmatch
if not any(fnmatch.fnmatch(file_path, pattern) for pattern in self.file_patterns):
continue
if self.file_patterns and not any(
fnmatch.fnmatch(file_path, pattern) for pattern in self.file_patterns
):
continue
# Analyze this file
try: