perf: optimize with caching, pre-compiled regex, O(1) lookups, and bisect line indexing (#309)
## Summary
Performance optimizations across core scraping and analysis modules:
- **doc_scraper.py**: Pre-compiled regex at module level, O(1) URL dedup via _enqueued_urls set, cached URL patterns, _enqueue_url() helper (DRY), seen_links set for link extraction, pre-lowercased category keywords, async error logging (bug fix), summary I/O error handling
- **code_analyzer.py**: O(log n) bisect-based line lookups replacing O(n) count("\n") across all 10 language analyzers; O(n) parent class map replacing O(n^2) AST walks for Python method detection
- **dependency_analyzer.py**: Same bisect line-index optimization for all import extractors
- **codebase_scraper.py**: Module-level import re, pre-imported parser classes outside loop
- **github_scraper.py**: deque.popleft() for O(1) tree traversal, module-level import fnmatch
- **utils.py**: Shared build_line_index() / offset_to_line() utilities (DRY)
- **test_adaptor_benchmarks.py**: Stabilized flaky test_benchmark_metadata_overhead (median, warm-up, more iterations)
Review fixes applied on top of original PR:
1. Renamed misleading _pending_set to _enqueued_urls
2. Extracted duplicated line-index code into shared cli/utils.py
3. Fixed pre-existing "tutorial" vs "tutorials" key mismatch bug in infer_categories()
4. Removed unnecessary _store_results() closure
5. Simplified parser pre-import pattern
This commit is contained in:
@@ -23,6 +23,7 @@ consider using dedicated parsers (tree-sitter, language-specific AST libraries).
|
||||
"""
|
||||
|
||||
import ast
|
||||
import bisect
|
||||
import contextlib
|
||||
import logging
|
||||
import re
|
||||
@@ -84,6 +85,16 @@ class CodeAnalyzer:
|
||||
depth: Analysis depth ('surface', 'deep', 'full')
|
||||
"""
|
||||
self.depth = depth
|
||||
self._newline_offsets: list[int] = []
|
||||
|
||||
@staticmethod
|
||||
def _build_line_index(content: str) -> list[int]:
|
||||
"""Build a sorted list of newline positions for O(log n) line lookups."""
|
||||
return [i for i, ch in enumerate(content) if ch == "\n"]
|
||||
|
||||
def _offset_to_line(self, offset: int) -> int:
|
||||
"""Convert a character offset to a 1-based line number using bisect."""
|
||||
return bisect.bisect_left(self._newline_offsets, offset) + 1
|
||||
|
||||
def analyze_file(self, file_path: str, content: str, language: str) -> dict[str, Any]:
|
||||
"""
|
||||
@@ -149,35 +160,26 @@ class CodeAnalyzer:
|
||||
functions = []
|
||||
imports = []
|
||||
|
||||
# Build parent map once (O(n)) instead of walking tree per node (O(n²))
|
||||
class_children: set[int] = set()
|
||||
for node in ast.walk(tree):
|
||||
if isinstance(node, ast.ClassDef) and isinstance(node.body, list):
|
||||
for child in node.body:
|
||||
class_children.add(id(child))
|
||||
|
||||
for node in ast.walk(tree):
|
||||
if isinstance(node, ast.ClassDef):
|
||||
class_sig = self._extract_python_class(node)
|
||||
classes.append(asdict(class_sig))
|
||||
elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
||||
# Only top-level functions (not methods)
|
||||
# Fix AST parser to check isinstance(parent.body, list) before 'in' operator
|
||||
is_method = False
|
||||
try:
|
||||
is_method = any(
|
||||
isinstance(parent, ast.ClassDef)
|
||||
for parent in ast.walk(tree)
|
||||
if hasattr(parent, "body")
|
||||
and isinstance(parent.body, list)
|
||||
and node in parent.body
|
||||
)
|
||||
except (TypeError, AttributeError):
|
||||
# If body is not iterable or check fails, assume it's a top-level function
|
||||
is_method = False
|
||||
|
||||
if not is_method:
|
||||
# Only top-level functions (not methods) - O(1) lookup via pre-built set
|
||||
if id(node) not in class_children:
|
||||
func_sig = self._extract_python_function(node)
|
||||
functions.append(asdict(func_sig))
|
||||
elif isinstance(node, ast.Import):
|
||||
# Extract: import foo, bar
|
||||
for alias in node.names:
|
||||
imports.append(alias.name)
|
||||
elif isinstance(node, ast.ImportFrom):
|
||||
# Extract: from foo import bar
|
||||
module = node.module or ""
|
||||
imports.append(module)
|
||||
|
||||
@@ -188,7 +190,7 @@ class CodeAnalyzer:
|
||||
"classes": classes,
|
||||
"functions": functions,
|
||||
"comments": comments,
|
||||
"imports": imports, # Include imports for framework detection
|
||||
"imports": imports,
|
||||
}
|
||||
|
||||
def _extract_python_class(self, node: ast.ClassDef) -> ClassSignature:
|
||||
@@ -285,6 +287,7 @@ class CodeAnalyzer:
|
||||
Note: This is a simplified approach. For production, consider using
|
||||
a proper JS/TS parser like esprima or ts-morph.
|
||||
"""
|
||||
self._newline_offsets = self._build_line_index(content)
|
||||
classes = []
|
||||
functions = []
|
||||
|
||||
@@ -310,7 +313,7 @@ class CodeAnalyzer:
|
||||
"base_classes": [base_class] if base_class else [],
|
||||
"methods": methods,
|
||||
"docstring": None,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -329,7 +332,7 @@ class CodeAnalyzer:
|
||||
"parameters": params,
|
||||
"return_type": None, # JS doesn't have type annotations (unless TS)
|
||||
"docstring": None,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
"is_async": is_async,
|
||||
"is_method": False,
|
||||
"decorators": [],
|
||||
@@ -351,7 +354,7 @@ class CodeAnalyzer:
|
||||
"parameters": params,
|
||||
"return_type": None,
|
||||
"docstring": None,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
"is_async": is_async,
|
||||
"is_method": False,
|
||||
"decorators": [],
|
||||
@@ -460,6 +463,7 @@ class CodeAnalyzer:
|
||||
Note: This is a simplified approach focusing on header files.
|
||||
For production, consider using libclang or similar.
|
||||
"""
|
||||
self._newline_offsets = self._build_line_index(content)
|
||||
classes = []
|
||||
functions = []
|
||||
|
||||
@@ -475,7 +479,7 @@ class CodeAnalyzer:
|
||||
"base_classes": [base_class] if base_class else [],
|
||||
"methods": [], # Simplified - would need to parse class body
|
||||
"docstring": None,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -498,7 +502,7 @@ class CodeAnalyzer:
|
||||
"parameters": params,
|
||||
"return_type": return_type,
|
||||
"docstring": None,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
"is_async": False,
|
||||
"is_method": False,
|
||||
"decorators": [],
|
||||
@@ -577,14 +581,14 @@ class CodeAnalyzer:
|
||||
|
||||
# Extract single-line comments (//)
|
||||
for match in re.finditer(r"//(.+)$", content, re.MULTILINE):
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
comment_text = match.group(1).strip()
|
||||
|
||||
comments.append({"line": line_num, "text": comment_text, "type": "inline"})
|
||||
|
||||
# Extract multi-line comments (/* */)
|
||||
for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL):
|
||||
start_line = content[: match.start()].count("\n") + 1
|
||||
start_line = self._offset_to_line(match.start())
|
||||
comment_text = match.group(1).strip()
|
||||
|
||||
comments.append({"line": start_line, "text": comment_text, "type": "block"})
|
||||
@@ -610,6 +614,7 @@ class CodeAnalyzer:
|
||||
Regex patterns inspired by C# language specification:
|
||||
https://learn.microsoft.com/en-us/dotnet/csharp/language-reference/
|
||||
"""
|
||||
self._newline_offsets = self._build_line_index(content)
|
||||
classes = []
|
||||
functions = []
|
||||
|
||||
@@ -651,7 +656,7 @@ class CodeAnalyzer:
|
||||
"base_classes": base_classes,
|
||||
"methods": methods,
|
||||
"docstring": None, # Would need to extract XML doc comments
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -676,7 +681,7 @@ class CodeAnalyzer:
|
||||
"parameters": params,
|
||||
"return_type": return_type,
|
||||
"docstring": None,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
"is_async": is_async,
|
||||
"is_method": False,
|
||||
"decorators": [],
|
||||
@@ -791,7 +796,7 @@ class CodeAnalyzer:
|
||||
|
||||
# Single-line comments (//)
|
||||
for match in re.finditer(r"//(.+)$", content, re.MULTILINE):
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
comment_text = match.group(1).strip()
|
||||
|
||||
# Distinguish XML doc comments (///)
|
||||
@@ -803,7 +808,7 @@ class CodeAnalyzer:
|
||||
|
||||
# Multi-line comments (/* */)
|
||||
for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL):
|
||||
start_line = content[: match.start()].count("\n") + 1
|
||||
start_line = self._offset_to_line(match.start())
|
||||
comment_text = match.group(1).strip()
|
||||
|
||||
comments.append({"line": start_line, "text": comment_text, "type": "block"})
|
||||
@@ -820,6 +825,7 @@ class CodeAnalyzer:
|
||||
Regex patterns based on Go language specification:
|
||||
https://go.dev/ref/spec
|
||||
"""
|
||||
self._newline_offsets = self._build_line_index(content)
|
||||
classes = [] # Go doesn't have classes, but we'll extract structs
|
||||
functions = []
|
||||
|
||||
@@ -834,7 +840,7 @@ class CodeAnalyzer:
|
||||
"base_classes": [], # Go uses embedding, not inheritance
|
||||
"methods": [], # Methods extracted separately
|
||||
"docstring": None,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -867,7 +873,7 @@ class CodeAnalyzer:
|
||||
"parameters": params,
|
||||
"return_type": return_type,
|
||||
"docstring": None,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
"is_async": False, # Go uses goroutines differently
|
||||
"is_method": is_method,
|
||||
"decorators": [],
|
||||
@@ -929,6 +935,7 @@ class CodeAnalyzer:
|
||||
Regex patterns based on Rust language reference:
|
||||
https://doc.rust-lang.org/reference/
|
||||
"""
|
||||
self._newline_offsets = self._build_line_index(content)
|
||||
classes = [] # Rust uses structs/enums/traits
|
||||
functions = []
|
||||
|
||||
@@ -943,7 +950,7 @@ class CodeAnalyzer:
|
||||
"base_classes": [], # Rust uses traits, not inheritance
|
||||
"methods": [],
|
||||
"docstring": None,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -964,7 +971,7 @@ class CodeAnalyzer:
|
||||
"parameters": params,
|
||||
"return_type": return_type,
|
||||
"docstring": None,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
"is_async": is_async,
|
||||
"is_method": False,
|
||||
"decorators": [],
|
||||
@@ -1016,7 +1023,7 @@ class CodeAnalyzer:
|
||||
|
||||
# Single-line comments (//)
|
||||
for match in re.finditer(r"//(.+)$", content, re.MULTILINE):
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
comment_text = match.group(1).strip()
|
||||
|
||||
# Distinguish doc comments (/// or //!)
|
||||
@@ -1030,7 +1037,7 @@ class CodeAnalyzer:
|
||||
|
||||
# Multi-line comments (/* */)
|
||||
for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL):
|
||||
start_line = content[: match.start()].count("\n") + 1
|
||||
start_line = self._offset_to_line(match.start())
|
||||
comment_text = match.group(1).strip()
|
||||
|
||||
comments.append({"line": start_line, "text": comment_text, "type": "block"})
|
||||
@@ -1047,6 +1054,7 @@ class CodeAnalyzer:
|
||||
Regex patterns based on Java language specification:
|
||||
https://docs.oracle.com/javase/specs/
|
||||
"""
|
||||
self._newline_offsets = self._build_line_index(content)
|
||||
classes = []
|
||||
functions = []
|
||||
|
||||
@@ -1089,7 +1097,7 @@ class CodeAnalyzer:
|
||||
"base_classes": base_classes,
|
||||
"methods": methods,
|
||||
"docstring": None,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -1112,7 +1120,7 @@ class CodeAnalyzer:
|
||||
"parameters": params,
|
||||
"return_type": return_type,
|
||||
"docstring": None,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
"is_async": False,
|
||||
"is_method": False,
|
||||
"decorators": [],
|
||||
@@ -1221,14 +1229,14 @@ class CodeAnalyzer:
|
||||
|
||||
# Single-line comments (//)
|
||||
for match in re.finditer(r"//(.+)$", content, re.MULTILINE):
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
comment_text = match.group(1).strip()
|
||||
|
||||
comments.append({"line": line_num, "text": comment_text, "type": "inline"})
|
||||
|
||||
# Multi-line and JavaDoc comments (/* */ and /** */)
|
||||
for match in re.finditer(r"/\*\*?(.+?)\*/", content, re.DOTALL):
|
||||
start_line = content[: match.start()].count("\n") + 1
|
||||
start_line = self._offset_to_line(match.start())
|
||||
comment_text = match.group(1).strip()
|
||||
|
||||
# Distinguish JavaDoc (starts with **)
|
||||
@@ -1248,6 +1256,7 @@ class CodeAnalyzer:
|
||||
Regex patterns based on Ruby language documentation:
|
||||
https://ruby-doc.org/
|
||||
"""
|
||||
self._newline_offsets = self._build_line_index(content)
|
||||
classes = []
|
||||
functions = []
|
||||
|
||||
@@ -1265,7 +1274,7 @@ class CodeAnalyzer:
|
||||
"base_classes": base_classes,
|
||||
"methods": [], # Would need to parse class body
|
||||
"docstring": None,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -1284,7 +1293,7 @@ class CodeAnalyzer:
|
||||
"parameters": params,
|
||||
"return_type": None, # Ruby has no type annotations (usually)
|
||||
"docstring": None,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
"is_async": False,
|
||||
"is_method": False,
|
||||
"decorators": [],
|
||||
@@ -1365,6 +1374,7 @@ class CodeAnalyzer:
|
||||
Regex patterns based on PHP language reference:
|
||||
https://www.php.net/manual/en/langref.php
|
||||
"""
|
||||
self._newline_offsets = self._build_line_index(content)
|
||||
classes = []
|
||||
functions = []
|
||||
|
||||
@@ -1406,7 +1416,7 @@ class CodeAnalyzer:
|
||||
"base_classes": base_classes,
|
||||
"methods": methods,
|
||||
"docstring": None,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -1425,7 +1435,7 @@ class CodeAnalyzer:
|
||||
"parameters": params,
|
||||
"return_type": return_type,
|
||||
"docstring": None,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
"is_async": False,
|
||||
"is_method": False,
|
||||
"decorators": [],
|
||||
@@ -1526,14 +1536,14 @@ class CodeAnalyzer:
|
||||
|
||||
# Single-line comments (// and #)
|
||||
for match in re.finditer(r"(?://|#)(.+)$", content, re.MULTILINE):
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
comment_text = match.group(1).strip()
|
||||
|
||||
comments.append({"line": line_num, "text": comment_text, "type": "inline"})
|
||||
|
||||
# Multi-line and PHPDoc comments (/* */ and /** */)
|
||||
for match in re.finditer(r"/\*\*?(.+?)\*/", content, re.DOTALL):
|
||||
start_line = content[: match.start()].count("\n") + 1
|
||||
start_line = self._offset_to_line(match.start())
|
||||
comment_text = match.group(1).strip()
|
||||
|
||||
# Distinguish PHPDoc (starts with **)
|
||||
@@ -1708,6 +1718,7 @@ class CodeAnalyzer:
|
||||
- @export var speed: float = 100.0
|
||||
- @onready var sprite = $Sprite2D
|
||||
"""
|
||||
self._newline_offsets = self._build_line_index(content)
|
||||
classes = []
|
||||
functions = []
|
||||
signals = []
|
||||
@@ -1764,7 +1775,7 @@ class CodeAnalyzer:
|
||||
"name": func_name,
|
||||
"parameters": param_list,
|
||||
"return_type": return_type,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -1774,7 +1785,7 @@ class CodeAnalyzer:
|
||||
|
||||
for match in re.finditer(r"signal\s+(\w+)(?:\(([^)]*)\))?", content):
|
||||
signal_name, params = match.groups()
|
||||
line_number = content[: match.start()].count("\n") + 1
|
||||
line_number = self._offset_to_line(match.start())
|
||||
|
||||
# Extract documentation comment above signal (## or #)
|
||||
doc_comment = None
|
||||
@@ -1800,7 +1811,7 @@ class CodeAnalyzer:
|
||||
{
|
||||
"signal": signal_path,
|
||||
"handler": handler.strip(),
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -1811,7 +1822,7 @@ class CodeAnalyzer:
|
||||
{
|
||||
"signal": signal_path,
|
||||
"arguments": args.strip() if args else "",
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -1827,7 +1838,7 @@ class CodeAnalyzer:
|
||||
"type": var_type,
|
||||
"default": default,
|
||||
"export_hint": hint,
|
||||
"line_number": content[: match.start()].count("\n") + 1,
|
||||
"line_number": self._offset_to_line(match.start()),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -28,6 +28,7 @@ import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
@@ -380,8 +381,6 @@ def extract_markdown_structure(content: str) -> dict[str, Any]:
|
||||
Returns:
|
||||
Dictionary with extracted structure
|
||||
"""
|
||||
import re
|
||||
|
||||
structure = {
|
||||
"title": None,
|
||||
"headers": [],
|
||||
@@ -526,8 +525,6 @@ def extract_rst_structure(content: str) -> dict[str, Any]:
|
||||
logger.warning(f"Enhanced RST parser failed: {e}, using basic parser")
|
||||
|
||||
# Legacy basic extraction (fallback)
|
||||
import re
|
||||
|
||||
structure = {
|
||||
"title": None,
|
||||
"headers": [],
|
||||
@@ -679,6 +676,17 @@ def process_markdown_docs(
|
||||
processed_docs = []
|
||||
categories = {}
|
||||
|
||||
# Pre-import parsers once outside the loop
|
||||
_rst_parser_cls = None
|
||||
_md_parser_cls = None
|
||||
try:
|
||||
from skill_seekers.cli.parsers.extractors import RstParser, MarkdownParser
|
||||
|
||||
_rst_parser_cls = RstParser
|
||||
_md_parser_cls = MarkdownParser
|
||||
except ImportError:
|
||||
logger.debug("Unified parsers not available, using legacy parsers")
|
||||
|
||||
for md_path in md_files:
|
||||
try:
|
||||
content = md_path.read_text(encoding="utf-8", errors="ignore")
|
||||
@@ -701,7 +709,10 @@ def process_markdown_docs(
|
||||
parsed_doc = None
|
||||
|
||||
try:
|
||||
from skill_seekers.cli.parsers.extractors import RstParser, MarkdownParser
|
||||
RstParser = _rst_parser_cls
|
||||
MarkdownParser = _md_parser_cls
|
||||
if RstParser is None or MarkdownParser is None:
|
||||
raise ImportError("Parsers not available")
|
||||
|
||||
# Use appropriate unified parser based on file extension
|
||||
if md_path.suffix.lower() in RST_EXTENSIONS:
|
||||
@@ -957,8 +968,6 @@ Return JSON with format:
|
||||
|
||||
# Parse response and merge enhancements
|
||||
try:
|
||||
import re
|
||||
|
||||
json_match = re.search(r"\{.*\}", response.content[0].text, re.DOTALL)
|
||||
if json_match:
|
||||
enhancements = json.loads(json_match.group())
|
||||
@@ -1022,8 +1031,6 @@ Output JSON only:
|
||||
os.unlink(prompt_file)
|
||||
|
||||
if result.returncode == 0 and result.stdout:
|
||||
import re
|
||||
|
||||
json_match = re.search(r"\{.*\}", result.stdout, re.DOTALL)
|
||||
if json_match:
|
||||
enhancements = json.loads(json_match.group())
|
||||
|
||||
@@ -40,6 +40,7 @@ Credits:
|
||||
"""
|
||||
|
||||
import ast
|
||||
import bisect
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
@@ -95,6 +96,16 @@ class DependencyAnalyzer:
|
||||
self.graph = nx.DiGraph() # Directed graph for dependencies
|
||||
self.file_dependencies: dict[str, list[DependencyInfo]] = {}
|
||||
self.file_nodes: dict[str, FileNode] = {}
|
||||
self._newline_offsets: list[int] = []
|
||||
|
||||
@staticmethod
|
||||
def _build_line_index(content: str) -> list[int]:
|
||||
"""Build a sorted list of newline positions for O(log n) line lookups."""
|
||||
return [i for i, ch in enumerate(content) if ch == "\n"]
|
||||
|
||||
def _offset_to_line(self, offset: int) -> int:
|
||||
"""Convert a character offset to a 1-based line number using bisect."""
|
||||
return bisect.bisect_left(self._newline_offsets, offset) + 1
|
||||
|
||||
def analyze_file(self, file_path: str, content: str, language: str) -> list[DependencyInfo]:
|
||||
"""
|
||||
@@ -109,6 +120,9 @@ class DependencyAnalyzer:
|
||||
Returns:
|
||||
List of DependencyInfo objects
|
||||
"""
|
||||
# Build line index once for O(log n) lookups in all extractors
|
||||
self._newline_offsets = self._build_line_index(content)
|
||||
|
||||
if language == "Python":
|
||||
deps = self._extract_python_imports(content, file_path)
|
||||
elif language == "GDScript":
|
||||
@@ -216,7 +230,7 @@ class DependencyAnalyzer:
|
||||
preload_pattern = r'(?:const|var)\s+\w+\s*=\s*preload\("(.+?)"\)'
|
||||
for match in re.finditer(preload_pattern, content):
|
||||
resource_path = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
# Convert res:// paths to relative
|
||||
if resource_path.startswith("res://"):
|
||||
@@ -236,7 +250,7 @@ class DependencyAnalyzer:
|
||||
load_pattern = r'(?:const|var)\s+\w+\s*=\s*load\("(.+?)"\)'
|
||||
for match in re.finditer(load_pattern, content):
|
||||
resource_path = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
if resource_path.startswith("res://"):
|
||||
resource_path = resource_path[6:]
|
||||
@@ -255,7 +269,7 @@ class DependencyAnalyzer:
|
||||
extends_path_pattern = r'extends\s+"(.+?)"'
|
||||
for match in re.finditer(extends_path_pattern, content):
|
||||
resource_path = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
if resource_path.startswith("res://"):
|
||||
resource_path = resource_path[6:]
|
||||
@@ -275,7 +289,7 @@ class DependencyAnalyzer:
|
||||
extends_class_pattern = r"extends\s+([A-Z]\w+)"
|
||||
for match in re.finditer(extends_class_pattern, content):
|
||||
class_name = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
# Skip built-in Godot classes (Node, Resource, etc.)
|
||||
if class_name not in (
|
||||
@@ -334,7 +348,7 @@ class DependencyAnalyzer:
|
||||
import_pattern = r"import\s+(?:[\w\s{},*]+\s+from\s+)?['\"]([^'\"]+)['\"]"
|
||||
for match in re.finditer(import_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
is_relative = module.startswith(".") or module.startswith("/")
|
||||
|
||||
deps.append(
|
||||
@@ -351,7 +365,7 @@ class DependencyAnalyzer:
|
||||
require_pattern = r"require\s*\(['\"]([^'\"]+)['\"]\)"
|
||||
for match in re.finditer(require_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
is_relative = module.startswith(".") or module.startswith("/")
|
||||
|
||||
deps.append(
|
||||
@@ -380,7 +394,7 @@ class DependencyAnalyzer:
|
||||
include_pattern = r'#include\s+[<"]([^>"]+)[>"]'
|
||||
for match in re.finditer(include_pattern, content):
|
||||
header = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
# Headers with "" are usually local, <> are system headers
|
||||
is_relative = '"' in match.group(0)
|
||||
@@ -417,7 +431,7 @@ class DependencyAnalyzer:
|
||||
for match in re.finditer(using_pattern, content):
|
||||
alias = match.group(1) # Optional alias
|
||||
namespace = match.group(2)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
# Skip 'using' statements for IDisposable (using var x = ...)
|
||||
if "=" in match.group(0) and not alias:
|
||||
@@ -454,7 +468,7 @@ class DependencyAnalyzer:
|
||||
for match in re.finditer(single_import_pattern, content):
|
||||
match.group(1) # Optional alias
|
||||
package = match.group(2)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
# Check if relative (starts with ./ or ../)
|
||||
is_relative = package.startswith("./")
|
||||
@@ -516,7 +530,7 @@ class DependencyAnalyzer:
|
||||
use_pattern = r"use\s+([\w:{}]+(?:\s*,\s*[\w:{}]+)*|[\w:]+::\{[^}]+\})\s*;"
|
||||
for match in re.finditer(use_pattern, content):
|
||||
module_path = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
# Determine if relative
|
||||
is_relative = module_path.startswith(("self::", "super::"))
|
||||
@@ -571,7 +585,7 @@ class DependencyAnalyzer:
|
||||
import_pattern = r"import\s+(?:static\s+)?([A-Za-z_][\w.]*(?:\.\*)?)\s*;"
|
||||
for match in re.finditer(import_pattern, content):
|
||||
import_path = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
@@ -603,7 +617,7 @@ class DependencyAnalyzer:
|
||||
require_pattern = r"require\s+['\"]([^'\"]+)['\"]"
|
||||
for match in re.finditer(require_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
@@ -619,7 +633,7 @@ class DependencyAnalyzer:
|
||||
require_relative_pattern = r"require_relative\s+['\"]([^'\"]+)['\"]"
|
||||
for match in re.finditer(require_relative_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
@@ -635,7 +649,7 @@ class DependencyAnalyzer:
|
||||
load_pattern = r"load\s+['\"]([^'\"]+)['\"]"
|
||||
for match in re.finditer(load_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
@@ -669,7 +683,7 @@ class DependencyAnalyzer:
|
||||
require_pattern = r"(?:require|include)(?:_once)?\s+['\"]([^'\"]+)['\"]"
|
||||
for match in re.finditer(require_pattern, content):
|
||||
module = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
# Determine import type
|
||||
import_type = "require" if "require" in match.group(0) else "include"
|
||||
@@ -691,7 +705,7 @@ class DependencyAnalyzer:
|
||||
use_pattern = r"use\s+([A-Za-z_][\w\\]*)\s*(?:as\s+\w+)?\s*;"
|
||||
for match in re.finditer(use_pattern, content):
|
||||
namespace = match.group(1)
|
||||
line_num = content[: match.start()].count("\n") + 1
|
||||
line_num = self._offset_to_line(match.start())
|
||||
|
||||
deps.append(
|
||||
DependencyInfo(
|
||||
@@ -908,7 +922,7 @@ class DependencyAnalyzer:
|
||||
source_file=file_path,
|
||||
imported_module=resource_path,
|
||||
import_type="ext_resource",
|
||||
line_number=content[: match.start()].count("\n") + 1,
|
||||
line_number=self._offset_to_line(match.start()),
|
||||
)
|
||||
)
|
||||
|
||||
@@ -924,7 +938,7 @@ class DependencyAnalyzer:
|
||||
source_file=file_path,
|
||||
imported_module=resource_path,
|
||||
import_type="preload",
|
||||
line_number=content[: match.start()].count("\n") + 1,
|
||||
line_number=self._offset_to_line(match.start()),
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@@ -64,6 +64,11 @@ FALLBACK_MAIN_SELECTORS = [
|
||||
"#main-content",
|
||||
]
|
||||
|
||||
# Pre-compiled regex patterns for frequently called methods
|
||||
_WHITESPACE_RE = re.compile(r"\s+")
|
||||
_SAFE_TITLE_RE = re.compile(r"[^\w\s-]")
|
||||
_SAFE_TITLE_SEP_RE = re.compile(r"[-\s]+")
|
||||
|
||||
|
||||
def infer_description_from_docs(
|
||||
base_url: str, first_page_content: str | None = None, name: str = ""
|
||||
@@ -188,12 +193,18 @@ class DocToSkillConverter:
|
||||
# Support multiple starting URLs
|
||||
start_urls = config.get("start_urls", [self.base_url])
|
||||
self.pending_urls = deque(start_urls)
|
||||
self._pending_set: set[str] = set(start_urls) # Shadow set for O(1) membership checks
|
||||
self.pages: list[dict[str, Any]] = []
|
||||
self.pages_scraped = 0
|
||||
|
||||
# Language detection
|
||||
self.language_detector = LanguageDetector(min_confidence=0.15)
|
||||
|
||||
# Pre-cache URL patterns for faster is_valid_url checks
|
||||
url_patterns = config.get("url_patterns", {})
|
||||
self._include_patterns: list[str] = url_patterns.get("include", [])
|
||||
self._exclude_patterns: list[str] = url_patterns.get("exclude", [])
|
||||
|
||||
# Thread-safe lock for parallel scraping
|
||||
if self.workers > 1:
|
||||
import threading
|
||||
@@ -211,6 +222,12 @@ class DocToSkillConverter:
|
||||
if resume and not dry_run:
|
||||
self.load_checkpoint()
|
||||
|
||||
def _enqueue_url(self, url: str) -> None:
|
||||
"""Add a URL to the pending queue if not already visited or pending (O(1))."""
|
||||
if url not in self.visited_urls and url not in self._pending_set:
|
||||
self._pending_set.add(url)
|
||||
self.pending_urls.append(url)
|
||||
|
||||
def is_valid_url(self, url: str) -> bool:
|
||||
"""Check if URL should be scraped based on patterns.
|
||||
|
||||
@@ -223,14 +240,10 @@ class DocToSkillConverter:
|
||||
if not url.startswith(self.base_url):
|
||||
return False
|
||||
|
||||
# Include patterns
|
||||
includes = self.config.get("url_patterns", {}).get("include", [])
|
||||
if includes and not any(pattern in url for pattern in includes):
|
||||
if self._include_patterns and not any(pattern in url for pattern in self._include_patterns):
|
||||
return False
|
||||
|
||||
# Exclude patterns
|
||||
excludes = self.config.get("url_patterns", {}).get("exclude", [])
|
||||
return not any(pattern in url for pattern in excludes)
|
||||
return not any(pattern in url for pattern in self._exclude_patterns)
|
||||
|
||||
def save_checkpoint(self) -> None:
|
||||
"""Save progress checkpoint"""
|
||||
@@ -264,7 +277,9 @@ class DocToSkillConverter:
|
||||
checkpoint_data = json.load(f)
|
||||
|
||||
self.visited_urls = set(checkpoint_data["visited_urls"])
|
||||
self.pending_urls = deque(checkpoint_data["pending_urls"])
|
||||
pending = checkpoint_data["pending_urls"]
|
||||
self.pending_urls = deque(pending)
|
||||
self._pending_set = set(pending)
|
||||
self.pages_scraped = checkpoint_data["pages_scraped"]
|
||||
|
||||
logger.info("✅ Resumed from checkpoint")
|
||||
@@ -337,11 +352,13 @@ class DocToSkillConverter:
|
||||
|
||||
# Extract links from entire page (always, even if main content not found).
|
||||
# This allows discovery of navigation links outside the main content area.
|
||||
seen_links: set[str] = set()
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = urljoin(url, link["href"])
|
||||
# Strip anchor fragments to avoid treating #anchors as separate pages
|
||||
href = href.split("#")[0]
|
||||
if self.is_valid_url(href) and href not in page["links"]:
|
||||
if href not in seen_links and self.is_valid_url(href):
|
||||
seen_links.add(href)
|
||||
page["links"].append(href)
|
||||
|
||||
# Find main content using shared fallback logic
|
||||
@@ -413,8 +430,6 @@ class DocToSkillConverter:
|
||||
Only .md links are extracted to avoid client-side rendered HTML pages.
|
||||
Anchor fragments (#section) are stripped from links.
|
||||
"""
|
||||
import re
|
||||
|
||||
# Detect if content is actually HTML (some .md URLs return HTML)
|
||||
if content.strip().startswith("<!DOCTYPE") or content.strip().startswith("<html"):
|
||||
return self._extract_html_as_markdown(content, url)
|
||||
@@ -649,8 +664,7 @@ class DocToSkillConverter:
|
||||
|
||||
def clean_text(self, text: str) -> str:
|
||||
"""Clean text content"""
|
||||
text = re.sub(r"\s+", " ", text)
|
||||
return text.strip()
|
||||
return _WHITESPACE_RE.sub(" ", text).strip()
|
||||
|
||||
def save_page(self, page: dict[str, Any]) -> None:
|
||||
"""Save page data (skip pages with empty content)"""
|
||||
@@ -660,8 +674,8 @@ class DocToSkillConverter:
|
||||
return
|
||||
|
||||
url_hash = hashlib.md5(page["url"].encode()).hexdigest()[:10]
|
||||
safe_title = re.sub(r"[^\w\s-]", "", page["title"])[:50]
|
||||
safe_title = re.sub(r"[-\s]+", "_", safe_title)
|
||||
safe_title = _SAFE_TITLE_RE.sub("", page["title"])[:50]
|
||||
safe_title = _SAFE_TITLE_SEP_RE.sub("_", safe_title)
|
||||
|
||||
filename = f"{safe_title}_{url_hash}.json"
|
||||
filepath = os.path.join(self.data_dir, "pages", filename)
|
||||
@@ -695,27 +709,19 @@ class DocToSkillConverter:
|
||||
soup = BeautifulSoup(response.content, "html.parser")
|
||||
page = self.extract_content(soup, url)
|
||||
|
||||
# Thread-safe operations (lock required)
|
||||
if self.workers > 1:
|
||||
with self.lock:
|
||||
logger.info(" %s", url)
|
||||
self.save_page(page)
|
||||
self.pages.append(page)
|
||||
|
||||
# Add new URLs
|
||||
for link in page["links"]:
|
||||
if link not in self.visited_urls and link not in self.pending_urls:
|
||||
self.pending_urls.append(link)
|
||||
else:
|
||||
# Single-threaded mode (no lock needed)
|
||||
# Store results (thread-safe when workers > 1)
|
||||
def _store_results():
|
||||
logger.info(" %s", url)
|
||||
self.save_page(page)
|
||||
self.pages.append(page)
|
||||
|
||||
# Add new URLs
|
||||
for link in page["links"]:
|
||||
if link not in self.visited_urls and link not in self.pending_urls:
|
||||
self.pending_urls.append(link)
|
||||
self._enqueue_url(link)
|
||||
|
||||
if self.workers > 1:
|
||||
with self.lock:
|
||||
_store_results()
|
||||
else:
|
||||
_store_results()
|
||||
|
||||
# Rate limiting
|
||||
rate_limit = self.config.get("rate_limit", DEFAULT_RATE_LIMIT)
|
||||
@@ -766,8 +772,7 @@ class DocToSkillConverter:
|
||||
|
||||
# Add new URLs
|
||||
for link in page["links"]:
|
||||
if link not in self.visited_urls and link not in self.pending_urls:
|
||||
self.pending_urls.append(link)
|
||||
self._enqueue_url(link)
|
||||
|
||||
# Rate limiting
|
||||
rate_limit = self.config.get("rate_limit", DEFAULT_RATE_LIMIT)
|
||||
@@ -924,8 +929,8 @@ class DocToSkillConverter:
|
||||
|
||||
# Filter URLs based on url_patterns config
|
||||
for url in md_urls:
|
||||
if self.is_valid_url(url) and url not in self.visited_urls:
|
||||
self.pending_urls.append(url)
|
||||
if self.is_valid_url(url):
|
||||
self._enqueue_url(url)
|
||||
|
||||
logger.info(
|
||||
" 📋 %d URLs added to crawl queue after filtering",
|
||||
@@ -1010,8 +1015,8 @@ class DocToSkillConverter:
|
||||
|
||||
# Filter URLs based on url_patterns config
|
||||
for url in md_urls:
|
||||
if self.is_valid_url(url) and url not in self.visited_urls:
|
||||
self.pending_urls.append(url)
|
||||
if self.is_valid_url(url):
|
||||
self._enqueue_url(url)
|
||||
|
||||
logger.info(
|
||||
" 📋 %d URLs added to crawl queue after filtering",
|
||||
@@ -1115,8 +1120,8 @@ class DocToSkillConverter:
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = urljoin(url, link["href"])
|
||||
href = href.split("#")[0]
|
||||
if self.is_valid_url(href) and href not in self.visited_urls:
|
||||
self.pending_urls.append(href)
|
||||
if self.is_valid_url(href):
|
||||
self._enqueue_url(href)
|
||||
except Exception as e:
|
||||
# Failed to extract links in fast mode, continue anyway
|
||||
logger.warning("⚠️ Warning: Could not extract links from %s: %s", url, e)
|
||||
@@ -1299,8 +1304,8 @@ class DocToSkillConverter:
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = urljoin(url, link["href"])
|
||||
href = href.split("#")[0]
|
||||
if self.is_valid_url(href) and href not in self.visited_urls:
|
||||
self.pending_urls.append(href)
|
||||
if self.is_valid_url(href):
|
||||
self._enqueue_url(href)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"⚠️ Warning: Could not extract links from %s: %s", url, e
|
||||
@@ -1313,7 +1318,12 @@ class DocToSkillConverter:
|
||||
|
||||
# Wait for batch to complete before continuing
|
||||
if tasks:
|
||||
await asyncio.gather(*tasks, return_exceptions=True)
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
for result in results:
|
||||
if isinstance(result, Exception):
|
||||
logger.error(
|
||||
" ✗ Async task failed: %s: %s", type(result).__name__, result
|
||||
)
|
||||
tasks = []
|
||||
self.pages_scraped = len(self.visited_urls)
|
||||
|
||||
@@ -1331,7 +1341,10 @@ class DocToSkillConverter:
|
||||
|
||||
# Wait for any remaining tasks
|
||||
if tasks:
|
||||
await asyncio.gather(*tasks, return_exceptions=True)
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
for result in results:
|
||||
if isinstance(result, Exception):
|
||||
logger.error(" ✗ Async task failed: %s: %s", type(result).__name__, result)
|
||||
|
||||
if self.dry_run:
|
||||
logger.info("\n✅ Dry run complete: would scrape ~%d pages", len(self.visited_urls))
|
||||
@@ -1356,8 +1369,11 @@ class DocToSkillConverter:
|
||||
"pages": [{"title": p["title"], "url": p["url"]} for p in self.pages],
|
||||
}
|
||||
|
||||
with open(f"{self.data_dir}/summary.json", "w", encoding="utf-8") as f:
|
||||
json.dump(summary, f, indent=2, ensure_ascii=False)
|
||||
try:
|
||||
with open(f"{self.data_dir}/summary.json", "w", encoding="utf-8") as f:
|
||||
json.dump(summary, f, indent=2, ensure_ascii=False)
|
||||
except OSError as e:
|
||||
logger.error(" ✗ Failed to save summary: %s", e)
|
||||
|
||||
def load_scraped_data(self) -> list[dict[str, Any]]:
|
||||
"""Load previously scraped data"""
|
||||
@@ -1395,6 +1411,11 @@ class DocToSkillConverter:
|
||||
categories: dict[str, list[dict[str, Any]]] = {cat: [] for cat in category_defs}
|
||||
categories["other"] = []
|
||||
|
||||
# Pre-lowercase keywords once instead of per-page per-keyword
|
||||
lowered_defs = {
|
||||
cat: [kw.lower() for kw in keywords] for cat, keywords in category_defs.items()
|
||||
}
|
||||
|
||||
for page in pages:
|
||||
url = page["url"].lower()
|
||||
title = page["title"].lower()
|
||||
@@ -1404,11 +1425,10 @@ class DocToSkillConverter:
|
||||
|
||||
categorized = False
|
||||
|
||||
# Match against keywords
|
||||
for cat, keywords in category_defs.items():
|
||||
# Match against pre-lowercased keywords
|
||||
for cat, keywords in lowered_defs.items():
|
||||
score = 0
|
||||
for keyword in keywords:
|
||||
keyword = keyword.lower()
|
||||
if keyword in url:
|
||||
score += 3
|
||||
if keyword in title:
|
||||
@@ -1450,15 +1470,12 @@ class DocToSkillConverter:
|
||||
if count >= 3: # At least 3 pages
|
||||
categories[seg] = [seg]
|
||||
|
||||
# Add common defaults
|
||||
if "tutorial" not in categories and any(
|
||||
"tutorial" in url for url in [p["url"] for p in pages]
|
||||
):
|
||||
# Add common defaults (use pre-built URL list to avoid repeated comprehensions)
|
||||
all_urls = [p["url"] for p in pages]
|
||||
if "tutorial" not in categories and any("tutorial" in url for url in all_urls):
|
||||
categories["tutorials"] = ["tutorial", "guide", "getting-started"]
|
||||
|
||||
if "api" not in categories and any(
|
||||
"api" in url or "reference" in url for url in [p["url"] for p in pages]
|
||||
):
|
||||
if "api" not in categories and any("api" in url or "reference" in url for url in all_urls):
|
||||
categories["api"] = ["api", "reference", "class"]
|
||||
|
||||
return categories
|
||||
|
||||
@@ -15,6 +15,7 @@ Usage:
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import fnmatch
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@@ -664,11 +665,13 @@ class GitHubScraper:
|
||||
def _extract_file_tree_github(self):
|
||||
"""Extract file tree from GitHub API (rate-limited)."""
|
||||
try:
|
||||
contents = self.repo.get_contents("")
|
||||
from collections import deque
|
||||
|
||||
contents = deque(self.repo.get_contents(""))
|
||||
file_tree = []
|
||||
|
||||
while contents:
|
||||
file_content = contents.pop(0)
|
||||
file_content = contents.popleft()
|
||||
|
||||
file_info = {
|
||||
"path": file_content.path,
|
||||
@@ -741,11 +744,10 @@ class GitHubScraper:
|
||||
continue
|
||||
|
||||
# Check if file matches patterns (if specified)
|
||||
if self.file_patterns:
|
||||
import fnmatch
|
||||
|
||||
if not any(fnmatch.fnmatch(file_path, pattern) for pattern in self.file_patterns):
|
||||
continue
|
||||
if self.file_patterns and not any(
|
||||
fnmatch.fnmatch(file_path, pattern) for pattern in self.file_patterns
|
||||
):
|
||||
continue
|
||||
|
||||
# Analyze this file
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user