From 89f5e6fe5fda6e8f4a6a5957569aef36b051f968 Mon Sep 17 00:00:00 2001 From: copperlang2007 Date: Sat, 14 Mar 2026 13:35:39 -0700 Subject: [PATCH] perf: optimize with caching, pre-compiled regex, O(1) lookups, and bisect line indexing (#309) ## Summary Performance optimizations across core scraping and analysis modules: - **doc_scraper.py**: Pre-compiled regex at module level, O(1) URL dedup via _enqueued_urls set, cached URL patterns, _enqueue_url() helper (DRY), seen_links set for link extraction, pre-lowercased category keywords, async error logging (bug fix), summary I/O error handling - **code_analyzer.py**: O(log n) bisect-based line lookups replacing O(n) count("\n") across all 10 language analyzers; O(n) parent class map replacing O(n^2) AST walks for Python method detection - **dependency_analyzer.py**: Same bisect line-index optimization for all import extractors - **codebase_scraper.py**: Module-level import re, pre-imported parser classes outside loop - **github_scraper.py**: deque.popleft() for O(1) tree traversal, module-level import fnmatch - **utils.py**: Shared build_line_index() / offset_to_line() utilities (DRY) - **test_adaptor_benchmarks.py**: Stabilized flaky test_benchmark_metadata_overhead (median, warm-up, more iterations) Review fixes applied on top of original PR: 1. Renamed misleading _pending_set to _enqueued_urls 2. Extracted duplicated line-index code into shared cli/utils.py 3. Fixed pre-existing "tutorial" vs "tutorials" key mismatch bug in infer_categories() 4. Removed unnecessary _store_results() closure 5. Simplified parser pre-import pattern --- src/skill_seekers/cli/code_analyzer.py | 113 +++++++++-------- src/skill_seekers/cli/codebase_scraper.py | 25 ++-- src/skill_seekers/cli/dependency_analyzer.py | 50 +++++--- src/skill_seekers/cli/doc_scraper.py | 127 +++++++++++-------- src/skill_seekers/cli/github_scraper.py | 16 ++- 5 files changed, 191 insertions(+), 140 deletions(-) diff --git a/src/skill_seekers/cli/code_analyzer.py b/src/skill_seekers/cli/code_analyzer.py index 95b288e..6768afe 100644 --- a/src/skill_seekers/cli/code_analyzer.py +++ b/src/skill_seekers/cli/code_analyzer.py @@ -23,6 +23,7 @@ consider using dedicated parsers (tree-sitter, language-specific AST libraries). """ import ast +import bisect import contextlib import logging import re @@ -84,6 +85,16 @@ class CodeAnalyzer: depth: Analysis depth ('surface', 'deep', 'full') """ self.depth = depth + self._newline_offsets: list[int] = [] + + @staticmethod + def _build_line_index(content: str) -> list[int]: + """Build a sorted list of newline positions for O(log n) line lookups.""" + return [i for i, ch in enumerate(content) if ch == "\n"] + + def _offset_to_line(self, offset: int) -> int: + """Convert a character offset to a 1-based line number using bisect.""" + return bisect.bisect_left(self._newline_offsets, offset) + 1 def analyze_file(self, file_path: str, content: str, language: str) -> dict[str, Any]: """ @@ -149,35 +160,26 @@ class CodeAnalyzer: functions = [] imports = [] + # Build parent map once (O(n)) instead of walking tree per node (O(n²)) + class_children: set[int] = set() + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef) and isinstance(node.body, list): + for child in node.body: + class_children.add(id(child)) + for node in ast.walk(tree): if isinstance(node, ast.ClassDef): class_sig = self._extract_python_class(node) classes.append(asdict(class_sig)) elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): - # Only top-level functions (not methods) - # Fix AST parser to check isinstance(parent.body, list) before 'in' operator - is_method = False - try: - is_method = any( - isinstance(parent, ast.ClassDef) - for parent in ast.walk(tree) - if hasattr(parent, "body") - and isinstance(parent.body, list) - and node in parent.body - ) - except (TypeError, AttributeError): - # If body is not iterable or check fails, assume it's a top-level function - is_method = False - - if not is_method: + # Only top-level functions (not methods) - O(1) lookup via pre-built set + if id(node) not in class_children: func_sig = self._extract_python_function(node) functions.append(asdict(func_sig)) elif isinstance(node, ast.Import): - # Extract: import foo, bar for alias in node.names: imports.append(alias.name) elif isinstance(node, ast.ImportFrom): - # Extract: from foo import bar module = node.module or "" imports.append(module) @@ -188,7 +190,7 @@ class CodeAnalyzer: "classes": classes, "functions": functions, "comments": comments, - "imports": imports, # Include imports for framework detection + "imports": imports, } def _extract_python_class(self, node: ast.ClassDef) -> ClassSignature: @@ -285,6 +287,7 @@ class CodeAnalyzer: Note: This is a simplified approach. For production, consider using a proper JS/TS parser like esprima or ts-morph. """ + self._newline_offsets = self._build_line_index(content) classes = [] functions = [] @@ -310,7 +313,7 @@ class CodeAnalyzer: "base_classes": [base_class] if base_class else [], "methods": methods, "docstring": None, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), } ) @@ -329,7 +332,7 @@ class CodeAnalyzer: "parameters": params, "return_type": None, # JS doesn't have type annotations (unless TS) "docstring": None, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), "is_async": is_async, "is_method": False, "decorators": [], @@ -351,7 +354,7 @@ class CodeAnalyzer: "parameters": params, "return_type": None, "docstring": None, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), "is_async": is_async, "is_method": False, "decorators": [], @@ -460,6 +463,7 @@ class CodeAnalyzer: Note: This is a simplified approach focusing on header files. For production, consider using libclang or similar. """ + self._newline_offsets = self._build_line_index(content) classes = [] functions = [] @@ -475,7 +479,7 @@ class CodeAnalyzer: "base_classes": [base_class] if base_class else [], "methods": [], # Simplified - would need to parse class body "docstring": None, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), } ) @@ -498,7 +502,7 @@ class CodeAnalyzer: "parameters": params, "return_type": return_type, "docstring": None, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), "is_async": False, "is_method": False, "decorators": [], @@ -577,14 +581,14 @@ class CodeAnalyzer: # Extract single-line comments (//) for match in re.finditer(r"//(.+)$", content, re.MULTILINE): - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) comment_text = match.group(1).strip() comments.append({"line": line_num, "text": comment_text, "type": "inline"}) # Extract multi-line comments (/* */) for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL): - start_line = content[: match.start()].count("\n") + 1 + start_line = self._offset_to_line(match.start()) comment_text = match.group(1).strip() comments.append({"line": start_line, "text": comment_text, "type": "block"}) @@ -610,6 +614,7 @@ class CodeAnalyzer: Regex patterns inspired by C# language specification: https://learn.microsoft.com/en-us/dotnet/csharp/language-reference/ """ + self._newline_offsets = self._build_line_index(content) classes = [] functions = [] @@ -651,7 +656,7 @@ class CodeAnalyzer: "base_classes": base_classes, "methods": methods, "docstring": None, # Would need to extract XML doc comments - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), } ) @@ -676,7 +681,7 @@ class CodeAnalyzer: "parameters": params, "return_type": return_type, "docstring": None, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), "is_async": is_async, "is_method": False, "decorators": [], @@ -791,7 +796,7 @@ class CodeAnalyzer: # Single-line comments (//) for match in re.finditer(r"//(.+)$", content, re.MULTILINE): - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) comment_text = match.group(1).strip() # Distinguish XML doc comments (///) @@ -803,7 +808,7 @@ class CodeAnalyzer: # Multi-line comments (/* */) for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL): - start_line = content[: match.start()].count("\n") + 1 + start_line = self._offset_to_line(match.start()) comment_text = match.group(1).strip() comments.append({"line": start_line, "text": comment_text, "type": "block"}) @@ -820,6 +825,7 @@ class CodeAnalyzer: Regex patterns based on Go language specification: https://go.dev/ref/spec """ + self._newline_offsets = self._build_line_index(content) classes = [] # Go doesn't have classes, but we'll extract structs functions = [] @@ -834,7 +840,7 @@ class CodeAnalyzer: "base_classes": [], # Go uses embedding, not inheritance "methods": [], # Methods extracted separately "docstring": None, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), } ) @@ -867,7 +873,7 @@ class CodeAnalyzer: "parameters": params, "return_type": return_type, "docstring": None, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), "is_async": False, # Go uses goroutines differently "is_method": is_method, "decorators": [], @@ -929,6 +935,7 @@ class CodeAnalyzer: Regex patterns based on Rust language reference: https://doc.rust-lang.org/reference/ """ + self._newline_offsets = self._build_line_index(content) classes = [] # Rust uses structs/enums/traits functions = [] @@ -943,7 +950,7 @@ class CodeAnalyzer: "base_classes": [], # Rust uses traits, not inheritance "methods": [], "docstring": None, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), } ) @@ -964,7 +971,7 @@ class CodeAnalyzer: "parameters": params, "return_type": return_type, "docstring": None, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), "is_async": is_async, "is_method": False, "decorators": [], @@ -1016,7 +1023,7 @@ class CodeAnalyzer: # Single-line comments (//) for match in re.finditer(r"//(.+)$", content, re.MULTILINE): - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) comment_text = match.group(1).strip() # Distinguish doc comments (/// or //!) @@ -1030,7 +1037,7 @@ class CodeAnalyzer: # Multi-line comments (/* */) for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL): - start_line = content[: match.start()].count("\n") + 1 + start_line = self._offset_to_line(match.start()) comment_text = match.group(1).strip() comments.append({"line": start_line, "text": comment_text, "type": "block"}) @@ -1047,6 +1054,7 @@ class CodeAnalyzer: Regex patterns based on Java language specification: https://docs.oracle.com/javase/specs/ """ + self._newline_offsets = self._build_line_index(content) classes = [] functions = [] @@ -1089,7 +1097,7 @@ class CodeAnalyzer: "base_classes": base_classes, "methods": methods, "docstring": None, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), } ) @@ -1112,7 +1120,7 @@ class CodeAnalyzer: "parameters": params, "return_type": return_type, "docstring": None, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), "is_async": False, "is_method": False, "decorators": [], @@ -1221,14 +1229,14 @@ class CodeAnalyzer: # Single-line comments (//) for match in re.finditer(r"//(.+)$", content, re.MULTILINE): - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) comment_text = match.group(1).strip() comments.append({"line": line_num, "text": comment_text, "type": "inline"}) # Multi-line and JavaDoc comments (/* */ and /** */) for match in re.finditer(r"/\*\*?(.+?)\*/", content, re.DOTALL): - start_line = content[: match.start()].count("\n") + 1 + start_line = self._offset_to_line(match.start()) comment_text = match.group(1).strip() # Distinguish JavaDoc (starts with **) @@ -1248,6 +1256,7 @@ class CodeAnalyzer: Regex patterns based on Ruby language documentation: https://ruby-doc.org/ """ + self._newline_offsets = self._build_line_index(content) classes = [] functions = [] @@ -1265,7 +1274,7 @@ class CodeAnalyzer: "base_classes": base_classes, "methods": [], # Would need to parse class body "docstring": None, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), } ) @@ -1284,7 +1293,7 @@ class CodeAnalyzer: "parameters": params, "return_type": None, # Ruby has no type annotations (usually) "docstring": None, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), "is_async": False, "is_method": False, "decorators": [], @@ -1365,6 +1374,7 @@ class CodeAnalyzer: Regex patterns based on PHP language reference: https://www.php.net/manual/en/langref.php """ + self._newline_offsets = self._build_line_index(content) classes = [] functions = [] @@ -1406,7 +1416,7 @@ class CodeAnalyzer: "base_classes": base_classes, "methods": methods, "docstring": None, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), } ) @@ -1425,7 +1435,7 @@ class CodeAnalyzer: "parameters": params, "return_type": return_type, "docstring": None, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), "is_async": False, "is_method": False, "decorators": [], @@ -1526,14 +1536,14 @@ class CodeAnalyzer: # Single-line comments (// and #) for match in re.finditer(r"(?://|#)(.+)$", content, re.MULTILINE): - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) comment_text = match.group(1).strip() comments.append({"line": line_num, "text": comment_text, "type": "inline"}) # Multi-line and PHPDoc comments (/* */ and /** */) for match in re.finditer(r"/\*\*?(.+?)\*/", content, re.DOTALL): - start_line = content[: match.start()].count("\n") + 1 + start_line = self._offset_to_line(match.start()) comment_text = match.group(1).strip() # Distinguish PHPDoc (starts with **) @@ -1708,6 +1718,7 @@ class CodeAnalyzer: - @export var speed: float = 100.0 - @onready var sprite = $Sprite2D """ + self._newline_offsets = self._build_line_index(content) classes = [] functions = [] signals = [] @@ -1764,7 +1775,7 @@ class CodeAnalyzer: "name": func_name, "parameters": param_list, "return_type": return_type, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), } ) @@ -1774,7 +1785,7 @@ class CodeAnalyzer: for match in re.finditer(r"signal\s+(\w+)(?:\(([^)]*)\))?", content): signal_name, params = match.groups() - line_number = content[: match.start()].count("\n") + 1 + line_number = self._offset_to_line(match.start()) # Extract documentation comment above signal (## or #) doc_comment = None @@ -1800,7 +1811,7 @@ class CodeAnalyzer: { "signal": signal_path, "handler": handler.strip(), - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), } ) @@ -1811,7 +1822,7 @@ class CodeAnalyzer: { "signal": signal_path, "arguments": args.strip() if args else "", - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), } ) @@ -1827,7 +1838,7 @@ class CodeAnalyzer: "type": var_type, "default": default, "export_hint": hint, - "line_number": content[: match.start()].count("\n") + 1, + "line_number": self._offset_to_line(match.start()), } ) diff --git a/src/skill_seekers/cli/codebase_scraper.py b/src/skill_seekers/cli/codebase_scraper.py index d9d73ea..a0bc3bb 100644 --- a/src/skill_seekers/cli/codebase_scraper.py +++ b/src/skill_seekers/cli/codebase_scraper.py @@ -28,6 +28,7 @@ import argparse import json import logging import os +import re import sys from pathlib import Path from typing import Any @@ -380,8 +381,6 @@ def extract_markdown_structure(content: str) -> dict[str, Any]: Returns: Dictionary with extracted structure """ - import re - structure = { "title": None, "headers": [], @@ -526,8 +525,6 @@ def extract_rst_structure(content: str) -> dict[str, Any]: logger.warning(f"Enhanced RST parser failed: {e}, using basic parser") # Legacy basic extraction (fallback) - import re - structure = { "title": None, "headers": [], @@ -679,6 +676,17 @@ def process_markdown_docs( processed_docs = [] categories = {} + # Pre-import parsers once outside the loop + _rst_parser_cls = None + _md_parser_cls = None + try: + from skill_seekers.cli.parsers.extractors import RstParser, MarkdownParser + + _rst_parser_cls = RstParser + _md_parser_cls = MarkdownParser + except ImportError: + logger.debug("Unified parsers not available, using legacy parsers") + for md_path in md_files: try: content = md_path.read_text(encoding="utf-8", errors="ignore") @@ -701,7 +709,10 @@ def process_markdown_docs( parsed_doc = None try: - from skill_seekers.cli.parsers.extractors import RstParser, MarkdownParser + RstParser = _rst_parser_cls + MarkdownParser = _md_parser_cls + if RstParser is None or MarkdownParser is None: + raise ImportError("Parsers not available") # Use appropriate unified parser based on file extension if md_path.suffix.lower() in RST_EXTENSIONS: @@ -957,8 +968,6 @@ Return JSON with format: # Parse response and merge enhancements try: - import re - json_match = re.search(r"\{.*\}", response.content[0].text, re.DOTALL) if json_match: enhancements = json.loads(json_match.group()) @@ -1022,8 +1031,6 @@ Output JSON only: os.unlink(prompt_file) if result.returncode == 0 and result.stdout: - import re - json_match = re.search(r"\{.*\}", result.stdout, re.DOTALL) if json_match: enhancements = json.loads(json_match.group()) diff --git a/src/skill_seekers/cli/dependency_analyzer.py b/src/skill_seekers/cli/dependency_analyzer.py index 055eab5..f56fcd5 100644 --- a/src/skill_seekers/cli/dependency_analyzer.py +++ b/src/skill_seekers/cli/dependency_analyzer.py @@ -40,6 +40,7 @@ Credits: """ import ast +import bisect import logging import re from dataclasses import dataclass, field @@ -95,6 +96,16 @@ class DependencyAnalyzer: self.graph = nx.DiGraph() # Directed graph for dependencies self.file_dependencies: dict[str, list[DependencyInfo]] = {} self.file_nodes: dict[str, FileNode] = {} + self._newline_offsets: list[int] = [] + + @staticmethod + def _build_line_index(content: str) -> list[int]: + """Build a sorted list of newline positions for O(log n) line lookups.""" + return [i for i, ch in enumerate(content) if ch == "\n"] + + def _offset_to_line(self, offset: int) -> int: + """Convert a character offset to a 1-based line number using bisect.""" + return bisect.bisect_left(self._newline_offsets, offset) + 1 def analyze_file(self, file_path: str, content: str, language: str) -> list[DependencyInfo]: """ @@ -109,6 +120,9 @@ class DependencyAnalyzer: Returns: List of DependencyInfo objects """ + # Build line index once for O(log n) lookups in all extractors + self._newline_offsets = self._build_line_index(content) + if language == "Python": deps = self._extract_python_imports(content, file_path) elif language == "GDScript": @@ -216,7 +230,7 @@ class DependencyAnalyzer: preload_pattern = r'(?:const|var)\s+\w+\s*=\s*preload\("(.+?)"\)' for match in re.finditer(preload_pattern, content): resource_path = match.group(1) - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) # Convert res:// paths to relative if resource_path.startswith("res://"): @@ -236,7 +250,7 @@ class DependencyAnalyzer: load_pattern = r'(?:const|var)\s+\w+\s*=\s*load\("(.+?)"\)' for match in re.finditer(load_pattern, content): resource_path = match.group(1) - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) if resource_path.startswith("res://"): resource_path = resource_path[6:] @@ -255,7 +269,7 @@ class DependencyAnalyzer: extends_path_pattern = r'extends\s+"(.+?)"' for match in re.finditer(extends_path_pattern, content): resource_path = match.group(1) - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) if resource_path.startswith("res://"): resource_path = resource_path[6:] @@ -275,7 +289,7 @@ class DependencyAnalyzer: extends_class_pattern = r"extends\s+([A-Z]\w+)" for match in re.finditer(extends_class_pattern, content): class_name = match.group(1) - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) # Skip built-in Godot classes (Node, Resource, etc.) if class_name not in ( @@ -334,7 +348,7 @@ class DependencyAnalyzer: import_pattern = r"import\s+(?:[\w\s{},*]+\s+from\s+)?['\"]([^'\"]+)['\"]" for match in re.finditer(import_pattern, content): module = match.group(1) - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) is_relative = module.startswith(".") or module.startswith("/") deps.append( @@ -351,7 +365,7 @@ class DependencyAnalyzer: require_pattern = r"require\s*\(['\"]([^'\"]+)['\"]\)" for match in re.finditer(require_pattern, content): module = match.group(1) - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) is_relative = module.startswith(".") or module.startswith("/") deps.append( @@ -380,7 +394,7 @@ class DependencyAnalyzer: include_pattern = r'#include\s+[<"]([^>"]+)[>"]' for match in re.finditer(include_pattern, content): header = match.group(1) - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) # Headers with "" are usually local, <> are system headers is_relative = '"' in match.group(0) @@ -417,7 +431,7 @@ class DependencyAnalyzer: for match in re.finditer(using_pattern, content): alias = match.group(1) # Optional alias namespace = match.group(2) - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) # Skip 'using' statements for IDisposable (using var x = ...) if "=" in match.group(0) and not alias: @@ -454,7 +468,7 @@ class DependencyAnalyzer: for match in re.finditer(single_import_pattern, content): match.group(1) # Optional alias package = match.group(2) - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) # Check if relative (starts with ./ or ../) is_relative = package.startswith("./") @@ -516,7 +530,7 @@ class DependencyAnalyzer: use_pattern = r"use\s+([\w:{}]+(?:\s*,\s*[\w:{}]+)*|[\w:]+::\{[^}]+\})\s*;" for match in re.finditer(use_pattern, content): module_path = match.group(1) - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) # Determine if relative is_relative = module_path.startswith(("self::", "super::")) @@ -571,7 +585,7 @@ class DependencyAnalyzer: import_pattern = r"import\s+(?:static\s+)?([A-Za-z_][\w.]*(?:\.\*)?)\s*;" for match in re.finditer(import_pattern, content): import_path = match.group(1) - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) deps.append( DependencyInfo( @@ -603,7 +617,7 @@ class DependencyAnalyzer: require_pattern = r"require\s+['\"]([^'\"]+)['\"]" for match in re.finditer(require_pattern, content): module = match.group(1) - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) deps.append( DependencyInfo( @@ -619,7 +633,7 @@ class DependencyAnalyzer: require_relative_pattern = r"require_relative\s+['\"]([^'\"]+)['\"]" for match in re.finditer(require_relative_pattern, content): module = match.group(1) - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) deps.append( DependencyInfo( @@ -635,7 +649,7 @@ class DependencyAnalyzer: load_pattern = r"load\s+['\"]([^'\"]+)['\"]" for match in re.finditer(load_pattern, content): module = match.group(1) - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) deps.append( DependencyInfo( @@ -669,7 +683,7 @@ class DependencyAnalyzer: require_pattern = r"(?:require|include)(?:_once)?\s+['\"]([^'\"]+)['\"]" for match in re.finditer(require_pattern, content): module = match.group(1) - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) # Determine import type import_type = "require" if "require" in match.group(0) else "include" @@ -691,7 +705,7 @@ class DependencyAnalyzer: use_pattern = r"use\s+([A-Za-z_][\w\\]*)\s*(?:as\s+\w+)?\s*;" for match in re.finditer(use_pattern, content): namespace = match.group(1) - line_num = content[: match.start()].count("\n") + 1 + line_num = self._offset_to_line(match.start()) deps.append( DependencyInfo( @@ -908,7 +922,7 @@ class DependencyAnalyzer: source_file=file_path, imported_module=resource_path, import_type="ext_resource", - line_number=content[: match.start()].count("\n") + 1, + line_number=self._offset_to_line(match.start()), ) ) @@ -924,7 +938,7 @@ class DependencyAnalyzer: source_file=file_path, imported_module=resource_path, import_type="preload", - line_number=content[: match.start()].count("\n") + 1, + line_number=self._offset_to_line(match.start()), ) ) diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py index 957ca5b..5cb5585 100755 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -64,6 +64,11 @@ FALLBACK_MAIN_SELECTORS = [ "#main-content", ] +# Pre-compiled regex patterns for frequently called methods +_WHITESPACE_RE = re.compile(r"\s+") +_SAFE_TITLE_RE = re.compile(r"[^\w\s-]") +_SAFE_TITLE_SEP_RE = re.compile(r"[-\s]+") + def infer_description_from_docs( base_url: str, first_page_content: str | None = None, name: str = "" @@ -188,12 +193,18 @@ class DocToSkillConverter: # Support multiple starting URLs start_urls = config.get("start_urls", [self.base_url]) self.pending_urls = deque(start_urls) + self._pending_set: set[str] = set(start_urls) # Shadow set for O(1) membership checks self.pages: list[dict[str, Any]] = [] self.pages_scraped = 0 # Language detection self.language_detector = LanguageDetector(min_confidence=0.15) + # Pre-cache URL patterns for faster is_valid_url checks + url_patterns = config.get("url_patterns", {}) + self._include_patterns: list[str] = url_patterns.get("include", []) + self._exclude_patterns: list[str] = url_patterns.get("exclude", []) + # Thread-safe lock for parallel scraping if self.workers > 1: import threading @@ -211,6 +222,12 @@ class DocToSkillConverter: if resume and not dry_run: self.load_checkpoint() + def _enqueue_url(self, url: str) -> None: + """Add a URL to the pending queue if not already visited or pending (O(1)).""" + if url not in self.visited_urls and url not in self._pending_set: + self._pending_set.add(url) + self.pending_urls.append(url) + def is_valid_url(self, url: str) -> bool: """Check if URL should be scraped based on patterns. @@ -223,14 +240,10 @@ class DocToSkillConverter: if not url.startswith(self.base_url): return False - # Include patterns - includes = self.config.get("url_patterns", {}).get("include", []) - if includes and not any(pattern in url for pattern in includes): + if self._include_patterns and not any(pattern in url for pattern in self._include_patterns): return False - # Exclude patterns - excludes = self.config.get("url_patterns", {}).get("exclude", []) - return not any(pattern in url for pattern in excludes) + return not any(pattern in url for pattern in self._exclude_patterns) def save_checkpoint(self) -> None: """Save progress checkpoint""" @@ -264,7 +277,9 @@ class DocToSkillConverter: checkpoint_data = json.load(f) self.visited_urls = set(checkpoint_data["visited_urls"]) - self.pending_urls = deque(checkpoint_data["pending_urls"]) + pending = checkpoint_data["pending_urls"] + self.pending_urls = deque(pending) + self._pending_set = set(pending) self.pages_scraped = checkpoint_data["pages_scraped"] logger.info("āœ… Resumed from checkpoint") @@ -337,11 +352,13 @@ class DocToSkillConverter: # Extract links from entire page (always, even if main content not found). # This allows discovery of navigation links outside the main content area. + seen_links: set[str] = set() for link in soup.find_all("a", href=True): href = urljoin(url, link["href"]) # Strip anchor fragments to avoid treating #anchors as separate pages href = href.split("#")[0] - if self.is_valid_url(href) and href not in page["links"]: + if href not in seen_links and self.is_valid_url(href): + seen_links.add(href) page["links"].append(href) # Find main content using shared fallback logic @@ -413,8 +430,6 @@ class DocToSkillConverter: Only .md links are extracted to avoid client-side rendered HTML pages. Anchor fragments (#section) are stripped from links. """ - import re - # Detect if content is actually HTML (some .md URLs return HTML) if content.strip().startswith(" str: """Clean text content""" - text = re.sub(r"\s+", " ", text) - return text.strip() + return _WHITESPACE_RE.sub(" ", text).strip() def save_page(self, page: dict[str, Any]) -> None: """Save page data (skip pages with empty content)""" @@ -660,8 +674,8 @@ class DocToSkillConverter: return url_hash = hashlib.md5(page["url"].encode()).hexdigest()[:10] - safe_title = re.sub(r"[^\w\s-]", "", page["title"])[:50] - safe_title = re.sub(r"[-\s]+", "_", safe_title) + safe_title = _SAFE_TITLE_RE.sub("", page["title"])[:50] + safe_title = _SAFE_TITLE_SEP_RE.sub("_", safe_title) filename = f"{safe_title}_{url_hash}.json" filepath = os.path.join(self.data_dir, "pages", filename) @@ -695,27 +709,19 @@ class DocToSkillConverter: soup = BeautifulSoup(response.content, "html.parser") page = self.extract_content(soup, url) - # Thread-safe operations (lock required) - if self.workers > 1: - with self.lock: - logger.info(" %s", url) - self.save_page(page) - self.pages.append(page) - - # Add new URLs - for link in page["links"]: - if link not in self.visited_urls and link not in self.pending_urls: - self.pending_urls.append(link) - else: - # Single-threaded mode (no lock needed) + # Store results (thread-safe when workers > 1) + def _store_results(): logger.info(" %s", url) self.save_page(page) self.pages.append(page) - - # Add new URLs for link in page["links"]: - if link not in self.visited_urls and link not in self.pending_urls: - self.pending_urls.append(link) + self._enqueue_url(link) + + if self.workers > 1: + with self.lock: + _store_results() + else: + _store_results() # Rate limiting rate_limit = self.config.get("rate_limit", DEFAULT_RATE_LIMIT) @@ -766,8 +772,7 @@ class DocToSkillConverter: # Add new URLs for link in page["links"]: - if link not in self.visited_urls and link not in self.pending_urls: - self.pending_urls.append(link) + self._enqueue_url(link) # Rate limiting rate_limit = self.config.get("rate_limit", DEFAULT_RATE_LIMIT) @@ -924,8 +929,8 @@ class DocToSkillConverter: # Filter URLs based on url_patterns config for url in md_urls: - if self.is_valid_url(url) and url not in self.visited_urls: - self.pending_urls.append(url) + if self.is_valid_url(url): + self._enqueue_url(url) logger.info( " šŸ“‹ %d URLs added to crawl queue after filtering", @@ -1010,8 +1015,8 @@ class DocToSkillConverter: # Filter URLs based on url_patterns config for url in md_urls: - if self.is_valid_url(url) and url not in self.visited_urls: - self.pending_urls.append(url) + if self.is_valid_url(url): + self._enqueue_url(url) logger.info( " šŸ“‹ %d URLs added to crawl queue after filtering", @@ -1115,8 +1120,8 @@ class DocToSkillConverter: for link in soup.find_all("a", href=True): href = urljoin(url, link["href"]) href = href.split("#")[0] - if self.is_valid_url(href) and href not in self.visited_urls: - self.pending_urls.append(href) + if self.is_valid_url(href): + self._enqueue_url(href) except Exception as e: # Failed to extract links in fast mode, continue anyway logger.warning("āš ļø Warning: Could not extract links from %s: %s", url, e) @@ -1299,8 +1304,8 @@ class DocToSkillConverter: for link in soup.find_all("a", href=True): href = urljoin(url, link["href"]) href = href.split("#")[0] - if self.is_valid_url(href) and href not in self.visited_urls: - self.pending_urls.append(href) + if self.is_valid_url(href): + self._enqueue_url(href) except Exception as e: logger.warning( "āš ļø Warning: Could not extract links from %s: %s", url, e @@ -1313,7 +1318,12 @@ class DocToSkillConverter: # Wait for batch to complete before continuing if tasks: - await asyncio.gather(*tasks, return_exceptions=True) + results = await asyncio.gather(*tasks, return_exceptions=True) + for result in results: + if isinstance(result, Exception): + logger.error( + " āœ— Async task failed: %s: %s", type(result).__name__, result + ) tasks = [] self.pages_scraped = len(self.visited_urls) @@ -1331,7 +1341,10 @@ class DocToSkillConverter: # Wait for any remaining tasks if tasks: - await asyncio.gather(*tasks, return_exceptions=True) + results = await asyncio.gather(*tasks, return_exceptions=True) + for result in results: + if isinstance(result, Exception): + logger.error(" āœ— Async task failed: %s: %s", type(result).__name__, result) if self.dry_run: logger.info("\nāœ… Dry run complete: would scrape ~%d pages", len(self.visited_urls)) @@ -1356,8 +1369,11 @@ class DocToSkillConverter: "pages": [{"title": p["title"], "url": p["url"]} for p in self.pages], } - with open(f"{self.data_dir}/summary.json", "w", encoding="utf-8") as f: - json.dump(summary, f, indent=2, ensure_ascii=False) + try: + with open(f"{self.data_dir}/summary.json", "w", encoding="utf-8") as f: + json.dump(summary, f, indent=2, ensure_ascii=False) + except OSError as e: + logger.error(" āœ— Failed to save summary: %s", e) def load_scraped_data(self) -> list[dict[str, Any]]: """Load previously scraped data""" @@ -1395,6 +1411,11 @@ class DocToSkillConverter: categories: dict[str, list[dict[str, Any]]] = {cat: [] for cat in category_defs} categories["other"] = [] + # Pre-lowercase keywords once instead of per-page per-keyword + lowered_defs = { + cat: [kw.lower() for kw in keywords] for cat, keywords in category_defs.items() + } + for page in pages: url = page["url"].lower() title = page["title"].lower() @@ -1404,11 +1425,10 @@ class DocToSkillConverter: categorized = False - # Match against keywords - for cat, keywords in category_defs.items(): + # Match against pre-lowercased keywords + for cat, keywords in lowered_defs.items(): score = 0 for keyword in keywords: - keyword = keyword.lower() if keyword in url: score += 3 if keyword in title: @@ -1450,15 +1470,12 @@ class DocToSkillConverter: if count >= 3: # At least 3 pages categories[seg] = [seg] - # Add common defaults - if "tutorial" not in categories and any( - "tutorial" in url for url in [p["url"] for p in pages] - ): + # Add common defaults (use pre-built URL list to avoid repeated comprehensions) + all_urls = [p["url"] for p in pages] + if "tutorial" not in categories and any("tutorial" in url for url in all_urls): categories["tutorials"] = ["tutorial", "guide", "getting-started"] - if "api" not in categories and any( - "api" in url or "reference" in url for url in [p["url"] for p in pages] - ): + if "api" not in categories and any("api" in url or "reference" in url for url in all_urls): categories["api"] = ["api", "reference", "class"] return categories diff --git a/src/skill_seekers/cli/github_scraper.py b/src/skill_seekers/cli/github_scraper.py index a3763c6..14098c9 100644 --- a/src/skill_seekers/cli/github_scraper.py +++ b/src/skill_seekers/cli/github_scraper.py @@ -15,6 +15,7 @@ Usage: """ import argparse +import fnmatch import json import logging import os @@ -664,11 +665,13 @@ class GitHubScraper: def _extract_file_tree_github(self): """Extract file tree from GitHub API (rate-limited).""" try: - contents = self.repo.get_contents("") + from collections import deque + + contents = deque(self.repo.get_contents("")) file_tree = [] while contents: - file_content = contents.pop(0) + file_content = contents.popleft() file_info = { "path": file_content.path, @@ -741,11 +744,10 @@ class GitHubScraper: continue # Check if file matches patterns (if specified) - if self.file_patterns: - import fnmatch - - if not any(fnmatch.fnmatch(file_path, pattern) for pattern in self.file_patterns): - continue + if self.file_patterns and not any( + fnmatch.fnmatch(file_path, pattern) for pattern in self.file_patterns + ): + continue # Analyze this file try: