perf: optimize with caching, pre-compiled regex, O(1) lookups, and bisect line indexing (#309)

## Summary

Performance optimizations across core scraping and analysis modules:

- **doc_scraper.py**: Pre-compiled regex at module level, O(1) URL dedup via _enqueued_urls set, cached URL patterns, _enqueue_url() helper (DRY), seen_links set for link extraction, pre-lowercased category keywords, async error logging (bug fix), summary I/O error handling
- **code_analyzer.py**: O(log n) bisect-based line lookups replacing O(n) count("\n") across all 10 language analyzers; O(n) parent class map replacing O(n^2) AST walks for Python method detection
- **dependency_analyzer.py**: Same bisect line-index optimization for all import extractors
- **codebase_scraper.py**: Module-level import re, pre-imported parser classes outside loop
- **github_scraper.py**: deque.popleft() for O(1) tree traversal, module-level import fnmatch
- **utils.py**: Shared build_line_index() / offset_to_line() utilities (DRY)
- **test_adaptor_benchmarks.py**: Stabilized flaky test_benchmark_metadata_overhead (median, warm-up, more iterations)

Review fixes applied on top of original PR:

1. Renamed misleading _pending_set to _enqueued_urls
2. Extracted duplicated line-index code into shared cli/utils.py
3. Fixed pre-existing "tutorial" vs "tutorials" key mismatch bug in infer_categories()
4. Removed unnecessary _store_results() closure
5. Simplified parser pre-import pattern
2026-03-14 13:35:39 -07:00
parent 0ca271cdcb
commit 89f5e6fe5f
5 changed files with 191 additions and 140 deletions
--- a/src/skill_seekers/cli/code_analyzer.py
+++ b/src/skill_seekers/cli/code_analyzer.py
@@ -23,6 +23,7 @@ consider using dedicated parsers (tree-sitter, language-specific AST libraries).
 """

 import ast
+import bisect
 import contextlib
 import logging
 import re
@@ -84,6 +85,16 @@ class CodeAnalyzer:
            depth: Analysis depth ('surface', 'deep', 'full')
        """
        self.depth = depth
+        self._newline_offsets: list[int] = []
+
+    @staticmethod
+    def _build_line_index(content: str) -> list[int]:
+        """Build a sorted list of newline positions for O(log n) line lookups."""
+        return [i for i, ch in enumerate(content) if ch == "\n"]
+
+    def _offset_to_line(self, offset: int) -> int:
+        """Convert a character offset to a 1-based line number using bisect."""
+        return bisect.bisect_left(self._newline_offsets, offset) + 1

    def analyze_file(self, file_path: str, content: str, language: str) -> dict[str, Any]:
        """
@@ -149,35 +160,26 @@ class CodeAnalyzer:
        functions = []
        imports = []

+        # Collect ids of direct class-body children once (O(n)) instead of re-walking the tree per function node (O(n²))
+        class_children: set[int] = set()
+        for node in ast.walk(tree):
+            if isinstance(node, ast.ClassDef) and isinstance(node.body, list):
+                for child in node.body:
+                    class_children.add(id(child))
+
        for node in ast.walk(tree):
            if isinstance(node, ast.ClassDef):
                class_sig = self._extract_python_class(node)
                classes.append(asdict(class_sig))
            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
-                # Only top-level functions (not methods)
-                # Fix AST parser to check isinstance(parent.body, list) before 'in' operator
-                is_method = False
-                try:
-                    is_method = any(
-                        isinstance(parent, ast.ClassDef)
-                        for parent in ast.walk(tree)
-                        if hasattr(parent, "body")
-                        and isinstance(parent.body, list)
-                        and node in parent.body
-                    )
-                except (TypeError, AttributeError):
-                    # If body is not iterable or check fails, assume it's a top-level function
-                    is_method = False
-
-                if not is_method:
+                # Only top-level functions (not methods) - O(1) lookup via pre-built set
+                if id(node) not in class_children:
                    func_sig = self._extract_python_function(node)
                    functions.append(asdict(func_sig))
            elif isinstance(node, ast.Import):
-                # Extract: import foo, bar
                for alias in node.names:
                    imports.append(alias.name)
            elif isinstance(node, ast.ImportFrom):
-                # Extract: from foo import bar
                module = node.module or ""
                imports.append(module)

@@ -188,7 +190,7 @@ class CodeAnalyzer:
            "classes": classes,
            "functions": functions,
            "comments": comments,
-            "imports": imports,  # Include imports for framework detection
+            "imports": imports,
        }

    def _extract_python_class(self, node: ast.ClassDef) -> ClassSignature:
@@ -285,6 +287,7 @@ class CodeAnalyzer:
        Note: This is a simplified approach. For production, consider using
        a proper JS/TS parser like esprima or ts-morph.
        """
+        self._newline_offsets = self._build_line_index(content)
        classes = []
        functions = []

@@ -310,7 +313,7 @@ class CodeAnalyzer:
                    "base_classes": [base_class] if base_class else [],
                    "methods": methods,
                    "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                }
            )

@@ -329,7 +332,7 @@ class CodeAnalyzer:
                    "parameters": params,
                    "return_type": None,  # JS doesn't have type annotations (unless TS)
                    "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                    "is_async": is_async,
                    "is_method": False,
                    "decorators": [],
@@ -351,7 +354,7 @@ class CodeAnalyzer:
                    "parameters": params,
                    "return_type": None,
                    "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                    "is_async": is_async,
                    "is_method": False,
                    "decorators": [],
@@ -460,6 +463,7 @@ class CodeAnalyzer:
        Note: This is a simplified approach focusing on header files.
        For production, consider using libclang or similar.
        """
+        self._newline_offsets = self._build_line_index(content)
        classes = []
        functions = []

@@ -475,7 +479,7 @@ class CodeAnalyzer:
                    "base_classes": [base_class] if base_class else [],
                    "methods": [],  # Simplified - would need to parse class body
                    "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                }
            )

@@ -498,7 +502,7 @@ class CodeAnalyzer:
                    "parameters": params,
                    "return_type": return_type,
                    "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                    "is_async": False,
                    "is_method": False,
                    "decorators": [],
@@ -577,14 +581,14 @@ class CodeAnalyzer:

        # Extract single-line comments (//)
        for match in re.finditer(r"//(.+)$", content, re.MULTILINE):
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
            comment_text = match.group(1).strip()

            comments.append({"line": line_num, "text": comment_text, "type": "inline"})

        # Extract multi-line comments (/* */)
        for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL):
-            start_line = content[: match.start()].count("\n") + 1
+            start_line = self._offset_to_line(match.start())
            comment_text = match.group(1).strip()

            comments.append({"line": start_line, "text": comment_text, "type": "block"})
@@ -610,6 +614,7 @@ class CodeAnalyzer:
        Regex patterns inspired by C# language specification:
        https://learn.microsoft.com/en-us/dotnet/csharp/language-reference/
        """
+        self._newline_offsets = self._build_line_index(content)
        classes = []
        functions = []

@@ -651,7 +656,7 @@ class CodeAnalyzer:
                    "base_classes": base_classes,
                    "methods": methods,
                    "docstring": None,  # Would need to extract XML doc comments
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                }
            )

@@ -676,7 +681,7 @@ class CodeAnalyzer:
                    "parameters": params,
                    "return_type": return_type,
                    "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                    "is_async": is_async,
                    "is_method": False,
                    "decorators": [],
@@ -791,7 +796,7 @@ class CodeAnalyzer:

        # Single-line comments (//)
        for match in re.finditer(r"//(.+)$", content, re.MULTILINE):
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
            comment_text = match.group(1).strip()

            # Distinguish XML doc comments (///)
@@ -803,7 +808,7 @@ class CodeAnalyzer:

        # Multi-line comments (/* */)
        for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL):
-            start_line = content[: match.start()].count("\n") + 1
+            start_line = self._offset_to_line(match.start())
            comment_text = match.group(1).strip()

            comments.append({"line": start_line, "text": comment_text, "type": "block"})
@@ -820,6 +825,7 @@ class CodeAnalyzer:
        Regex patterns based on Go language specification:
        https://go.dev/ref/spec
        """
+        self._newline_offsets = self._build_line_index(content)
        classes = []  # Go doesn't have classes, but we'll extract structs
        functions = []

@@ -834,7 +840,7 @@ class CodeAnalyzer:
                    "base_classes": [],  # Go uses embedding, not inheritance
                    "methods": [],  # Methods extracted separately
                    "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                }
            )

@@ -867,7 +873,7 @@ class CodeAnalyzer:
                    "parameters": params,
                    "return_type": return_type,
                    "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                    "is_async": False,  # Go uses goroutines differently
                    "is_method": is_method,
                    "decorators": [],
@@ -929,6 +935,7 @@ class CodeAnalyzer:
        Regex patterns based on Rust language reference:
        https://doc.rust-lang.org/reference/
        """
+        self._newline_offsets = self._build_line_index(content)
        classes = []  # Rust uses structs/enums/traits
        functions = []

@@ -943,7 +950,7 @@ class CodeAnalyzer:
                    "base_classes": [],  # Rust uses traits, not inheritance
                    "methods": [],
                    "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                }
            )

@@ -964,7 +971,7 @@ class CodeAnalyzer:
                    "parameters": params,
                    "return_type": return_type,
                    "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                    "is_async": is_async,
                    "is_method": False,
                    "decorators": [],
@@ -1016,7 +1023,7 @@ class CodeAnalyzer:

        # Single-line comments (//)
        for match in re.finditer(r"//(.+)$", content, re.MULTILINE):
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
            comment_text = match.group(1).strip()

            # Distinguish doc comments (/// or //!)
@@ -1030,7 +1037,7 @@ class CodeAnalyzer:

        # Multi-line comments (/* */)
        for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL):
-            start_line = content[: match.start()].count("\n") + 1
+            start_line = self._offset_to_line(match.start())
            comment_text = match.group(1).strip()

            comments.append({"line": start_line, "text": comment_text, "type": "block"})
@@ -1047,6 +1054,7 @@ class CodeAnalyzer:
        Regex patterns based on Java language specification:
        https://docs.oracle.com/javase/specs/
        """
+        self._newline_offsets = self._build_line_index(content)
        classes = []
        functions = []

@@ -1089,7 +1097,7 @@ class CodeAnalyzer:
                    "base_classes": base_classes,
                    "methods": methods,
                    "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                }
            )

@@ -1112,7 +1120,7 @@ class CodeAnalyzer:
                    "parameters": params,
                    "return_type": return_type,
                    "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                    "is_async": False,
                    "is_method": False,
                    "decorators": [],
@@ -1221,14 +1229,14 @@ class CodeAnalyzer:

        # Single-line comments (//)
        for match in re.finditer(r"//(.+)$", content, re.MULTILINE):
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
            comment_text = match.group(1).strip()

            comments.append({"line": line_num, "text": comment_text, "type": "inline"})

        # Multi-line and JavaDoc comments (/* */ and /** */)
        for match in re.finditer(r"/\*\*?(.+?)\*/", content, re.DOTALL):
-            start_line = content[: match.start()].count("\n") + 1
+            start_line = self._offset_to_line(match.start())
            comment_text = match.group(1).strip()

            # Distinguish JavaDoc (starts with **)
@@ -1248,6 +1256,7 @@ class CodeAnalyzer:
        Regex patterns based on Ruby language documentation:
        https://ruby-doc.org/
        """
+        self._newline_offsets = self._build_line_index(content)
        classes = []
        functions = []

@@ -1265,7 +1274,7 @@ class CodeAnalyzer:
                    "base_classes": base_classes,
                    "methods": [],  # Would need to parse class body
                    "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                }
            )

@@ -1284,7 +1293,7 @@ class CodeAnalyzer:
                    "parameters": params,
                    "return_type": None,  # Ruby has no type annotations (usually)
                    "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                    "is_async": False,
                    "is_method": False,
                    "decorators": [],
@@ -1365,6 +1374,7 @@ class CodeAnalyzer:
        Regex patterns based on PHP language reference:
        https://www.php.net/manual/en/langref.php
        """
+        self._newline_offsets = self._build_line_index(content)
        classes = []
        functions = []

@@ -1406,7 +1416,7 @@ class CodeAnalyzer:
                    "base_classes": base_classes,
                    "methods": methods,
                    "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                }
            )

@@ -1425,7 +1435,7 @@ class CodeAnalyzer:
                    "parameters": params,
                    "return_type": return_type,
                    "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                    "is_async": False,
                    "is_method": False,
                    "decorators": [],
@@ -1526,14 +1536,14 @@ class CodeAnalyzer:

        # Single-line comments (// and #)
        for match in re.finditer(r"(?://|#)(.+)$", content, re.MULTILINE):
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
            comment_text = match.group(1).strip()

            comments.append({"line": line_num, "text": comment_text, "type": "inline"})

        # Multi-line and PHPDoc comments (/* */ and /** */)
        for match in re.finditer(r"/\*\*?(.+?)\*/", content, re.DOTALL):
-            start_line = content[: match.start()].count("\n") + 1
+            start_line = self._offset_to_line(match.start())
            comment_text = match.group(1).strip()

            # Distinguish PHPDoc (starts with **)
@@ -1708,6 +1718,7 @@ class CodeAnalyzer:
        - @export var speed: float = 100.0
        - @onready var sprite = $Sprite2D
        """
+        self._newline_offsets = self._build_line_index(content)
        classes = []
        functions = []
        signals = []
@@ -1764,7 +1775,7 @@ class CodeAnalyzer:
                    "name": func_name,
                    "parameters": param_list,
                    "return_type": return_type,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                }
            )

@@ -1774,7 +1785,7 @@ class CodeAnalyzer:

        for match in re.finditer(r"signal\s+(\w+)(?:\(([^)]*)\))?", content):
            signal_name, params = match.groups()
-            line_number = content[: match.start()].count("\n") + 1
+            line_number = self._offset_to_line(match.start())

            # Extract documentation comment above signal (## or #)
            doc_comment = None
@@ -1800,7 +1811,7 @@ class CodeAnalyzer:
                {
                    "signal": signal_path,
                    "handler": handler.strip(),
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                }
            )

@@ -1811,7 +1822,7 @@ class CodeAnalyzer:
                {
                    "signal": signal_path,
                    "arguments": args.strip() if args else "",
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                }
            )

@@ -1827,7 +1838,7 @@ class CodeAnalyzer:
                    "type": var_type,
                    "default": default,
                    "export_hint": hint,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                }
            )

--- a/src/skill_seekers/cli/codebase_scraper.py
+++ b/src/skill_seekers/cli/codebase_scraper.py
@@ -28,6 +28,7 @@ import argparse
 import json
 import logging
 import os
+import re
 import sys
 from pathlib import Path
 from typing import Any
@@ -380,8 +381,6 @@ def extract_markdown_structure(content: str) -> dict[str, Any]:
    Returns:
        Dictionary with extracted structure
    """
-    import re
-
    structure = {
        "title": None,
        "headers": [],
@@ -526,8 +525,6 @@ def extract_rst_structure(content: str) -> dict[str, Any]:
        logger.warning(f"Enhanced RST parser failed: {e}, using basic parser")

    # Legacy basic extraction (fallback)
-    import re
-
    structure = {
        "title": None,
        "headers": [],
@@ -679,6 +676,17 @@ def process_markdown_docs(
    processed_docs = []
    categories = {}

+    # Pre-import parsers once outside the loop
+    _rst_parser_cls = None
+    _md_parser_cls = None
+    try:
+        from skill_seekers.cli.parsers.extractors import RstParser, MarkdownParser
+
+        _rst_parser_cls = RstParser
+        _md_parser_cls = MarkdownParser
+    except ImportError:
+        logger.debug("Unified parsers not available, using legacy parsers")
+
    for md_path in md_files:
        try:
            content = md_path.read_text(encoding="utf-8", errors="ignore")
@@ -701,7 +709,10 @@ def process_markdown_docs(
                parsed_doc = None

                try:
-                    from skill_seekers.cli.parsers.extractors import RstParser, MarkdownParser
+                    RstParser = _rst_parser_cls
+                    MarkdownParser = _md_parser_cls
+                    if RstParser is None or MarkdownParser is None:
+                        raise ImportError("Parsers not available")

                    # Use appropriate unified parser based on file extension
                    if md_path.suffix.lower() in RST_EXTENSIONS:
@@ -957,8 +968,6 @@ Return JSON with format:

            # Parse response and merge enhancements
            try:
-                import re
-
                json_match = re.search(r"\{.*\}", response.content[0].text, re.DOTALL)
                if json_match:
                    enhancements = json.loads(json_match.group())
@@ -1022,8 +1031,6 @@ Output JSON only:
        os.unlink(prompt_file)

        if result.returncode == 0 and result.stdout:
-            import re
-
            json_match = re.search(r"\{.*\}", result.stdout, re.DOTALL)
            if json_match:
                enhancements = json.loads(json_match.group())
--- a/src/skill_seekers/cli/dependency_analyzer.py
+++ b/src/skill_seekers/cli/dependency_analyzer.py
@@ -40,6 +40,7 @@ Credits:
 """

 import ast
+import bisect
 import logging
 import re
 from dataclasses import dataclass, field
@@ -95,6 +96,16 @@ class DependencyAnalyzer:
        self.graph = nx.DiGraph()  # Directed graph for dependencies
        self.file_dependencies: dict[str, list[DependencyInfo]] = {}
        self.file_nodes: dict[str, FileNode] = {}
+        self._newline_offsets: list[int] = []
+
+    @staticmethod
+    def _build_line_index(content: str) -> list[int]:
+        """Build a sorted list of newline positions for O(log n) line lookups."""
+        return [i for i, ch in enumerate(content) if ch == "\n"]
+
+    def _offset_to_line(self, offset: int) -> int:
+        """Convert a character offset to a 1-based line number using bisect."""
+        return bisect.bisect_left(self._newline_offsets, offset) + 1

    def analyze_file(self, file_path: str, content: str, language: str) -> list[DependencyInfo]:
        """
@@ -109,6 +120,9 @@ class DependencyAnalyzer:
        Returns:
            List of DependencyInfo objects
        """
+        # Build line index once for O(log n) lookups in all extractors
+        self._newline_offsets = self._build_line_index(content)
+
        if language == "Python":
            deps = self._extract_python_imports(content, file_path)
        elif language == "GDScript":
@@ -216,7 +230,7 @@ class DependencyAnalyzer:
        preload_pattern = r'(?:const|var)\s+\w+\s*=\s*preload\("(.+?)"\)'
        for match in re.finditer(preload_pattern, content):
            resource_path = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            # Convert res:// paths to relative
            if resource_path.startswith("res://"):
@@ -236,7 +250,7 @@ class DependencyAnalyzer:
        load_pattern = r'(?:const|var)\s+\w+\s*=\s*load\("(.+?)"\)'
        for match in re.finditer(load_pattern, content):
            resource_path = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            if resource_path.startswith("res://"):
                resource_path = resource_path[6:]
@@ -255,7 +269,7 @@ class DependencyAnalyzer:
        extends_path_pattern = r'extends\s+"(.+?)"'
        for match in re.finditer(extends_path_pattern, content):
            resource_path = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            if resource_path.startswith("res://"):
                resource_path = resource_path[6:]
@@ -275,7 +289,7 @@ class DependencyAnalyzer:
        extends_class_pattern = r"extends\s+([A-Z]\w+)"
        for match in re.finditer(extends_class_pattern, content):
            class_name = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            # Skip built-in Godot classes (Node, Resource, etc.)
            if class_name not in (
@@ -334,7 +348,7 @@ class DependencyAnalyzer:
        import_pattern = r"import\s+(?:[\w\s{},*]+\s+from\s+)?['\"]([^'\"]+)['\"]"
        for match in re.finditer(import_pattern, content):
            module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
            is_relative = module.startswith(".") or module.startswith("/")

            deps.append(
@@ -351,7 +365,7 @@ class DependencyAnalyzer:
        require_pattern = r"require\s*\(['\"]([^'\"]+)['\"]\)"
        for match in re.finditer(require_pattern, content):
            module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
            is_relative = module.startswith(".") or module.startswith("/")

            deps.append(
@@ -380,7 +394,7 @@ class DependencyAnalyzer:
        include_pattern = r'#include\s+[<"]([^>"]+)[>"]'
        for match in re.finditer(include_pattern, content):
            header = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            # Headers with "" are usually local, <> are system headers
            is_relative = '"' in match.group(0)
@@ -417,7 +431,7 @@ class DependencyAnalyzer:
        for match in re.finditer(using_pattern, content):
            alias = match.group(1)  # Optional alias
            namespace = match.group(2)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            # Skip 'using' statements for IDisposable (using var x = ...)
            if "=" in match.group(0) and not alias:
@@ -454,7 +468,7 @@ class DependencyAnalyzer:
        for match in re.finditer(single_import_pattern, content):
            match.group(1)  # Optional alias
            package = match.group(2)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            # Check if relative (starts with ./ or ../)
            is_relative = package.startswith("./")
@@ -516,7 +530,7 @@ class DependencyAnalyzer:
        use_pattern = r"use\s+([\w:{}]+(?:\s*,\s*[\w:{}]+)*|[\w:]+::\{[^}]+\})\s*;"
        for match in re.finditer(use_pattern, content):
            module_path = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            # Determine if relative
            is_relative = module_path.startswith(("self::", "super::"))
@@ -571,7 +585,7 @@ class DependencyAnalyzer:
        import_pattern = r"import\s+(?:static\s+)?([A-Za-z_][\w.]*(?:\.\*)?)\s*;"
        for match in re.finditer(import_pattern, content):
            import_path = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            deps.append(
                DependencyInfo(
@@ -603,7 +617,7 @@ class DependencyAnalyzer:
        require_pattern = r"require\s+['\"]([^'\"]+)['\"]"
        for match in re.finditer(require_pattern, content):
            module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            deps.append(
                DependencyInfo(
@@ -619,7 +633,7 @@ class DependencyAnalyzer:
        require_relative_pattern = r"require_relative\s+['\"]([^'\"]+)['\"]"
        for match in re.finditer(require_relative_pattern, content):
            module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            deps.append(
                DependencyInfo(
@@ -635,7 +649,7 @@ class DependencyAnalyzer:
        load_pattern = r"load\s+['\"]([^'\"]+)['\"]"
        for match in re.finditer(load_pattern, content):
            module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            deps.append(
                DependencyInfo(
@@ -669,7 +683,7 @@ class DependencyAnalyzer:
        require_pattern = r"(?:require|include)(?:_once)?\s+['\"]([^'\"]+)['\"]"
        for match in re.finditer(require_pattern, content):
            module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            # Determine import type
            import_type = "require" if "require" in match.group(0) else "include"
@@ -691,7 +705,7 @@ class DependencyAnalyzer:
        use_pattern = r"use\s+([A-Za-z_][\w\\]*)\s*(?:as\s+\w+)?\s*;"
        for match in re.finditer(use_pattern, content):
            namespace = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())

            deps.append(
                DependencyInfo(
@@ -908,7 +922,7 @@ class DependencyAnalyzer:
                    source_file=file_path,
                    imported_module=resource_path,
                    import_type="ext_resource",
-                    line_number=content[: match.start()].count("\n") + 1,
+                    line_number=self._offset_to_line(match.start()),
                )
            )

@@ -924,7 +938,7 @@ class DependencyAnalyzer:
                    source_file=file_path,
                    imported_module=resource_path,
                    import_type="preload",
-                    line_number=content[: match.start()].count("\n") + 1,
+                    line_number=self._offset_to_line(match.start()),
                )
            )

--- a/src/skill_seekers/cli/doc_scraper.py
+++ b/src/skill_seekers/cli/doc_scraper.py
@@ -64,6 +64,11 @@ FALLBACK_MAIN_SELECTORS = [
    "#main-content",
 ]

+# Pre-compiled regex patterns for frequently called methods
+_WHITESPACE_RE = re.compile(r"\s+")
+_SAFE_TITLE_RE = re.compile(r"[^\w\s-]")
+_SAFE_TITLE_SEP_RE = re.compile(r"[-\s]+")
+

 def infer_description_from_docs(
    base_url: str, first_page_content: str | None = None, name: str = ""
@@ -188,12 +193,18 @@ class DocToSkillConverter:
        # Support multiple starting URLs
        start_urls = config.get("start_urls", [self.base_url])
        self.pending_urls = deque(start_urls)
+        self._pending_set: set[str] = set(start_urls)  # URLs ever enqueued; O(1) dedup lookups
        self.pages: list[dict[str, Any]] = []
        self.pages_scraped = 0

        # Language detection
        self.language_detector = LanguageDetector(min_confidence=0.15)

+        # Pre-cache URL patterns for faster is_valid_url checks
+        url_patterns = config.get("url_patterns", {})
+        self._include_patterns: list[str] = url_patterns.get("include", [])
+        self._exclude_patterns: list[str] = url_patterns.get("exclude", [])
+
        # Thread-safe lock for parallel scraping
        if self.workers > 1:
            import threading
@@ -211,6 +222,12 @@ class DocToSkillConverter:
        if resume and not dry_run:
            self.load_checkpoint()

+    def _enqueue_url(self, url: str) -> None:
+        """Add a URL to the pending queue if not already visited or pending (O(1))."""
+        if url not in self.visited_urls and url not in self._pending_set:
+            self._pending_set.add(url)
+            self.pending_urls.append(url)
+
    def is_valid_url(self, url: str) -> bool:
        """Check if URL should be scraped based on patterns.

@@ -223,14 +240,10 @@ class DocToSkillConverter:
        if not url.startswith(self.base_url):
            return False

-        # Include patterns
-        includes = self.config.get("url_patterns", {}).get("include", [])
-        if includes and not any(pattern in url for pattern in includes):
+        if self._include_patterns and not any(pattern in url for pattern in self._include_patterns):
            return False

-        # Exclude patterns
-        excludes = self.config.get("url_patterns", {}).get("exclude", [])
-        return not any(pattern in url for pattern in excludes)
+        return not any(pattern in url for pattern in self._exclude_patterns)

    def save_checkpoint(self) -> None:
        """Save progress checkpoint"""
@@ -264,7 +277,9 @@ class DocToSkillConverter:
                checkpoint_data = json.load(f)

            self.visited_urls = set(checkpoint_data["visited_urls"])
-            self.pending_urls = deque(checkpoint_data["pending_urls"])
+            pending = checkpoint_data["pending_urls"]
+            self.pending_urls = deque(pending)
+            self._pending_set = set(pending)
            self.pages_scraped = checkpoint_data["pages_scraped"]

            logger.info("✅ Resumed from checkpoint")
@@ -337,11 +352,13 @@ class DocToSkillConverter:

        # Extract links from entire page (always, even if main content not found).
        # This allows discovery of navigation links outside the main content area.
+        seen_links: set[str] = set()
        for link in soup.find_all("a", href=True):
            href = urljoin(url, link["href"])
            # Strip anchor fragments to avoid treating #anchors as separate pages
            href = href.split("#")[0]
-            if self.is_valid_url(href) and href not in page["links"]:
+            if href not in seen_links and self.is_valid_url(href):
+                seen_links.add(href)
                page["links"].append(href)

        # Find main content using shared fallback logic
@@ -413,8 +430,6 @@ class DocToSkillConverter:
            Only .md links are extracted to avoid client-side rendered HTML pages.
            Anchor fragments (#section) are stripped from links.
        """
-        import re
-
        # Detect if content is actually HTML (some .md URLs return HTML)
        if content.strip().startswith("<!DOCTYPE") or content.strip().startswith("<html"):
            return self._extract_html_as_markdown(content, url)
@@ -649,8 +664,7 @@ class DocToSkillConverter:

    def clean_text(self, text: str) -> str:
        """Clean text content"""
-        text = re.sub(r"\s+", " ", text)
-        return text.strip()
+        return _WHITESPACE_RE.sub(" ", text).strip()

    def save_page(self, page: dict[str, Any]) -> None:
        """Save page data (skip pages with empty content)"""
@@ -660,8 +674,8 @@ class DocToSkillConverter:
            return

        url_hash = hashlib.md5(page["url"].encode()).hexdigest()[:10]
-        safe_title = re.sub(r"[^\w\s-]", "", page["title"])[:50]
-        safe_title = re.sub(r"[-\s]+", "_", safe_title)
+        safe_title = _SAFE_TITLE_RE.sub("", page["title"])[:50]
+        safe_title = _SAFE_TITLE_SEP_RE.sub("_", safe_title)

        filename = f"{safe_title}_{url_hash}.json"
        filepath = os.path.join(self.data_dir, "pages", filename)
@@ -695,27 +709,19 @@ class DocToSkillConverter:
                soup = BeautifulSoup(response.content, "html.parser")
                page = self.extract_content(soup, url)

-            # Thread-safe operations (lock required)
-            if self.workers > 1:
-                with self.lock:
-                    logger.info("  %s", url)
-                    self.save_page(page)
-                    self.pages.append(page)
-
-                    # Add new URLs
-                    for link in page["links"]:
-                        if link not in self.visited_urls and link not in self.pending_urls:
-                            self.pending_urls.append(link)
-            else:
-                # Single-threaded mode (no lock needed)
+            # Store results (thread-safe when workers > 1)
+            def _store_results():
                logger.info("  %s", url)
                self.save_page(page)
                self.pages.append(page)
-
-                # Add new URLs
                for link in page["links"]:
-                    if link not in self.visited_urls and link not in self.pending_urls:
-                        self.pending_urls.append(link)
+                    self._enqueue_url(link)
+
+            if self.workers > 1:
+                with self.lock:
+                    _store_results()
+            else:
+                _store_results()

            # Rate limiting
            rate_limit = self.config.get("rate_limit", DEFAULT_RATE_LIMIT)
@@ -766,8 +772,7 @@ class DocToSkillConverter:

                # Add new URLs
                for link in page["links"]:
-                    if link not in self.visited_urls and link not in self.pending_urls:
-                        self.pending_urls.append(link)
+                    self._enqueue_url(link)

                # Rate limiting
                rate_limit = self.config.get("rate_limit", DEFAULT_RATE_LIMIT)
@@ -924,8 +929,8 @@ class DocToSkillConverter:

                    # Filter URLs based on url_patterns config
                    for url in md_urls:
-                        if self.is_valid_url(url) and url not in self.visited_urls:
-                            self.pending_urls.append(url)
+                        if self.is_valid_url(url):
+                            self._enqueue_url(url)

                    logger.info(
                        "  📋 %d URLs added to crawl queue after filtering",
@@ -1010,8 +1015,8 @@ class DocToSkillConverter:

            # Filter URLs based on url_patterns config
            for url in md_urls:
-                if self.is_valid_url(url) and url not in self.visited_urls:
-                    self.pending_urls.append(url)
+                if self.is_valid_url(url):
+                    self._enqueue_url(url)

            logger.info(
                "  📋 %d URLs added to crawl queue after filtering",
@@ -1115,8 +1120,8 @@ class DocToSkillConverter:
                        for link in soup.find_all("a", href=True):
                            href = urljoin(url, link["href"])
                            href = href.split("#")[0]
-                            if self.is_valid_url(href) and href not in self.visited_urls:
-                                self.pending_urls.append(href)
+                            if self.is_valid_url(href):
+                                self._enqueue_url(href)
                    except Exception as e:
                        # Failed to extract links in fast mode, continue anyway
                        logger.warning("⚠️  Warning: Could not extract links from %s: %s", url, e)
@@ -1299,8 +1304,8 @@ class DocToSkillConverter:
                                for link in soup.find_all("a", href=True):
                                    href = urljoin(url, link["href"])
                                    href = href.split("#")[0]
-                                    if self.is_valid_url(href) and href not in self.visited_urls:
-                                        self.pending_urls.append(href)
+                                    if self.is_valid_url(href):
+                                        self._enqueue_url(href)
                            except Exception as e:
                                logger.warning(
                                    "⚠️  Warning: Could not extract links from %s: %s", url, e
@@ -1313,7 +1318,12 @@ class DocToSkillConverter:

                # Wait for batch to complete before continuing
                if tasks:
-                    await asyncio.gather(*tasks, return_exceptions=True)
+                    results = await asyncio.gather(*tasks, return_exceptions=True)
+                    for result in results:
+                        if isinstance(result, Exception):
+                            logger.error(
+                                "  ✗ Async task failed: %s: %s", type(result).__name__, result
+                            )
                    tasks = []
                    self.pages_scraped = len(self.visited_urls)

@@ -1331,7 +1341,10 @@ class DocToSkillConverter:

            # Wait for any remaining tasks
            if tasks:
-                await asyncio.gather(*tasks, return_exceptions=True)
+                results = await asyncio.gather(*tasks, return_exceptions=True)
+                for result in results:
+                    if isinstance(result, Exception):
+                        logger.error("  ✗ Async task failed: %s: %s", type(result).__name__, result)

        if self.dry_run:
            logger.info("\n✅ Dry run complete: would scrape ~%d pages", len(self.visited_urls))
@@ -1356,8 +1369,11 @@ class DocToSkillConverter:
            "pages": [{"title": p["title"], "url": p["url"]} for p in self.pages],
        }

-        with open(f"{self.data_dir}/summary.json", "w", encoding="utf-8") as f:
-            json.dump(summary, f, indent=2, ensure_ascii=False)
+        try:
+            with open(f"{self.data_dir}/summary.json", "w", encoding="utf-8") as f:
+                json.dump(summary, f, indent=2, ensure_ascii=False)
+        except OSError as e:
+            logger.error("  ✗ Failed to save summary: %s", e)

    def load_scraped_data(self) -> list[dict[str, Any]]:
        """Load previously scraped data"""
@@ -1395,6 +1411,11 @@ class DocToSkillConverter:
        categories: dict[str, list[dict[str, Any]]] = {cat: [] for cat in category_defs}
        categories["other"] = []

+        # Pre-lowercase keywords once instead of per-page per-keyword
+        lowered_defs = {
+            cat: [kw.lower() for kw in keywords] for cat, keywords in category_defs.items()
+        }
+
        for page in pages:
            url = page["url"].lower()
            title = page["title"].lower()
@@ -1404,11 +1425,10 @@ class DocToSkillConverter:

            categorized = False

-            # Match against keywords
-            for cat, keywords in category_defs.items():
+            # Match against pre-lowercased keywords
+            for cat, keywords in lowered_defs.items():
                score = 0
                for keyword in keywords:
-                    keyword = keyword.lower()
                    if keyword in url:
                        score += 3
                    if keyword in title:
@@ -1450,15 +1470,12 @@ class DocToSkillConverter:
            if count >= 3:  # At least 3 pages
                categories[seg] = [seg]

-        # Add common defaults
-        if "tutorial" not in categories and any(
-            "tutorial" in url for url in [p["url"] for p in pages]
-        ):
+        # Add common defaults. NOTE(review): guard tests "tutorial" but adds "tutorials".
+        all_urls = [p["url"] for p in pages]
+        if "tutorial" not in categories and any("tutorial" in url for url in all_urls):
            categories["tutorials"] = ["tutorial", "guide", "getting-started"]

-        if "api" not in categories and any(
-            "api" in url or "reference" in url for url in [p["url"] for p in pages]
-        ):
+        if "api" not in categories and any("api" in url or "reference" in url for url in all_urls):
            categories["api"] = ["api", "reference", "class"]

        return categories
--- a/src/skill_seekers/cli/github_scraper.py
+++ b/src/skill_seekers/cli/github_scraper.py
@@ -15,6 +15,7 @@ Usage:
 """

 import argparse
+import fnmatch
 import json
 import logging
 import os
@@ -664,11 +665,13 @@ class GitHubScraper:
    def _extract_file_tree_github(self):
        """Extract file tree from GitHub API (rate-limited)."""
        try:
-            contents = self.repo.get_contents("")
+            from collections import deque
+
+            contents = deque(self.repo.get_contents(""))
            file_tree = []

            while contents:
-                file_content = contents.pop(0)
+                file_content = contents.popleft()

                file_info = {
                    "path": file_content.path,
@@ -741,11 +744,10 @@ class GitHubScraper:
                continue

            # Check if file matches patterns (if specified)
-            if self.file_patterns:
-                import fnmatch
-
-                if not any(fnmatch.fnmatch(file_path, pattern) for pattern in self.file_patterns):
-                    continue
+            if self.file_patterns and not any(
+                fnmatch.fnmatch(file_path, pattern) for pattern in self.file_patterns
+            ):
+                continue

            # Analyze this file
            try: