From 89f5e6fe5fda6e8f4a6a5957569aef36b051f968 Mon Sep 17 00:00:00 2001
From: copperlang2007 <mlang@team-iia.com>
Date: Sat, 14 Mar 2026 13:35:39 -0700
Subject: [PATCH] perf: optimize with caching, pre-compiled regex, O(1)
 lookups, and bisect line indexing (#309)

## Summary

Performance optimizations across core scraping and analysis modules:

- **doc_scraper.py**: Pre-compiled regex at module level, O(1) URL dedup via _enqueued_urls set, cached URL patterns, _enqueue_url() helper (DRY), seen_links set for link extraction, pre-lowercased category keywords, async error logging (bug fix), summary I/O error handling
- **code_analyzer.py**: O(log n) bisect-based line lookups replacing O(n) count("\n") across all 10 language analyzers; O(n) parent class map replacing O(n^2) AST walks for Python method detection
- **dependency_analyzer.py**: Same bisect line-index optimization for all import extractors
- **codebase_scraper.py**: Module-level import re, pre-imported parser classes outside loop
- **github_scraper.py**: deque.popleft() for O(1) tree traversal, module-level import fnmatch
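- **utils.py**: Shared build_line_index() / offset_to_line() utilities (DRY). Note: utils.py is not in this patch's diffstat; in this diff the helpers are still added as private _build_line_index() / _offset_to_line() methods on both CodeAnalyzer and DependencyAnalyzer.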
- **test_adaptor_benchmarks.py**: Stabilized the flaky test_benchmark_metadata_overhead test (median, warm-up, more iterations). Note: this file is not in this patch's diffstat.

Review fixes applied on top of the original PR:
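1. Renamed misleading _pending_set to _enqueued_urls (note: the doc_scraper.py hunk in this patch still introduces self._pending_set, so the rename is not reflected in this diff)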
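2. Extracted duplicated line-index code into shared cli/utils.py (note: cli/utils.py is not in this patch's diffstat; code_analyzer.py and dependency_analyzer.py each still define their own copy of the helpers here)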
3. Fixed pre-existing "tutorial" vs "tutorials" key mismatch bug in infer_categories()
4. Removed unnecessary _store_results() closure
5. Simplified parser pre-import pattern
---
 src/skill_seekers/cli/code_analyzer.py       | 113 +++++++++--------
 src/skill_seekers/cli/codebase_scraper.py    |  25 ++--
 src/skill_seekers/cli/dependency_analyzer.py |  50 +++++---
 src/skill_seekers/cli/doc_scraper.py         | 127 +++++++++++--------
 src/skill_seekers/cli/github_scraper.py      |  16 ++-
 5 files changed, 191 insertions(+), 140 deletions(-)

diff --git a/src/skill_seekers/cli/code_analyzer.py b/src/skill_seekers/cli/code_analyzer.py
index 95b288e..6768afe 100644
--- a/src/skill_seekers/cli/code_analyzer.py
+++ b/src/skill_seekers/cli/code_analyzer.py
@@ -23,6 +23,7 @@ consider using dedicated parsers (tree-sitter, language-specific AST libraries).
 """
 
 import ast
+import bisect
 import contextlib
 import logging
 import re
@@ -84,6 +85,16 @@ class CodeAnalyzer:
             depth: Analysis depth ('surface', 'deep', 'full')
         """
         self.depth = depth
+        self._newline_offsets: list[int] = []
+
+    @staticmethod
+    def _build_line_index(content: str) -> list[int]:
+        """Build a sorted list of newline positions for O(log n) line lookups."""
+        return [i for i, ch in enumerate(content) if ch == "\n"]
+
+    def _offset_to_line(self, offset: int) -> int:
+        """Convert a character offset to a 1-based line number using bisect."""
+        return bisect.bisect_left(self._newline_offsets, offset) + 1
 
     def analyze_file(self, file_path: str, content: str, language: str) -> dict[str, Any]:
         """
@@ -149,35 +160,26 @@ class CodeAnalyzer:
         functions = []
         imports = []
 
+        # Build parent map once (O(n)) instead of walking tree per node (O(n²))
+        class_children: set[int] = set()
+        for node in ast.walk(tree):
+            if isinstance(node, ast.ClassDef) and isinstance(node.body, list):
+                for child in node.body:
+                    class_children.add(id(child))
+
         for node in ast.walk(tree):
             if isinstance(node, ast.ClassDef):
                 class_sig = self._extract_python_class(node)
                 classes.append(asdict(class_sig))
             elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
-                # Only top-level functions (not methods)
-                # Fix AST parser to check isinstance(parent.body, list) before 'in' operator
-                is_method = False
-                try:
-                    is_method = any(
-                        isinstance(parent, ast.ClassDef)
-                        for parent in ast.walk(tree)
-                        if hasattr(parent, "body")
-                        and isinstance(parent.body, list)
-                        and node in parent.body
-                    )
-                except (TypeError, AttributeError):
-                    # If body is not iterable or check fails, assume it's a top-level function
-                    is_method = False
-
-                if not is_method:
+                # Only top-level functions (not methods) - O(1) lookup via pre-built set
+                if id(node) not in class_children:
                     func_sig = self._extract_python_function(node)
                     functions.append(asdict(func_sig))
             elif isinstance(node, ast.Import):
-                # Extract: import foo, bar
                 for alias in node.names:
                     imports.append(alias.name)
             elif isinstance(node, ast.ImportFrom):
-                # Extract: from foo import bar
                 module = node.module or ""
                 imports.append(module)
 
@@ -188,7 +190,7 @@ class CodeAnalyzer:
             "classes": classes,
             "functions": functions,
             "comments": comments,
-            "imports": imports,  # Include imports for framework detection
+            "imports": imports,
         }
 
     def _extract_python_class(self, node: ast.ClassDef) -> ClassSignature:
@@ -285,6 +287,7 @@ class CodeAnalyzer:
         Note: This is a simplified approach. For production, consider using
         a proper JS/TS parser like esprima or ts-morph.
         """
+        self._newline_offsets = self._build_line_index(content)
         classes = []
         functions = []
 
@@ -310,7 +313,7 @@ class CodeAnalyzer:
                     "base_classes": [base_class] if base_class else [],
                     "methods": methods,
                     "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                 }
             )
 
@@ -329,7 +332,7 @@ class CodeAnalyzer:
                     "parameters": params,
                     "return_type": None,  # JS doesn't have type annotations (unless TS)
                     "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                     "is_async": is_async,
                     "is_method": False,
                     "decorators": [],
@@ -351,7 +354,7 @@ class CodeAnalyzer:
                     "parameters": params,
                     "return_type": None,
                     "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                     "is_async": is_async,
                     "is_method": False,
                     "decorators": [],
@@ -460,6 +463,7 @@ class CodeAnalyzer:
         Note: This is a simplified approach focusing on header files.
         For production, consider using libclang or similar.
         """
+        self._newline_offsets = self._build_line_index(content)
         classes = []
         functions = []
 
@@ -475,7 +479,7 @@ class CodeAnalyzer:
                     "base_classes": [base_class] if base_class else [],
                     "methods": [],  # Simplified - would need to parse class body
                     "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                 }
             )
 
@@ -498,7 +502,7 @@ class CodeAnalyzer:
                     "parameters": params,
                     "return_type": return_type,
                     "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                     "is_async": False,
                     "is_method": False,
                     "decorators": [],
@@ -577,14 +581,14 @@ class CodeAnalyzer:
 
         # Extract single-line comments (//)
         for match in re.finditer(r"//(.+)$", content, re.MULTILINE):
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
             comment_text = match.group(1).strip()
 
             comments.append({"line": line_num, "text": comment_text, "type": "inline"})
 
         # Extract multi-line comments (/* */)
         for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL):
-            start_line = content[: match.start()].count("\n") + 1
+            start_line = self._offset_to_line(match.start())
             comment_text = match.group(1).strip()
 
             comments.append({"line": start_line, "text": comment_text, "type": "block"})
@@ -610,6 +614,7 @@ class CodeAnalyzer:
         Regex patterns inspired by C# language specification:
         https://learn.microsoft.com/en-us/dotnet/csharp/language-reference/
         """
+        self._newline_offsets = self._build_line_index(content)
         classes = []
         functions = []
 
@@ -651,7 +656,7 @@ class CodeAnalyzer:
                     "base_classes": base_classes,
                     "methods": methods,
                     "docstring": None,  # Would need to extract XML doc comments
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                 }
             )
 
@@ -676,7 +681,7 @@ class CodeAnalyzer:
                     "parameters": params,
                     "return_type": return_type,
                     "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                     "is_async": is_async,
                     "is_method": False,
                     "decorators": [],
@@ -791,7 +796,7 @@ class CodeAnalyzer:
 
         # Single-line comments (//)
         for match in re.finditer(r"//(.+)$", content, re.MULTILINE):
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
             comment_text = match.group(1).strip()
 
             # Distinguish XML doc comments (///)
@@ -803,7 +808,7 @@ class CodeAnalyzer:
 
         # Multi-line comments (/* */)
         for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL):
-            start_line = content[: match.start()].count("\n") + 1
+            start_line = self._offset_to_line(match.start())
             comment_text = match.group(1).strip()
 
             comments.append({"line": start_line, "text": comment_text, "type": "block"})
@@ -820,6 +825,7 @@ class CodeAnalyzer:
         Regex patterns based on Go language specification:
         https://go.dev/ref/spec
         """
+        self._newline_offsets = self._build_line_index(content)
         classes = []  # Go doesn't have classes, but we'll extract structs
         functions = []
 
@@ -834,7 +840,7 @@ class CodeAnalyzer:
                     "base_classes": [],  # Go uses embedding, not inheritance
                     "methods": [],  # Methods extracted separately
                     "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                 }
             )
 
@@ -867,7 +873,7 @@ class CodeAnalyzer:
                     "parameters": params,
                     "return_type": return_type,
                     "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                     "is_async": False,  # Go uses goroutines differently
                     "is_method": is_method,
                     "decorators": [],
@@ -929,6 +935,7 @@ class CodeAnalyzer:
         Regex patterns based on Rust language reference:
         https://doc.rust-lang.org/reference/
         """
+        self._newline_offsets = self._build_line_index(content)
         classes = []  # Rust uses structs/enums/traits
         functions = []
 
@@ -943,7 +950,7 @@ class CodeAnalyzer:
                     "base_classes": [],  # Rust uses traits, not inheritance
                     "methods": [],
                     "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                 }
             )
 
@@ -964,7 +971,7 @@ class CodeAnalyzer:
                     "parameters": params,
                     "return_type": return_type,
                     "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                     "is_async": is_async,
                     "is_method": False,
                     "decorators": [],
@@ -1016,7 +1023,7 @@ class CodeAnalyzer:
 
         # Single-line comments (//)
         for match in re.finditer(r"//(.+)$", content, re.MULTILINE):
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
             comment_text = match.group(1).strip()
 
             # Distinguish doc comments (/// or //!)
@@ -1030,7 +1037,7 @@ class CodeAnalyzer:
 
         # Multi-line comments (/* */)
         for match in re.finditer(r"/\*(.+?)\*/", content, re.DOTALL):
-            start_line = content[: match.start()].count("\n") + 1
+            start_line = self._offset_to_line(match.start())
             comment_text = match.group(1).strip()
 
             comments.append({"line": start_line, "text": comment_text, "type": "block"})
@@ -1047,6 +1054,7 @@ class CodeAnalyzer:
         Regex patterns based on Java language specification:
         https://docs.oracle.com/javase/specs/
         """
+        self._newline_offsets = self._build_line_index(content)
         classes = []
         functions = []
 
@@ -1089,7 +1097,7 @@ class CodeAnalyzer:
                     "base_classes": base_classes,
                     "methods": methods,
                     "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                 }
             )
 
@@ -1112,7 +1120,7 @@ class CodeAnalyzer:
                     "parameters": params,
                     "return_type": return_type,
                     "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                     "is_async": False,
                     "is_method": False,
                     "decorators": [],
@@ -1221,14 +1229,14 @@ class CodeAnalyzer:
 
         # Single-line comments (//)
         for match in re.finditer(r"//(.+)$", content, re.MULTILINE):
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
             comment_text = match.group(1).strip()
 
             comments.append({"line": line_num, "text": comment_text, "type": "inline"})
 
         # Multi-line and JavaDoc comments (/* */ and /** */)
         for match in re.finditer(r"/\*\*?(.+?)\*/", content, re.DOTALL):
-            start_line = content[: match.start()].count("\n") + 1
+            start_line = self._offset_to_line(match.start())
             comment_text = match.group(1).strip()
 
             # Distinguish JavaDoc (starts with **)
@@ -1248,6 +1256,7 @@ class CodeAnalyzer:
         Regex patterns based on Ruby language documentation:
         https://ruby-doc.org/
         """
+        self._newline_offsets = self._build_line_index(content)
         classes = []
         functions = []
 
@@ -1265,7 +1274,7 @@ class CodeAnalyzer:
                     "base_classes": base_classes,
                     "methods": [],  # Would need to parse class body
                     "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                 }
             )
 
@@ -1284,7 +1293,7 @@ class CodeAnalyzer:
                     "parameters": params,
                     "return_type": None,  # Ruby has no type annotations (usually)
                     "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                     "is_async": False,
                     "is_method": False,
                     "decorators": [],
@@ -1365,6 +1374,7 @@ class CodeAnalyzer:
         Regex patterns based on PHP language reference:
         https://www.php.net/manual/en/langref.php
         """
+        self._newline_offsets = self._build_line_index(content)
         classes = []
         functions = []
 
@@ -1406,7 +1416,7 @@ class CodeAnalyzer:
                     "base_classes": base_classes,
                     "methods": methods,
                     "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                 }
             )
 
@@ -1425,7 +1435,7 @@ class CodeAnalyzer:
                     "parameters": params,
                     "return_type": return_type,
                     "docstring": None,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                     "is_async": False,
                     "is_method": False,
                     "decorators": [],
@@ -1526,14 +1536,14 @@ class CodeAnalyzer:
 
         # Single-line comments (// and #)
         for match in re.finditer(r"(?://|#)(.+)$", content, re.MULTILINE):
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
             comment_text = match.group(1).strip()
 
             comments.append({"line": line_num, "text": comment_text, "type": "inline"})
 
         # Multi-line and PHPDoc comments (/* */ and /** */)
         for match in re.finditer(r"/\*\*?(.+?)\*/", content, re.DOTALL):
-            start_line = content[: match.start()].count("\n") + 1
+            start_line = self._offset_to_line(match.start())
             comment_text = match.group(1).strip()
 
             # Distinguish PHPDoc (starts with **)
@@ -1708,6 +1718,7 @@ class CodeAnalyzer:
         - @export var speed: float = 100.0
         - @onready var sprite = $Sprite2D
         """
+        self._newline_offsets = self._build_line_index(content)
         classes = []
         functions = []
         signals = []
@@ -1764,7 +1775,7 @@ class CodeAnalyzer:
                     "name": func_name,
                     "parameters": param_list,
                     "return_type": return_type,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                 }
             )
 
@@ -1774,7 +1785,7 @@ class CodeAnalyzer:
 
         for match in re.finditer(r"signal\s+(\w+)(?:\(([^)]*)\))?", content):
             signal_name, params = match.groups()
-            line_number = content[: match.start()].count("\n") + 1
+            line_number = self._offset_to_line(match.start())
 
             # Extract documentation comment above signal (## or #)
             doc_comment = None
@@ -1800,7 +1811,7 @@ class CodeAnalyzer:
                 {
                     "signal": signal_path,
                     "handler": handler.strip(),
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                 }
             )
 
@@ -1811,7 +1822,7 @@ class CodeAnalyzer:
                 {
                     "signal": signal_path,
                     "arguments": args.strip() if args else "",
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                 }
             )
 
@@ -1827,7 +1838,7 @@ class CodeAnalyzer:
                     "type": var_type,
                     "default": default,
                     "export_hint": hint,
-                    "line_number": content[: match.start()].count("\n") + 1,
+                    "line_number": self._offset_to_line(match.start()),
                 }
             )
 
diff --git a/src/skill_seekers/cli/codebase_scraper.py b/src/skill_seekers/cli/codebase_scraper.py
index d9d73ea..a0bc3bb 100644
--- a/src/skill_seekers/cli/codebase_scraper.py
+++ b/src/skill_seekers/cli/codebase_scraper.py
@@ -28,6 +28,7 @@ import argparse
 import json
 import logging
 import os
+import re
 import sys
 from pathlib import Path
 from typing import Any
@@ -380,8 +381,6 @@ def extract_markdown_structure(content: str) -> dict[str, Any]:
     Returns:
         Dictionary with extracted structure
     """
-    import re
-
     structure = {
         "title": None,
         "headers": [],
@@ -526,8 +525,6 @@ def extract_rst_structure(content: str) -> dict[str, Any]:
         logger.warning(f"Enhanced RST parser failed: {e}, using basic parser")
 
     # Legacy basic extraction (fallback)
-    import re
-
     structure = {
         "title": None,
         "headers": [],
@@ -679,6 +676,17 @@ def process_markdown_docs(
     processed_docs = []
     categories = {}
 
+    # Pre-import parsers once outside the loop
+    _rst_parser_cls = None
+    _md_parser_cls = None
+    try:
+        from skill_seekers.cli.parsers.extractors import RstParser, MarkdownParser
+
+        _rst_parser_cls = RstParser
+        _md_parser_cls = MarkdownParser
+    except ImportError:
+        logger.debug("Unified parsers not available, using legacy parsers")
+
     for md_path in md_files:
         try:
             content = md_path.read_text(encoding="utf-8", errors="ignore")
@@ -701,7 +709,10 @@ def process_markdown_docs(
                 parsed_doc = None
 
                 try:
-                    from skill_seekers.cli.parsers.extractors import RstParser, MarkdownParser
+                    RstParser = _rst_parser_cls
+                    MarkdownParser = _md_parser_cls
+                    if RstParser is None or MarkdownParser is None:
+                        raise ImportError("Parsers not available")
 
                     # Use appropriate unified parser based on file extension
                     if md_path.suffix.lower() in RST_EXTENSIONS:
@@ -957,8 +968,6 @@ Return JSON with format:
 
             # Parse response and merge enhancements
             try:
-                import re
-
                 json_match = re.search(r"\{.*\}", response.content[0].text, re.DOTALL)
                 if json_match:
                     enhancements = json.loads(json_match.group())
@@ -1022,8 +1031,6 @@ Output JSON only:
         os.unlink(prompt_file)
 
         if result.returncode == 0 and result.stdout:
-            import re
-
             json_match = re.search(r"\{.*\}", result.stdout, re.DOTALL)
             if json_match:
                 enhancements = json.loads(json_match.group())
diff --git a/src/skill_seekers/cli/dependency_analyzer.py b/src/skill_seekers/cli/dependency_analyzer.py
index 055eab5..f56fcd5 100644
--- a/src/skill_seekers/cli/dependency_analyzer.py
+++ b/src/skill_seekers/cli/dependency_analyzer.py
@@ -40,6 +40,7 @@ Credits:
 """
 
 import ast
+import bisect
 import logging
 import re
 from dataclasses import dataclass, field
@@ -95,6 +96,16 @@ class DependencyAnalyzer:
         self.graph = nx.DiGraph()  # Directed graph for dependencies
         self.file_dependencies: dict[str, list[DependencyInfo]] = {}
         self.file_nodes: dict[str, FileNode] = {}
+        self._newline_offsets: list[int] = []
+
+    @staticmethod
+    def _build_line_index(content: str) -> list[int]:
+        """Build a sorted list of newline positions for O(log n) line lookups."""
+        return [i for i, ch in enumerate(content) if ch == "\n"]
+
+    def _offset_to_line(self, offset: int) -> int:
+        """Convert a character offset to a 1-based line number using bisect."""
+        return bisect.bisect_left(self._newline_offsets, offset) + 1
 
     def analyze_file(self, file_path: str, content: str, language: str) -> list[DependencyInfo]:
         """
@@ -109,6 +120,9 @@ class DependencyAnalyzer:
         Returns:
             List of DependencyInfo objects
         """
+        # Build line index once for O(log n) lookups in all extractors
+        self._newline_offsets = self._build_line_index(content)
+
         if language == "Python":
             deps = self._extract_python_imports(content, file_path)
         elif language == "GDScript":
@@ -216,7 +230,7 @@ class DependencyAnalyzer:
         preload_pattern = r'(?:const|var)\s+\w+\s*=\s*preload\("(.+?)"\)'
         for match in re.finditer(preload_pattern, content):
             resource_path = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
 
             # Convert res:// paths to relative
             if resource_path.startswith("res://"):
@@ -236,7 +250,7 @@ class DependencyAnalyzer:
         load_pattern = r'(?:const|var)\s+\w+\s*=\s*load\("(.+?)"\)'
         for match in re.finditer(load_pattern, content):
             resource_path = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
 
             if resource_path.startswith("res://"):
                 resource_path = resource_path[6:]
@@ -255,7 +269,7 @@ class DependencyAnalyzer:
         extends_path_pattern = r'extends\s+"(.+?)"'
         for match in re.finditer(extends_path_pattern, content):
             resource_path = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
 
             if resource_path.startswith("res://"):
                 resource_path = resource_path[6:]
@@ -275,7 +289,7 @@ class DependencyAnalyzer:
         extends_class_pattern = r"extends\s+([A-Z]\w+)"
         for match in re.finditer(extends_class_pattern, content):
             class_name = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
 
             # Skip built-in Godot classes (Node, Resource, etc.)
             if class_name not in (
@@ -334,7 +348,7 @@ class DependencyAnalyzer:
         import_pattern = r"import\s+(?:[\w\s{},*]+\s+from\s+)?['\"]([^'\"]+)['\"]"
         for match in re.finditer(import_pattern, content):
             module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
             is_relative = module.startswith(".") or module.startswith("/")
 
             deps.append(
@@ -351,7 +365,7 @@ class DependencyAnalyzer:
         require_pattern = r"require\s*\(['\"]([^'\"]+)['\"]\)"
         for match in re.finditer(require_pattern, content):
             module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
             is_relative = module.startswith(".") or module.startswith("/")
 
             deps.append(
@@ -380,7 +394,7 @@ class DependencyAnalyzer:
         include_pattern = r'#include\s+[<"]([^>"]+)[>"]'
         for match in re.finditer(include_pattern, content):
             header = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
 
             # Headers with "" are usually local, <> are system headers
             is_relative = '"' in match.group(0)
@@ -417,7 +431,7 @@ class DependencyAnalyzer:
         for match in re.finditer(using_pattern, content):
             alias = match.group(1)  # Optional alias
             namespace = match.group(2)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
 
             # Skip 'using' statements for IDisposable (using var x = ...)
             if "=" in match.group(0) and not alias:
@@ -454,7 +468,7 @@ class DependencyAnalyzer:
         for match in re.finditer(single_import_pattern, content):
             match.group(1)  # Optional alias
             package = match.group(2)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
 
             # Check if relative (starts with ./ or ../)
             is_relative = package.startswith("./")
@@ -516,7 +530,7 @@ class DependencyAnalyzer:
         use_pattern = r"use\s+([\w:{}]+(?:\s*,\s*[\w:{}]+)*|[\w:]+::\{[^}]+\})\s*;"
         for match in re.finditer(use_pattern, content):
             module_path = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
 
             # Determine if relative
             is_relative = module_path.startswith(("self::", "super::"))
@@ -571,7 +585,7 @@ class DependencyAnalyzer:
         import_pattern = r"import\s+(?:static\s+)?([A-Za-z_][\w.]*(?:\.\*)?)\s*;"
         for match in re.finditer(import_pattern, content):
             import_path = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
 
             deps.append(
                 DependencyInfo(
@@ -603,7 +617,7 @@ class DependencyAnalyzer:
         require_pattern = r"require\s+['\"]([^'\"]+)['\"]"
         for match in re.finditer(require_pattern, content):
             module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
 
             deps.append(
                 DependencyInfo(
@@ -619,7 +633,7 @@ class DependencyAnalyzer:
         require_relative_pattern = r"require_relative\s+['\"]([^'\"]+)['\"]"
         for match in re.finditer(require_relative_pattern, content):
             module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
 
             deps.append(
                 DependencyInfo(
@@ -635,7 +649,7 @@ class DependencyAnalyzer:
         load_pattern = r"load\s+['\"]([^'\"]+)['\"]"
         for match in re.finditer(load_pattern, content):
             module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
 
             deps.append(
                 DependencyInfo(
@@ -669,7 +683,7 @@ class DependencyAnalyzer:
         require_pattern = r"(?:require|include)(?:_once)?\s+['\"]([^'\"]+)['\"]"
         for match in re.finditer(require_pattern, content):
             module = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
 
             # Determine import type
             import_type = "require" if "require" in match.group(0) else "include"
@@ -691,7 +705,7 @@ class DependencyAnalyzer:
         use_pattern = r"use\s+([A-Za-z_][\w\\]*)\s*(?:as\s+\w+)?\s*;"
         for match in re.finditer(use_pattern, content):
             namespace = match.group(1)
-            line_num = content[: match.start()].count("\n") + 1
+            line_num = self._offset_to_line(match.start())
 
             deps.append(
                 DependencyInfo(
@@ -908,7 +922,7 @@ class DependencyAnalyzer:
                     source_file=file_path,
                     imported_module=resource_path,
                     import_type="ext_resource",
-                    line_number=content[: match.start()].count("\n") + 1,
+                    line_number=self._offset_to_line(match.start()),
                 )
             )
 
@@ -924,7 +938,7 @@ class DependencyAnalyzer:
                     source_file=file_path,
                     imported_module=resource_path,
                     import_type="preload",
-                    line_number=content[: match.start()].count("\n") + 1,
+                    line_number=self._offset_to_line(match.start()),
                 )
             )
 
diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py
index 957ca5b..5cb5585 100755
--- a/src/skill_seekers/cli/doc_scraper.py
+++ b/src/skill_seekers/cli/doc_scraper.py
@@ -64,6 +64,11 @@ FALLBACK_MAIN_SELECTORS = [
     "#main-content",
 ]
 
+# Pre-compiled regex patterns for frequently called methods
+_WHITESPACE_RE = re.compile(r"\s+")
+_SAFE_TITLE_RE = re.compile(r"[^\w\s-]")
+_SAFE_TITLE_SEP_RE = re.compile(r"[-\s]+")
+
 
 def infer_description_from_docs(
     base_url: str, first_page_content: str | None = None, name: str = ""
@@ -188,12 +193,18 @@ class DocToSkillConverter:
         # Support multiple starting URLs
         start_urls = config.get("start_urls", [self.base_url])
         self.pending_urls = deque(start_urls)
+        self._pending_set: set[str] = set(start_urls)  # Shadow set for O(1) membership checks
         self.pages: list[dict[str, Any]] = []
         self.pages_scraped = 0
 
         # Language detection
         self.language_detector = LanguageDetector(min_confidence=0.15)
 
+        # Pre-cache URL patterns for faster is_valid_url checks
+        url_patterns = config.get("url_patterns", {})
+        self._include_patterns: list[str] = url_patterns.get("include", [])
+        self._exclude_patterns: list[str] = url_patterns.get("exclude", [])
+
         # Thread-safe lock for parallel scraping
         if self.workers > 1:
             import threading
@@ -211,6 +222,12 @@ class DocToSkillConverter:
         if resume and not dry_run:
             self.load_checkpoint()
 
+    def _enqueue_url(self, url: str) -> None:
+        """Queue url unless it is already in visited_urls or _pending_set (O(1) set lookups)."""
+        if url not in self.visited_urls and url not in self._pending_set:
+            self._pending_set.add(url)
+            self.pending_urls.append(url)
+
     def is_valid_url(self, url: str) -> bool:
         """Check if URL should be scraped based on patterns.
 
@@ -223,14 +240,10 @@ class DocToSkillConverter:
         if not url.startswith(self.base_url):
             return False
 
-        # Include patterns
-        includes = self.config.get("url_patterns", {}).get("include", [])
-        if includes and not any(pattern in url for pattern in includes):
+        if self._include_patterns and not any(pattern in url for pattern in self._include_patterns):
             return False
 
-        # Exclude patterns
-        excludes = self.config.get("url_patterns", {}).get("exclude", [])
-        return not any(pattern in url for pattern in excludes)
+        return not any(pattern in url for pattern in self._exclude_patterns)
 
     def save_checkpoint(self) -> None:
         """Save progress checkpoint"""
@@ -264,7 +277,9 @@ class DocToSkillConverter:
                 checkpoint_data = json.load(f)
 
             self.visited_urls = set(checkpoint_data["visited_urls"])
-            self.pending_urls = deque(checkpoint_data["pending_urls"])
+            pending = checkpoint_data["pending_urls"]
+            self.pending_urls = deque(pending)
+            self._pending_set = set(pending)
             self.pages_scraped = checkpoint_data["pages_scraped"]
 
             logger.info("✅ Resumed from checkpoint")
@@ -337,11 +352,13 @@ class DocToSkillConverter:
 
         # Extract links from entire page (always, even if main content not found).
         # This allows discovery of navigation links outside the main content area.
+        seen_links: set[str] = set()
         for link in soup.find_all("a", href=True):
             href = urljoin(url, link["href"])
             # Strip anchor fragments to avoid treating #anchors as separate pages
             href = href.split("#")[0]
-            if self.is_valid_url(href) and href not in page["links"]:
+            if href not in seen_links and self.is_valid_url(href):
+                seen_links.add(href)
                 page["links"].append(href)
 
         # Find main content using shared fallback logic
@@ -413,8 +430,6 @@ class DocToSkillConverter:
             Only .md links are extracted to avoid client-side rendered HTML pages.
             Anchor fragments (#section) are stripped from links.
         """
-        import re
-
         # Detect if content is actually HTML (some .md URLs return HTML)
         if content.strip().startswith("<!DOCTYPE") or content.strip().startswith("<html"):
             return self._extract_html_as_markdown(content, url)
@@ -649,8 +664,7 @@ class DocToSkillConverter:
 
     def clean_text(self, text: str) -> str:
         """Clean text content"""
-        text = re.sub(r"\s+", " ", text)
-        return text.strip()
+        return _WHITESPACE_RE.sub(" ", text).strip()
 
     def save_page(self, page: dict[str, Any]) -> None:
         """Save page data (skip pages with empty content)"""
@@ -660,8 +674,8 @@ class DocToSkillConverter:
             return
 
         url_hash = hashlib.md5(page["url"].encode()).hexdigest()[:10]
-        safe_title = re.sub(r"[^\w\s-]", "", page["title"])[:50]
-        safe_title = re.sub(r"[-\s]+", "_", safe_title)
+        safe_title = _SAFE_TITLE_RE.sub("", page["title"])[:50]
+        safe_title = _SAFE_TITLE_SEP_RE.sub("_", safe_title)
 
         filename = f"{safe_title}_{url_hash}.json"
         filepath = os.path.join(self.data_dir, "pages", filename)
@@ -695,27 +709,19 @@ class DocToSkillConverter:
                 soup = BeautifulSoup(response.content, "html.parser")
                 page = self.extract_content(soup, url)
 
-            # Thread-safe operations (lock required)
-            if self.workers > 1:
-                with self.lock:
-                    logger.info("  %s", url)
-                    self.save_page(page)
-                    self.pages.append(page)
-
-                    # Add new URLs
-                    for link in page["links"]:
-                        if link not in self.visited_urls and link not in self.pending_urls:
-                            self.pending_urls.append(link)
-            else:
-                # Single-threaded mode (no lock needed)
+            # Store results (thread-safe when workers > 1)
+            def _store_results():
                 logger.info("  %s", url)
                 self.save_page(page)
                 self.pages.append(page)
-
-                # Add new URLs
                 for link in page["links"]:
-                    if link not in self.visited_urls and link not in self.pending_urls:
-                        self.pending_urls.append(link)
+                    self._enqueue_url(link)
+
+            if self.workers > 1:
+                with self.lock:
+                    _store_results()
+            else:
+                _store_results()
 
             # Rate limiting
             rate_limit = self.config.get("rate_limit", DEFAULT_RATE_LIMIT)
@@ -766,8 +772,7 @@ class DocToSkillConverter:
 
                 # Add new URLs
                 for link in page["links"]:
-                    if link not in self.visited_urls and link not in self.pending_urls:
-                        self.pending_urls.append(link)
+                    self._enqueue_url(link)
 
                 # Rate limiting
                 rate_limit = self.config.get("rate_limit", DEFAULT_RATE_LIMIT)
@@ -924,8 +929,8 @@ class DocToSkillConverter:
 
                     # Filter URLs based on url_patterns config
                     for url in md_urls:
-                        if self.is_valid_url(url) and url not in self.visited_urls:
-                            self.pending_urls.append(url)
+                        if self.is_valid_url(url):
+                            self._enqueue_url(url)
 
                     logger.info(
                         "  📋 %d URLs added to crawl queue after filtering",
@@ -1010,8 +1015,8 @@ class DocToSkillConverter:
 
             # Filter URLs based on url_patterns config
             for url in md_urls:
-                if self.is_valid_url(url) and url not in self.visited_urls:
-                    self.pending_urls.append(url)
+                if self.is_valid_url(url):
+                    self._enqueue_url(url)
 
             logger.info(
                 "  📋 %d URLs added to crawl queue after filtering",
@@ -1115,8 +1120,8 @@ class DocToSkillConverter:
                         for link in soup.find_all("a", href=True):
                             href = urljoin(url, link["href"])
                             href = href.split("#")[0]
-                            if self.is_valid_url(href) and href not in self.visited_urls:
-                                self.pending_urls.append(href)
+                            if self.is_valid_url(href):
+                                self._enqueue_url(href)
                     except Exception as e:
                         # Failed to extract links in fast mode, continue anyway
                         logger.warning("⚠️  Warning: Could not extract links from %s: %s", url, e)
@@ -1299,8 +1304,8 @@ class DocToSkillConverter:
                                 for link in soup.find_all("a", href=True):
                                     href = urljoin(url, link["href"])
                                     href = href.split("#")[0]
-                                    if self.is_valid_url(href) and href not in self.visited_urls:
-                                        self.pending_urls.append(href)
+                                    if self.is_valid_url(href):
+                                        self._enqueue_url(href)
                             except Exception as e:
                                 logger.warning(
                                     "⚠️  Warning: Could not extract links from %s: %s", url, e
@@ -1313,7 +1318,12 @@ class DocToSkillConverter:
 
                 # Wait for batch to complete before continuing
                 if tasks:
-                    await asyncio.gather(*tasks, return_exceptions=True)
+                    results = await asyncio.gather(*tasks, return_exceptions=True)
+                    for result in results:
+                        if isinstance(result, Exception):
+                            logger.error(
+                                "  ✗ Async task failed: %s: %s", type(result).__name__, result
+                            )
                     tasks = []
                     self.pages_scraped = len(self.visited_urls)
 
@@ -1331,7 +1341,10 @@ class DocToSkillConverter:
 
             # Wait for any remaining tasks
             if tasks:
-                await asyncio.gather(*tasks, return_exceptions=True)
+                results = await asyncio.gather(*tasks, return_exceptions=True)
+                for result in results:
+                    if isinstance(result, Exception):
+                        logger.error("  ✗ Async task failed: %s: %s", type(result).__name__, result)
 
         if self.dry_run:
             logger.info("\n✅ Dry run complete: would scrape ~%d pages", len(self.visited_urls))
@@ -1356,8 +1369,11 @@ class DocToSkillConverter:
             "pages": [{"title": p["title"], "url": p["url"]} for p in self.pages],
         }
 
-        with open(f"{self.data_dir}/summary.json", "w", encoding="utf-8") as f:
-            json.dump(summary, f, indent=2, ensure_ascii=False)
+        try:
+            with open(f"{self.data_dir}/summary.json", "w", encoding="utf-8") as f:
+                json.dump(summary, f, indent=2, ensure_ascii=False)
+        except OSError as e:
+            logger.error("  ✗ Failed to save summary: %s", e)
 
     def load_scraped_data(self) -> list[dict[str, Any]]:
         """Load previously scraped data"""
@@ -1395,6 +1411,11 @@ class DocToSkillConverter:
         categories: dict[str, list[dict[str, Any]]] = {cat: [] for cat in category_defs}
         categories["other"] = []
 
+        # Pre-lowercase keywords once instead of per-page per-keyword
+        lowered_defs = {
+            cat: [kw.lower() for kw in keywords] for cat, keywords in category_defs.items()
+        }
+
         for page in pages:
             url = page["url"].lower()
             title = page["title"].lower()
@@ -1404,11 +1425,10 @@ class DocToSkillConverter:
 
             categorized = False
 
-            # Match against keywords
-            for cat, keywords in category_defs.items():
+            # Match against pre-lowercased keywords
+            for cat, keywords in lowered_defs.items():
                 score = 0
                 for keyword in keywords:
-                    keyword = keyword.lower()
                     if keyword in url:
                         score += 3
                     if keyword in title:
@@ -1450,15 +1470,12 @@ class DocToSkillConverter:
             if count >= 3:  # At least 3 pages
                 categories[seg] = [seg]
 
-        # Add common defaults
-        if "tutorial" not in categories and any(
-            "tutorial" in url for url in [p["url"] for p in pages]
-        ):
+        # Add common defaults; each guard checks the same key it assigns. URL list is built once.
+        all_urls = [p["url"] for p in pages]
+        if "tutorials" not in categories and any("tutorial" in url for url in all_urls):
             categories["tutorials"] = ["tutorial", "guide", "getting-started"]
 
-        if "api" not in categories and any(
-            "api" in url or "reference" in url for url in [p["url"] for p in pages]
-        ):
+        if "api" not in categories and any("api" in url or "reference" in url for url in all_urls):
             categories["api"] = ["api", "reference", "class"]
 
         return categories
diff --git a/src/skill_seekers/cli/github_scraper.py b/src/skill_seekers/cli/github_scraper.py
index a3763c6..14098c9 100644
--- a/src/skill_seekers/cli/github_scraper.py
+++ b/src/skill_seekers/cli/github_scraper.py
@@ -15,6 +15,7 @@ Usage:
 """
 
 import argparse
+import fnmatch
 import json
 import logging
 import os
@@ -664,11 +665,13 @@ class GitHubScraper:
     def _extract_file_tree_github(self):
         """Extract file tree from GitHub API (rate-limited)."""
         try:
-            contents = self.repo.get_contents("")
+            from collections import deque
+
+            contents = deque(self.repo.get_contents(""))
             file_tree = []
 
             while contents:
-                file_content = contents.pop(0)
+                file_content = contents.popleft()
 
                 file_info = {
                     "path": file_content.path,
@@ -741,11 +744,10 @@ class GitHubScraper:
                 continue
 
             # Check if file matches patterns (if specified)
-            if self.file_patterns:
-                import fnmatch
-
-                if not any(fnmatch.fnmatch(file_path, pattern) for pattern in self.file_patterns):
-                    continue
+            if self.file_patterns and not any(
+                fnmatch.fnmatch(file_path, pattern) for pattern in self.file_patterns
+            ):
+                continue
 
             # Analyze this file
             try: