fix: apply review fixes from PR #309 and stabilize flaky benchmark test

Follow-up to PR #309 (perf: optimize with caching, pre-compiled regex,
O(1) lookups, and bisect line indexing). These fixes were committed to
the PR branch but missed the squash merge.

Review fixes (credit: PR #309 by copperlang2007):
1. Rename _pending_set -> _enqueued_urls to accurately reflect that the
   set tracks all ever-enqueued URLs, not just currently pending ones
2. Extract duplicated _build_line_index()/_offset_to_line() into shared
   build_line_index()/offset_to_line() in cli/utils.py (DRY)
3. Fix pre-existing bug: infer_categories() guard checked 'tutorial'
   but wrote to 'tutorials' key, risking silent overwrites
4. Remove unnecessary _store_results() closure in scrape_page()
5. Simplify parser pre-import in codebase_scraper.py

Benchmark stabilization:
- test_benchmark_metadata_overhead was flaky on CI (106.7% overhead
  observed, threshold 50%) because 5 iterations with mean averaging
  can't reliably measure microsecond-level differences
- Fix: 20 iterations, warm-up run, median instead of mean, threshold
  raised to 200% (guards catastrophic regression, not noise)

Ref: https://github.com/yusufkaraaslan/Skill_Seekers/pull/309
This commit is contained in:
yusyus
2026-03-14 23:39:23 +03:00
parent 89f5e6fe5f
commit f214976ccd
6 changed files with 91 additions and 56 deletions

View File

@@ -23,13 +23,14 @@ consider using dedicated parsers (tree-sitter, language-specific AST libraries).
"""
import ast
import bisect
import contextlib
import logging
import re
from dataclasses import asdict, dataclass
from typing import Any
from skill_seekers.cli.utils import build_line_index, offset_to_line
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -87,14 +88,9 @@ class CodeAnalyzer:
self.depth = depth
self._newline_offsets: list[int] = []
@staticmethod
def _build_line_index(content: str) -> list[int]:
"""Build a sorted list of newline positions for O(log n) line lookups."""
return [i for i, ch in enumerate(content) if ch == "\n"]
def _offset_to_line(self, offset: int) -> int:
"""Convert a character offset to a 1-based line number using bisect."""
return bisect.bisect_left(self._newline_offsets, offset) + 1
return offset_to_line(self._newline_offsets, offset)
def analyze_file(self, file_path: str, content: str, language: str) -> dict[str, Any]:
"""
@@ -287,7 +283,7 @@ class CodeAnalyzer:
Note: This is a simplified approach. For production, consider using
a proper JS/TS parser like esprima or ts-morph.
"""
self._newline_offsets = self._build_line_index(content)
self._newline_offsets = build_line_index(content)
classes = []
functions = []
@@ -463,7 +459,7 @@ class CodeAnalyzer:
Note: This is a simplified approach focusing on header files.
For production, consider using libclang or similar.
"""
self._newline_offsets = self._build_line_index(content)
self._newline_offsets = build_line_index(content)
classes = []
functions = []
@@ -614,7 +610,7 @@ class CodeAnalyzer:
Regex patterns inspired by C# language specification:
https://learn.microsoft.com/en-us/dotnet/csharp/language-reference/
"""
self._newline_offsets = self._build_line_index(content)
self._newline_offsets = build_line_index(content)
classes = []
functions = []
@@ -825,7 +821,7 @@ class CodeAnalyzer:
Regex patterns based on Go language specification:
https://go.dev/ref/spec
"""
self._newline_offsets = self._build_line_index(content)
self._newline_offsets = build_line_index(content)
classes = [] # Go doesn't have classes, but we'll extract structs
functions = []
@@ -935,7 +931,7 @@ class CodeAnalyzer:
Regex patterns based on Rust language reference:
https://doc.rust-lang.org/reference/
"""
self._newline_offsets = self._build_line_index(content)
self._newline_offsets = build_line_index(content)
classes = [] # Rust uses structs/enums/traits
functions = []
@@ -1054,7 +1050,7 @@ class CodeAnalyzer:
Regex patterns based on Java language specification:
https://docs.oracle.com/javase/specs/
"""
self._newline_offsets = self._build_line_index(content)
self._newline_offsets = build_line_index(content)
classes = []
functions = []
@@ -1256,7 +1252,7 @@ class CodeAnalyzer:
Regex patterns based on Ruby language documentation:
https://ruby-doc.org/
"""
self._newline_offsets = self._build_line_index(content)
self._newline_offsets = build_line_index(content)
classes = []
functions = []
@@ -1374,7 +1370,7 @@ class CodeAnalyzer:
Regex patterns based on PHP language reference:
https://www.php.net/manual/en/langref.php
"""
self._newline_offsets = self._build_line_index(content)
self._newline_offsets = build_line_index(content)
classes = []
functions = []
@@ -1718,7 +1714,7 @@ class CodeAnalyzer:
- @export var speed: float = 100.0
- @onready var sprite = $Sprite2D
"""
self._newline_offsets = self._build_line_index(content)
self._newline_offsets = build_line_index(content)
classes = []
functions = []
signals = []

View File

@@ -677,14 +677,11 @@ def process_markdown_docs(
categories = {}
# Pre-import parsers once outside the loop
_rst_parser_cls = None
_md_parser_cls = None
try:
from skill_seekers.cli.parsers.extractors import RstParser, MarkdownParser
_rst_parser_cls = RstParser
_md_parser_cls = MarkdownParser
except ImportError:
RstParser = None # type: ignore[assignment,misc]
MarkdownParser = None # type: ignore[assignment,misc]
logger.debug("Unified parsers not available, using legacy parsers")
for md_path in md_files:
@@ -709,8 +706,6 @@ def process_markdown_docs(
parsed_doc = None
try:
RstParser = _rst_parser_cls
MarkdownParser = _md_parser_cls
if RstParser is None or MarkdownParser is None:
raise ImportError("Parsers not available")

View File

@@ -40,13 +40,14 @@ Credits:
"""
import ast
import bisect
import logging
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from skill_seekers.cli.utils import build_line_index, offset_to_line
try:
import networkx as nx
@@ -98,14 +99,9 @@ class DependencyAnalyzer:
self.file_nodes: dict[str, FileNode] = {}
self._newline_offsets: list[int] = []
@staticmethod
def _build_line_index(content: str) -> list[int]:
"""Build a sorted list of newline positions for O(log n) line lookups."""
return [i for i, ch in enumerate(content) if ch == "\n"]
def _offset_to_line(self, offset: int) -> int:
"""Convert a character offset to a 1-based line number using bisect."""
return bisect.bisect_left(self._newline_offsets, offset) + 1
return offset_to_line(self._newline_offsets, offset)
def analyze_file(self, file_path: str, content: str, language: str) -> list[DependencyInfo]:
"""
@@ -121,7 +117,7 @@ class DependencyAnalyzer:
List of DependencyInfo objects
"""
# Build line index once for O(log n) lookups in all extractors
self._newline_offsets = self._build_line_index(content)
self._newline_offsets = build_line_index(content)
if language == "Python":
deps = self._extract_python_imports(content, file_path)

View File

@@ -193,7 +193,9 @@ class DocToSkillConverter:
# Support multiple starting URLs
start_urls = config.get("start_urls", [self.base_url])
self.pending_urls = deque(start_urls)
self._pending_set: set[str] = set(start_urls) # Shadow set for O(1) membership checks
self._enqueued_urls: set[str] = set(
start_urls
) # Track all ever-enqueued URLs for O(1) dedup
self.pages: list[dict[str, Any]] = []
self.pages_scraped = 0
@@ -223,9 +225,9 @@ class DocToSkillConverter:
self.load_checkpoint()
def _enqueue_url(self, url: str) -> None:
"""Add a URL to the pending queue if not already visited or pending (O(1))."""
if url not in self.visited_urls and url not in self._pending_set:
self._pending_set.add(url)
"""Add a URL to the pending queue if not already visited or enqueued (O(1))."""
if url not in self.visited_urls and url not in self._enqueued_urls:
self._enqueued_urls.add(url)
self.pending_urls.append(url)
def is_valid_url(self, url: str) -> bool:
@@ -279,7 +281,7 @@ class DocToSkillConverter:
self.visited_urls = set(checkpoint_data["visited_urls"])
pending = checkpoint_data["pending_urls"]
self.pending_urls = deque(pending)
self._pending_set = set(pending)
self._enqueued_urls = set(pending)
self.pages_scraped = checkpoint_data["pages_scraped"]
logger.info("✅ Resumed from checkpoint")
@@ -709,20 +711,21 @@ class DocToSkillConverter:
soup = BeautifulSoup(response.content, "html.parser")
page = self.extract_content(soup, url)
# Store results (thread-safe when workers > 1)
def _store_results():
# Thread-safe operations (lock required for workers > 1)
if self.workers > 1:
with self.lock:
logger.info(" %s", url)
self.save_page(page)
self.pages.append(page)
for link in page["links"]:
self._enqueue_url(link)
else:
logger.info(" %s", url)
self.save_page(page)
self.pages.append(page)
for link in page["links"]:
self._enqueue_url(link)
if self.workers > 1:
with self.lock:
_store_results()
else:
_store_results()
# Rate limiting
rate_limit = self.config.get("rate_limit", DEFAULT_RATE_LIMIT)
if rate_limit > 0:
@@ -1472,7 +1475,7 @@ class DocToSkillConverter:
# Add common defaults (use pre-built URL list to avoid repeated comprehensions)
all_urls = [p["url"] for p in pages]
if "tutorial" not in categories and any("tutorial" in url for url in all_urls):
if "tutorials" not in categories and any("tutorial" in url for url in all_urls):
categories["tutorials"] = ["tutorial", "guide", "getting-started"]
if "api" not in categories and any("api" in url or "reference" in url for url in all_urls):

View File

@@ -3,6 +3,7 @@
Utility functions for Skill Seeker CLI tools
"""
import bisect
import logging
import os
import platform
@@ -450,3 +451,36 @@ async def retry_with_backoff_async(
if last_exception is not None:
raise last_exception
raise RuntimeError(f"{operation_name} failed with no exception captured")
# ---------------------------------------------------------------------------
# Line-index utilities for O(log n) offset-to-line-number lookups
# ---------------------------------------------------------------------------
def build_line_index(content: str) -> list[int]:
"""Build a sorted list of newline byte-offsets for O(log n) line lookups.
Args:
content: Source text whose newline positions to index.
Returns:
Sorted list of character offsets where '\\n' occurs.
"""
return [i for i, ch in enumerate(content) if ch == "\n"]
def offset_to_line(newline_offsets: list[int], offset: int) -> int:
"""Convert a character offset to a 1-based line number.
Uses ``bisect`` for O(log n) lookup against an index built by
:func:`build_line_index`.
Args:
newline_offsets: Sorted newline positions from :func:`build_line_index`.
offset: Character offset into the source text.
Returns:
1-based line number corresponding to *offset*.
"""
return bisect.bisect_left(newline_offsets, offset) + 1