run ruff

2026-01-17 17:29:21 +00:00
parent c89f059712
commit 5ed767ff9a
144 changed files with 14142 additions and 16488 deletions
--- a/src/skill_seekers/cli/llms_txt_parser.py
+++ b/src/skill_seekers/cli/llms_txt_parser.py
@@ -1,10 +1,11 @@
 """ABOUTME: Parses llms.txt markdown content into structured page data"""
+
 """ABOUTME: Extracts titles, content, code samples, and headings from markdown"""

 import re
-from typing import List, Dict
 from urllib.parse import urljoin

+
 class LlmsTxtParser:
    """Parse llms.txt markdown content into page structures"""

@@ -12,7 +13,7 @@ class LlmsTxtParser:
        self.content = content
        self.base_url = base_url

-    def extract_urls(self) -> List[str]:
+    def extract_urls(self) -> list[str]:
        """
        Extract all URLs from the llms.txt content.

@@ -33,13 +34,13 @@ class LlmsTxtParser:
        urls = set()

        # Match markdown links: [text](url)
-        md_links = re.findall(r'\[([^\]]*)\]\(([^)]+)\)', self.content)
+        md_links = re.findall(r"\[([^\]]*)\]\(([^)]+)\)", self.content)
        for _, url in md_links:
-            if url.startswith('http'):
+            if url.startswith("http"):
                clean_url = self._clean_url(url)
                if clean_url:
                    urls.add(clean_url)
-            elif self.base_url and not url.startswith('#'):
+            elif self.base_url and not url.startswith("#"):
                clean_url = self._clean_url(urljoin(self.base_url, url))
                if clean_url:
                    urls.add(clean_url)
@@ -48,7 +49,7 @@ class LlmsTxtParser:
        bare_urls = re.findall(r'https?://[^\s\)\]<>"\']+', self.content)
        for url in bare_urls:
            # Clean trailing punctuation
-            url = url.rstrip('.,;:')
+            url = url.rstrip(".,;:")
            clean_url = self._clean_url(url)
            if clean_url:
                urls.add(clean_url)
@@ -79,16 +80,16 @@ class LlmsTxtParser:
        """
        # Skip URLs with path after anchor (e.g., #section/index.html.md)
        # These are malformed and return duplicate HTML content
-        if '#' in url:
-            anchor_pos = url.index('#')
-            after_anchor = url[anchor_pos + 1:]
+        if "#" in url:
+            anchor_pos = url.index("#")
+            after_anchor = url[anchor_pos + 1 :]
            # If there's a path separator after anchor, it's invalid
-            if '/' in after_anchor:
+            if "/" in after_anchor:
                # Extract the base URL without the malformed anchor
                return url[:anchor_pos]
        return url

-    def parse(self) -> List[Dict]:
+    def parse(self) -> list[dict]:
        """
        Parse markdown content into page structures.

@@ -98,55 +99,50 @@ class LlmsTxtParser:
        pages = []

        # Split by h1 headers (# Title)
-        sections = re.split(r'\n# ', self.content)
+        sections = re.split(r"\n# ", self.content)

        for section in sections:
            if not section.strip():
                continue

            # First line is title
-            lines = section.split('\n')
-            title = lines[0].strip('#').strip()
+            lines = section.split("\n")
+            title = lines[0].strip("#").strip()

            # Parse content
-            page = self._parse_section('\n'.join(lines[1:]), title)
+            page = self._parse_section("\n".join(lines[1:]), title)
            pages.append(page)

        return pages

-    def _parse_section(self, content: str, title: str) -> Dict:
+    def _parse_section(self, content: str, title: str) -> dict:
        """Parse a single section into page structure"""
        page = {
-            'title': title,
-            'content': '',
-            'code_samples': [],
-            'headings': [],
-            'url': f'llms-txt#{title.lower().replace(" ", "-")}',
-            'links': []
+            "title": title,
+            "content": "",
+            "code_samples": [],
+            "headings": [],
+            "url": f"llms-txt#{title.lower().replace(' ', '-')}",
+            "links": [],
        }

        # Extract code blocks
-        code_blocks = re.findall(r'```(\w+)?\n(.*?)```', content, re.DOTALL)
+        code_blocks = re.findall(r"```(\w+)?\n(.*?)```", content, re.DOTALL)
        for lang, code in code_blocks:
-            page['code_samples'].append({
-                'code': code.strip(),
-                'language': lang or 'unknown'
-            })
+            page["code_samples"].append({"code": code.strip(), "language": lang or "unknown"})

        # Extract h2/h3 headings
-        headings = re.findall(r'^(#{2,3})\s+(.+)$', content, re.MULTILINE)
+        headings = re.findall(r"^(#{2,3})\s+(.+)$", content, re.MULTILINE)
        for level_markers, text in headings:
-            page['headings'].append({
-                'level': f'h{len(level_markers)}',
-                'text': text.strip(),
-                'id': text.lower().replace(' ', '-')
-            })
+            page["headings"].append(
+                {"level": f"h{len(level_markers)}", "text": text.strip(), "id": text.lower().replace(" ", "-")}
+            )

        # Remove code blocks from content for plain text
-        content_no_code = re.sub(r'```.*?```', '', content, flags=re.DOTALL)
+        content_no_code = re.sub(r"```.*?```", "", content, flags=re.DOTALL)

        # Extract paragraphs
-        paragraphs = [p.strip() for p in content_no_code.split('\n\n') if len(p.strip()) > 20]
-        page['content'] = '\n\n'.join(paragraphs)
+        paragraphs = [p.strip() for p in content_no_code.split("\n\n") if len(p.strip()) > 20]
+        page["content"] = "\n\n".join(paragraphs)

        return page