From 04de96f2f58e26244ebc2aedaa929075af40a3b0 Mon Sep 17 00:00:00 2001 From: yusyus Date: Sun, 11 Jan 2026 14:01:23 +0300 Subject: [PATCH] fix: Add empty list checks and enhance docstrings (PR #243 review fixes) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two critical improvements from PR #243 code review: ## Fix 1: Empty List Edge Case Handling Added early return checks to prevent creating empty index files: **Files Modified:** - src/skill_seekers/cli/unified_skill_builder.py **Changes:** - _generate_docs_references: Skip if docs_list empty - _generate_github_references: Skip if github_list empty - _generate_pdf_references: Skip if pdf_list empty **Impact:** Prevents "Combined from 0 sources" index files which look odd. ## Fix 2: Enhanced Method Docstrings Added comprehensive parameter types and return value documentation: **Files Modified:** - src/skill_seekers/cli/llms_txt_parser.py - extract_urls: Added detailed examples and behavior notes - _clean_url: Added malformed URL pattern examples - src/skill_seekers/cli/doc_scraper.py - _extract_markdown_content: Full return dict structure documented - _extract_html_as_markdown: Extraction strategy and fallback behavior **Impact:** Improved developer experience with detailed API documentation. 
## Testing All tests passing: - ✅ 32/32 PR #243 tests (markdown parsing + multi-source) - ✅ 975/975 core tests - 159 skipped (optional dependencies) - 4 failed (missing anthropic - expected) Co-authored-by: Code Review --- src/skill_seekers/cli/doc_scraper.py | 56 ++++++++++++++++--- src/skill_seekers/cli/llms_txt_parser.py | 29 +++++++++- .../cli/unified_skill_builder.py | 12 ++++ 3 files changed, 87 insertions(+), 10 deletions(-) diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py index 1e52181..021738b 100755 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -350,14 +350,34 @@ class DocToSkillConverter: return page def _extract_markdown_content(self, content: str, url: str) -> Dict[str, Any]: - """Extract content from a Markdown file. + """Extract structured content from a Markdown file. + + Parses markdown files from llms.txt URLs to extract: + - Title from first h1 heading + - Headings (h2-h6, excluding h1) + - Code blocks with language detection + - Internal .md links for BFS crawling + - Content paragraphs (>20 chars) + + Auto-detects HTML content and falls back to _extract_html_as_markdown. Args: - content: Raw markdown content (or HTML if server returned HTML) - url: Source URL + content: Raw markdown content string (or HTML if server returned HTML) + url: Source URL for resolving relative links Returns: - Page dict with title, content, code_samples, headings, links + Dict with keys: + - url: str - Source URL + - title: str - Extracted from first # heading + - content: str - Paragraphs joined with double newlines + - headings: List[Dict] - {'level': 'h2', 'text': str, 'id': str} + - code_samples: List[Dict] - {'code': str, 'language': str} + - links: List[str] - Absolute URLs to other .md files + - patterns: List - Empty (reserved for future use) + + Note: + Only .md links are extracted to avoid client-side rendered HTML pages. + Anchor fragments (#section) are stripped from links. 
""" import re @@ -434,12 +454,34 @@ class DocToSkillConverter: def _extract_html_as_markdown(self, html_content: str, url: str) -> Dict[str, Any]: """Extract content from HTML and convert to markdown-like structure. + Fallback method when .md URL returns HTML content instead of markdown. + Uses BeautifulSoup to extract structured data from HTML elements. + + Extraction strategy: + 1. Title from tag + 2. Main content from <main>, <article>, [role="main"], or <body> + 3. Headings (h1-h6) with text and id attributes + 4. Code blocks from <pre><code> or <pre> tags + 5. Text content from paragraphs + Args: - html_content: Raw HTML content - url: Source URL + html_content: Raw HTML content string + url: Source URL (for reference in result dict) Returns: - Page dict with title, content, code_samples, headings, links + Dict with keys: + - url: str - Source URL + - title: str - From <title> tag, cleaned + - content: str - Text content from main area + - headings: List[Dict] - {'level': 'h2', 'text': str, 'id': str} + - code_samples: List[Dict] - {'code': str, 'language': str} + - links: List - Empty (HTML links not extracted to avoid client-side routes) + - patterns: List - Empty (reserved for future use) + + Note: + Prefers <main> or <article> tags for content area. + Falls back to <body> if no semantic content container found. + Language detection uses detect_language() method. """ page = { 'url': url, diff --git a/src/skill_seekers/cli/llms_txt_parser.py b/src/skill_seekers/cli/llms_txt_parser.py index 2e143bf..ae11410 100644 --- a/src/skill_seekers/cli/llms_txt_parser.py +++ b/src/skill_seekers/cli/llms_txt_parser.py @@ -16,8 +16,19 @@ class LlmsTxtParser: """ Extract all URLs from the llms.txt content. + Supports both markdown-style links [text](url) and bare URLs. + Resolves relative URLs using base_url if provided. + Filters out malformed URLs with invalid anchor patterns. 
+ Returns: - List of unique URLs found in the content + List of unique, cleaned URLs found in the content. + Returns empty list if no valid URLs found. + + Note: + - Markdown links: [Getting Started](./docs/guide.md) + - Bare URLs: https://example.com/api.md + - Relative paths resolved with base_url + - Invalid anchors (#section/path.md) are stripped """ urls = set() @@ -48,11 +59,23 @@ class LlmsTxtParser: """ Clean and validate URL, removing invalid anchor patterns. + Detects and strips malformed anchors that contain path separators. + Valid: https://example.com/page.md#section + Invalid: https://example.com/page#section/index.html.md + Args: - url: URL to clean + url: URL to clean (absolute or relative) Returns: - Cleaned URL or empty string if invalid + Cleaned URL with malformed anchors stripped. + Returns base URL if anchor contains '/' (malformed). + Returns original URL if anchor is valid or no anchor present. + + Example: + >>> parser._clean_url("https://ex.com/page#sec/path.md") + 'https://ex.com/page' + >>> parser._clean_url("https://ex.com/page.md#section") + 'https://ex.com/page.md#section' """ # Skip URLs with path after anchor (e.g., #section/index.html.md) # These are malformed and return duplicate HTML content diff --git a/src/skill_seekers/cli/unified_skill_builder.py b/src/skill_seekers/cli/unified_skill_builder.py index a80f86d..ef6437c 100644 --- a/src/skill_seekers/cli/unified_skill_builder.py +++ b/src/skill_seekers/cli/unified_skill_builder.py @@ -287,6 +287,10 @@ This skill combines knowledge from multiple sources: def _generate_docs_references(self, docs_list: List[Dict]): """Generate references from multiple documentation sources.""" + # Skip if no documentation sources + if not docs_list: + return + docs_dir = os.path.join(self.skill_dir, 'references', 'documentation') os.makedirs(docs_dir, exist_ok=True) @@ -347,6 +351,10 @@ This skill combines knowledge from multiple sources: def _generate_github_references(self, github_list: 
List[Dict]): """Generate references from multiple GitHub sources.""" + # Skip if no GitHub sources + if not github_list: + return + github_dir = os.path.join(self.skill_dir, 'references', 'github') os.makedirs(github_dir, exist_ok=True) @@ -429,6 +437,10 @@ This skill combines knowledge from multiple sources: def _generate_pdf_references(self, pdf_list: List[Dict]): """Generate references from PDF sources.""" + # Skip if no PDF sources + if not pdf_list: + return + pdf_dir = os.path.join(self.skill_dir, 'references', 'pdf') os.makedirs(pdf_dir, exist_ok=True)