From a1bdcd037bbb016fc6f3965492ee9510011974d8 Mon Sep 17 00:00:00 2001
From: yusyus <yusufkaraaslan.yk@pm.me>
Date: Wed, 18 Feb 2026 21:53:14 +0300
Subject: [PATCH] fix: filter h1 headings and short paragraphs in
 _extract_markdown_content

The unified MarkdownParser returns all headings (h1-h6) and all paragraphs
without length filtering. Apply the documented behaviour at the call site:
- Exclude h1 from the headings list (return h2-h6 only)
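- Filter out paragraphs from content whose length, after stripping
  surrounding whitespace, is under 20 characters (paragraphs are the
  chunks of the content text separated by blank lines)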

Fixes test_extract_headings_h2_to_h6 and test_extract_content_paragraphs.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 src/skill_seekers/cli/doc_scraper.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py
index b22fd8b..b50adea 100755
--- a/src/skill_seekers/cli/doc_scraper.py
+++ b/src/skill_seekers/cli/doc_scraper.py
@@ -425,10 +425,14 @@ class DocToSkillConverter:
                 return {
                     "url": url,
                     "title": doc.title or "",
-                    "content": doc._extract_content_text(),
+                    "content": "\n\n".join(
+                        p for p in doc._extract_content_text().split("\n\n")
+                        if len(p.strip()) >= 20
+                    ),
                     "headings": [
                         {"level": f"h{h.level}", "text": h.text, "id": h.id or ""}
                         for h in doc.headings
+                        if h.level > 1
                     ],
                     "code_samples": [
                         {"code": cb.code, "language": cb.language or "unknown"}