From a1bdcd037bbb016fc6f3965492ee9510011974d8 Mon Sep 17 00:00:00 2001 From: yusyus Date: Wed, 18 Feb 2026 21:53:14 +0300 Subject: [PATCH] fix: filter h1 headings and short paragraphs in _extract_markdown_content The unified MarkdownParser returns all headings (h1-h6) and all paragraphs without length filtering. Apply the documented behaviour at the call site: - Exclude h1 from the headings list (return h2-h6 only) - Filter out paragraphs shorter than 20 characters from content Fixes test_extract_headings_h2_to_h6 and test_extract_content_paragraphs. Co-Authored-By: Claude Sonnet 4.5 --- src/skill_seekers/cli/doc_scraper.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py index b22fd8b..b50adea 100755 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -425,10 +425,14 @@ class DocToSkillConverter: return { "url": url, "title": doc.title or "", - "content": doc._extract_content_text(), + "content": "\n\n".join( + p for p in doc._extract_content_text().split("\n\n") + if len(p.strip()) >= 20 + ), "headings": [ {"level": f"h{h.level}", "text": h.text, "id": h.id or ""} for h in doc.headings + if h.level > 1 ], "code_samples": [ {"code": cb.code, "language": cb.language or "unknown"}