Files
skill-seekers-reference/src/skill_seekers/cli/markdown_cleaner.py
Pablo Estevez 5ed767ff9a run ruff
2026-01-17 17:29:21 +00:00

133 lines
4.2 KiB
Python

#!/usr/bin/env python3
"""
Markdown Cleaner Utility
Removes HTML tags and bloat from markdown content while preserving structure.
Used to clean README files and other documentation for skill generation.
"""
import re
class MarkdownCleaner:
    """Clean HTML from markdown while preserving structure.

    The class is a plain namespace: every helper is a static method and no
    instance state is kept.
    """

    # HTML comments may span several lines, hence DOTALL.
    _HTML_COMMENT = re.compile(r"<!--.*?-->", re.DOTALL)
    # Markdown autolinks (<https://example.com>, <mailto:me@example.com>).
    # They look like tags to a naive pattern but carry real content.
    _AUTOLINK = re.compile(r"<((?:[A-Za-z][A-Za-z0-9+.-]*://|mailto:)[^\s<>]+)>")
    # A tag, closing tag, declaration (<!DOCTYPE ...>) or processing
    # instruction (<?xml ...?>). Requiring a letter right after the opening
    # bracket keeps prose such as "a < b and c > d" intact.
    _HTML_TAG = re.compile(r"<[/!?]?[A-Za-z][^>]*>")
    # Three or more consecutive line breaks (possibly with whitespace between).
    _BLANK_RUN = re.compile(r"\n\s*\n\s*\n+")
    # ATX heading, levels H1-H6.
    _HEADING = re.compile(r"^#{1,6}\s+")
    # Sentence-ending punctuation followed by any whitespace (space or newline).
    _SENTENCE_END = re.compile(r"[.!?](?=\s)")
    _FENCE = "```"

    @staticmethod
    def remove_html_tags(text: str) -> str:
        """
        Remove HTML tags while preserving text content.

        HTML comments are dropped entirely, autolinks are unwrapped to their
        bare URL, tags are removed while the text between them is kept, and
        runs of blank lines left behind are collapsed to a single blank line.

        Note: tags inside fenced code blocks are stripped as well; this
        function does not parse markdown structure.

        Args:
            text: Markdown text possibly containing HTML

        Returns:
            Cleaned markdown with HTML tags removed and surrounding
            whitespace stripped
        """
        # Comments first, so tags inside a comment vanish with it.
        text = MarkdownCleaner._HTML_COMMENT.sub("", text)
        # Unwrap autolinks before tag removal, otherwise the URL is lost.
        text = MarkdownCleaner._AUTOLINK.sub(r"\1", text)
        # Remove HTML tags but keep the content between them.
        text = MarkdownCleaner._HTML_TAG.sub("", text)
        # Collapse the empty lines created by the removals above.
        text = MarkdownCleaner._BLANK_RUN.sub("\n\n", text)
        return text.strip()

    @staticmethod
    def extract_first_section(text: str, max_chars: int = 500) -> str:
        """
        Extract first meaningful content, respecting markdown structure.

        Captures content including section headings up to max_chars.
        For short READMEs, includes everything. For longer ones, extracts
        intro + first few sections (e.g., installation, quick start).

        Fenced code blocks are never cut by the line scan: a block that has
        been opened is copied through to its closing fence even when that
        exceeds max_chars, and lines inside a block are never interpreted as
        headings. If the final sentence-level truncation lands inside a
        block, the fence is closed so the output stays valid markdown.

        Args:
            text: Full markdown text
            max_chars: Maximum characters to extract (soft limit, see above)

        Returns:
            First section content (cleaned, including headings)
        """
        # Remove HTML first; the result is already stripped.
        text = MarkdownCleaner.remove_html_tags(text)

        # If text is short, return it all.
        if len(text) <= max_chars:
            return text

        content_lines = []
        char_count = 0  # counts line contents only, not the line breaks
        section_count = 0
        in_code_block = False  # fence state, to avoid truncating mid-block

        for line in text.split("\n"):
            if MarkdownCleaner._is_fence_line(line):
                in_code_block = not in_code_block

            # A "# comment" inside a code block is code, not a heading.
            if not in_code_block and MarkdownCleaner._HEADING.match(line):
                section_count += 1
                # Keep the first 4 sections (title + 3 sections such as
                # Installation, Quick Start, Features), stop at the fifth.
                if section_count > 4:
                    break
                content_lines.append(line)
                char_count += len(line)
                # No length check here, so a heading is never the last line
                # merely because it crossed the limit.
                continue

            content_lines.append(line)
            char_count += len(line)
            # Stop once we have enough content, but never inside a code block.
            if char_count >= max_chars and not in_code_block:
                break

        result = "\n".join(content_lines).strip()

        # If the length limit was hit, trim to a sentence boundary. An
        # unterminated code block (still open here) is left untouched.
        if char_count >= max_chars and not in_code_block:
            result = MarkdownCleaner._truncate_at_sentence(result, max_chars)
            # The cut may fall inside a block that the loop kept whole.
            result = MarkdownCleaner._close_open_fence(result)
        return result

    @staticmethod
    def _is_fence_line(line: str) -> bool:
        """Return True if the line opens or closes a ``` fenced code block."""
        return line.strip().startswith(MarkdownCleaner._FENCE)

    @staticmethod
    def _close_open_fence(text: str) -> str:
        """Append a closing fence when text ends inside a fenced code block."""
        fence_lines = sum(
            1 for line in text.split("\n") if MarkdownCleaner._is_fence_line(line)
        )
        if fence_lines % 2:
            return text + "\n" + MarkdownCleaner._FENCE
        return text

    @staticmethod
    def _truncate_at_sentence(text: str, max_chars: int) -> str:
        """
        Truncate at last complete sentence before max_chars.

        A sentence boundary is '.', '!' or '?' followed by whitespace (a
        space or a line break). The boundary is only used when it keeps more
        than half of max_chars; otherwise the text is cut at the last space
        and "..." is appended.

        Args:
            text: Text to truncate
            max_chars: Maximum character count

        Returns:
            Truncated text ending at sentence boundary, or the text itself
            when it already fits
        """
        if len(text) <= max_chars:
            return text

        truncated = text[:max_chars]

        # Search one character past the cut so that punctuation sitting
        # exactly at the limit can still see the whitespace that follows it.
        last_sentence = -1
        for match in MarkdownCleaner._SENTENCE_END.finditer(text[: max_chars + 1]):
            last_sentence = match.start()

        if last_sentence > max_chars // 2:  # At least half the content
            return truncated[: last_sentence + 1]

        # Fall back to word boundary.
        last_space = truncated.rfind(" ")
        if last_space > 0:
            return truncated[:last_space] + "..."
        return truncated + "..."