#!/usr/bin/env python3
"""
Markdown Cleaner Utility

Removes HTML tags and bloat from markdown content while preserving structure.
Used to clean README files and other documentation for skill generation.
"""

import re


class MarkdownCleaner:
    """Clean HTML from markdown while preserving structure."""

    @staticmethod
    def remove_html_tags(text: str) -> str:
        """
        Remove HTML comments and tags while preserving text content.

        Args:
            text: Markdown text possibly containing HTML

        Returns:
            Cleaned markdown with HTML comments and tags removed, runs of
            blank lines collapsed to a single blank line, and surrounding
            whitespace stripped
        """
        # Remove HTML comments as a unit (non-greedy, may span lines). This
        # must run before the generic tag pass: a comment body containing
        # ">" would otherwise be cut at that character and its remainder
        # (including the closing "-->") would leak into the output.
        text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)

        # Remove HTML tags but keep the content between them
        text = re.sub(r"<[^>]+>", "", text)

        # Collapse the runs of empty lines left behind by HTML removal
        text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)

        return text.strip()

    @staticmethod
    def extract_first_section(text: str, max_chars: int = 500) -> str:
        """
        Extract first meaningful content, respecting markdown structure.

        Captures content including section headings up to max_chars.
        For short READMEs, includes everything. For longer ones, extracts
        intro + first few sections (e.g., installation, quick start).

        Lines inside a fenced code block (```) are never treated as
        headings, and collection never stops while a fence is open.

        NOTE: the final sentence-level truncation works on raw characters,
        so when a fenced block pushed the collected text past max_chars the
        cut can still land inside that block.

        Args:
            text: Full markdown text
            max_chars: Maximum characters to extract

        Returns:
            First section content (cleaned, including headings)
        """
        max_sections = 4  # title + 3 sections (Installation, Quick Start, ...)

        # Remove HTML first
        text = MarkdownCleaner.remove_html_tags(text)

        # If text is short, return it all
        if len(text) <= max_chars:
            return text.strip()

        # For longer text, extract smartly
        content_lines = []
        char_count = 0  # sum of line lengths; newlines are not counted
        section_count = 0
        in_code_block = False  # fence state, to avoid stopping mid-block

        for line in text.split("\n"):
            # A fence line toggles the code-block state
            if line.strip().startswith("```"):
                in_code_block = not in_code_block

            # Headings (H1-H6) only count outside code fences; inside a
            # fence a leading "#" is a shell/Python comment, not a section.
            is_heading = (
                not in_code_block
                and re.match(r"^#{1,6}\s+", line) is not None
            )

            if is_heading:
                section_count += 1
                if section_count > max_sections:
                    # Enough sections collected
                    break
                content_lines.append(line)
                char_count += len(line)
                continue

            # Regular content (including everything inside code blocks)
            content_lines.append(line)
            char_count += len(line)

            # Stop if we have enough content (but not if in code block)
            if char_count >= max_chars and not in_code_block:
                break

        result = "\n".join(content_lines).strip()

        # If we hit the size limit, end on a sentence boundary. Skipped when
        # the text ended inside an unclosed code block.
        if char_count >= max_chars and not in_code_block:
            result = MarkdownCleaner._truncate_at_sentence(result, max_chars)

        return result

    @staticmethod
    def _truncate_at_sentence(text: str, max_chars: int) -> str:
        """
        Truncate at last complete sentence before max_chars.

        Args:
            text: Text to truncate
            max_chars: Maximum character count

        Returns:
            Text unchanged if it already fits. Otherwise the text cut at the
            last sentence end (". ", "! " or "? ") when that lies past the
            halfway point of the limit; failing that, cut at the last space
            (or at max_chars when there is none) with "..." appended.
        """
        if len(text) <= max_chars:
            return text

        truncated = text[:max_chars]

        # Look for the last period, exclamation, or question mark that is
        # followed by a space
        last_sentence = max(
            truncated.rfind(". "),
            truncated.rfind("! "),
            truncated.rfind("? "),
        )

        # Only accept the sentence cut if it keeps at least half the content
        if last_sentence > max_chars // 2:
            return truncated[: last_sentence + 1]

        # Fall back to word boundary
        last_space = truncated.rfind(" ")
        if last_space > 0:
            return truncated[:last_space] + "..."

        return truncated + "..."