fix(security): Address remaining scanning alerts

Tighten the remaining high-signal security findings by switching the todo example to a standard Express rate limiter, removing sensitive metadata from boilerplate logging, and replacing fragile HTML tag filtering with parser-based conversion. Co-Authored-By: Claude <noreply@anthropic.com>
2026-03-18 18:15:28 +01:00
parent 6114c38cda
commit 344854e9e5
7 changed files with 160 additions and 66 deletions
--- a/tools/scripts/convert_html_to_markdown.py
+++ b/tools/scripts/convert_html_to_markdown.py
@@ -11,10 +11,99 @@ import re
 import sys
 import urllib.request
 import urllib.error
+from html import unescape
+from html.parser import HTMLParser
 from pathlib import Path
 from typing import Dict, Optional, Tuple
 from urllib.parse import urlparse, urljoin

+
+class MarkdownHTMLParser(HTMLParser):
+    """Convert a constrained subset of HTML into markdown without regex tag stripping."""
+
+    def __init__(self) -> None:
+        super().__init__(convert_charrefs=True)
+        self._parts: list[str] = []
+        self._ignored_tag: Optional[str] = None
+        self._ignored_depth = 0
+        self._current_link: Optional[str] = None
+        self._list_depth = 0
+        self._in_pre = False
+
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, Optional[str]]]) -> None:
+        if self._ignored_tag:
+            if tag == self._ignored_tag:
+                self._ignored_depth += 1
+            return
+
+        if tag in {"script", "style"}:
+            self._ignored_tag = tag
+            self._ignored_depth = 1
+            return
+
+        attrs_dict = dict(attrs)
+
+        if tag in {"article", "main", "div", "section"}:
+            self._append("\n")
+        elif tag == "br":
+            self._append("\n")
+        elif tag == "p":
+            self._append("\n\n")
+        elif tag in {"h1", "h2", "h3"}:
+            prefix = {"h1": "# ", "h2": "## ", "h3": "### "}[tag]
+            self._append(f"\n\n{prefix}")
+        elif tag in {"ul", "ol"}:
+            self._list_depth += 1
+            self._append("\n")
+        elif tag == "li":
+            indent = "  " * max(0, self._list_depth - 1)
+            self._append(f"\n{indent}- ")
+        elif tag == "a":
+            self._current_link = attrs_dict.get("href")
+            self._append("[")
+        elif tag == "pre":
+            self._in_pre = True
+            self._append("\n\n```\n")
+        elif tag == "code" and not self._in_pre:
+            self._append("`")
+
+    def handle_endtag(self, tag: str) -> None:
+        if self._ignored_tag:
+            if tag == self._ignored_tag:
+                self._ignored_depth -= 1
+                if self._ignored_depth == 0:
+                    self._ignored_tag = None
+            return
+
+        if tag in {"h1", "h2", "h3", "p"}:
+            self._append("\n")
+        elif tag in {"ul", "ol"}:
+            self._list_depth = max(0, self._list_depth - 1)
+            self._append("\n")
+        elif tag == "a":
+            href = self._current_link or ""
+            self._append(f"]({href})")
+            self._current_link = None
+        elif tag == "pre":
+            self._in_pre = False
+            self._append("\n```\n")
+        elif tag == "code" and not self._in_pre:
+            self._append("`")
+
+    def handle_data(self, data: str) -> None:
+        if self._ignored_tag or not data:
+            return
+        self._append(unescape(data))
+
+    def get_markdown(self) -> str:
+        markdown = "".join(self._parts)
+        markdown = re.sub(r"\n{3,}", "\n\n", markdown)
+        return markdown.strip()
+
+    def _append(self, text: str) -> None:
+        if text:
+            self._parts.append(text)
+
 def parse_frontmatter(content: str) -> Optional[Dict]:
    """Parse YAML frontmatter."""
    fm_match = re.search(r'^---\s*\n(.*?)\n---', content, re.DOTALL)
@@ -128,37 +217,10 @@ def extract_markdown_from_html(html_content: str) -> Optional[str]:

 def convert_html_to_markdown(html: str) -> str:
    """Basic HTML to markdown conversion."""
-    # Remove scripts and styles
-    html = re.sub(r'<script\b[^>]*>.*?</script\s*>', '', html, flags=re.DOTALL | re.IGNORECASE)
-    html = re.sub(r'<style\b[^>]*>.*?</style\s*>', '', html, flags=re.DOTALL | re.IGNORECASE)
-    
-    # Headings
-    html = re.sub(r'<h1[^>]*>(.*?)</h1>', r'# \1', html, flags=re.DOTALL | re.IGNORECASE)
-    html = re.sub(r'<h2[^>]*>(.*?)</h2>', r'## \1', html, flags=re.DOTALL | re.IGNORECASE)
-    html = re.sub(r'<h3[^>]*>(.*?)</h3>', r'### \1', html, flags=re.DOTALL | re.IGNORECASE)
-    
-    # Code blocks
-    html = re.sub(r'<pre[^>]*><code[^>]*>(.*?)</code></pre>', r'```\n\1\n```', html, flags=re.DOTALL | re.IGNORECASE)
-    html = re.sub(r'<code[^>]*>(.*?)</code>', r'`\1`', html, flags=re.DOTALL | re.IGNORECASE)
-    
-    # Links
-    html = re.sub(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', r'[\2](\1)', html, flags=re.DOTALL | re.IGNORECASE)
-    
-    # Lists
-    html = re.sub(r'<li[^>]*>(.*?)</li>', r'- \1', html, flags=re.DOTALL | re.IGNORECASE)
-    html = re.sub(r'<ul[^>]*>|</ul>|<ol[^>]*>|</ol>', '', html, flags=re.IGNORECASE)
-    
-    # Paragraphs
-    html = re.sub(r'<p[^>]*>(.*?)</p>', r'\1\n\n', html, flags=re.DOTALL | re.IGNORECASE)
-    
-    # Remove remaining HTML tags
-    html = re.sub(r'<[^>]+>', '', html)
-    
-    # Clean up whitespace
-    html = re.sub(r'\n{3,}', '\n\n', html)
-    html = html.strip()
-    
-    return html
+    parser = MarkdownHTMLParser()
+    parser.feed(html)
+    parser.close()
+    return parser.get_markdown()

 def create_minimal_markdown(metadata: Dict, source_url: str) -> str:
    """Create minimal markdown content from metadata."""