Release v1.27.0: Enhance markdown-tools with Heavy Mode
Add multi-tool orchestration for best-quality document conversion: - Dual mode: Quick (fast) and Heavy (best quality, multi-tool merge) - New convert.py - main orchestrator with tool selection matrix - New merge_outputs.py - segment-level multi-tool output merger - New validate_output.py - quality validation with HTML reports - Enhanced extract_pdf_images.py - metadata (page, position, dimensions) - PyMuPDF4LLM integration for LLM-optimized PDF conversion - pandoc integration for DOCX/PPTX structure preservation - Quality metrics: text/table/image retention with pass/warn/fail - New references: heavy-mode-guide.md, tool-comparison.md Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
439
markdown-tools/scripts/merge_outputs.py
Executable file
439
markdown-tools/scripts/merge_outputs.py
Executable file
@@ -0,0 +1,439 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Multi-tool markdown output merger with segment-level comparison.
|
||||
|
||||
Merges markdown outputs from multiple conversion tools by selecting
|
||||
the best version of each segment (tables, images, headings, paragraphs).
|
||||
|
||||
Usage:
|
||||
python merge_outputs.py output1.md output2.md -o merged.md
|
||||
python merge_outputs.py --from-json results.json -o merged.md
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class Segment:
    """A segment of markdown content."""
    type: str  # 'heading', 'table', 'image', 'list', 'paragraph', 'code'
    content: str
    level: int = 0  # Heading depth (1-6); 0 for non-heading segments
    score: float = 0.0  # Quality score assigned by score_segment()


@dataclass
class MergeResult:
    """Result from merging multiple markdown files."""
    markdown: str
    sources: list[str] = field(default_factory=list)
    segment_sources: dict = field(default_factory=dict)  # segment_idx -> source


def parse_segments(markdown: str) -> 'list[Segment]':
    """Parse markdown text into a flat list of typed segments.

    Recognized types: 'code' (fenced blocks, kept verbatim), 'heading',
    'table' (pipe-table runs), 'image' (standalone image lines), 'list'
    (bullet/numbered runs, including non-blank continuation lines), and
    'paragraph' (everything else). Empty/whitespace-only runs are dropped.

    Args:
        markdown: Raw markdown text.

    Returns:
        Segments in document order.
    """
    segments = []
    lines = markdown.split('\n')
    current_segment = []
    current_type = 'paragraph'
    current_level = 0
    in_code_block = False
    in_table = False

    def flush_segment():
        # Emit accumulated lines (if any non-whitespace) and reset state.
        nonlocal current_segment, current_type, current_level, in_table
        if current_segment:
            content = '\n'.join(current_segment).strip()
            if content:
                segments.append(Segment(
                    type=current_type,
                    content=content,
                    level=current_level
                ))
        current_segment = []
        current_type = 'paragraph'
        current_level = 0
        # Bug fix: also leave table mode here. Previously a table
        # interrupted by a heading or code fence left in_table=True, so
        # later table rows were silently typed 'paragraph'.
        in_table = False

    for line in lines:
        # Code fences toggle verbatim mode.
        if line.startswith('```'):
            if in_code_block:
                current_segment.append(line)
                flush_segment()
                in_code_block = False
            else:
                flush_segment()
                in_code_block = True
                current_type = 'code'
                current_segment.append(line)
            continue

        if in_code_block:
            current_segment.append(line)
            continue

        # Headings are single-line segments.
        heading_match = re.match(r'^(#{1,6})\s+(.+)$', line)
        if heading_match:
            flush_segment()
            current_type = 'heading'
            current_level = len(heading_match.group(1))
            current_segment.append(line)
            flush_segment()
            continue

        # Pipe-table rows (|...|) accumulate into one table segment.
        if '|' in line and re.match(r'^\s*\|.*\|\s*$', line):
            if not in_table:
                flush_segment()
                in_table = True
                current_type = 'table'
            current_segment.append(line)
            continue
        elif in_table:
            # Non-table line ends the table (flush resets in_table).
            flush_segment()

        # Standalone image lines are single-line segments.
        if re.match(r'!\[.*\]\(.*\)', line):
            flush_segment()
            current_type = 'image'
            current_segment.append(line)
            flush_segment()
            continue

        # Bullet / numbered list runs.
        if re.match(r'^[\s]*[-*+]\s+', line) or re.match(r'^[\s]*\d+\.\s+', line):
            if current_type != 'list':
                flush_segment()
                current_type = 'list'
            current_segment.append(line)
            continue
        elif current_type == 'list' and line.strip() == '':
            flush_segment()
            continue

        # Blank line ends the current paragraph.
        if line.strip() == '':
            if current_type == 'paragraph' and current_segment:
                flush_segment()
            continue

        # Default: paragraph text. A non-blank line after a list stays
        # attached to the list segment (lazy continuation).
        if current_type not in ['list']:
            current_type = 'paragraph'
        current_segment.append(line)

    flush_segment()
    return segments
|
||||
|
||||
|
||||
def score_segment(segment: 'Segment') -> float:
    """Heuristically score a segment so competing versions can be compared.

    Higher is better. Tables reward row/column counts and a well-formed
    header separator (separator-only tables are penalized); images reward
    alt text and non-embedded sources; lists reward item count and
    nesting; code rewards length and a language tag; paragraphs reward
    word count (very short ones are penalized).
    """
    text = segment.content
    total = 0.0

    if segment.type == 'table':
        pipe_rows = [row for row in text.split('\n') if '|' in row]
        if pipe_rows:
            n_cols = pipe_rows[0].count('|') - 1
            total += 0.5 * len(pipe_rows)  # more rows = better
            total += 0.3 * n_cols          # more columns = better
            sep_pattern = r'^[\s|:-]+$'
            # A table made only of separator rows carries no data.
            if all(re.match(sep_pattern, row) for row in pipe_rows):
                total -= 5.0
            # Reward a proper header separator on the second row.
            if len(pipe_rows) > 1 and re.match(sep_pattern, pipe_rows[1]):
                total += 1.0

    elif segment.type == 'heading':
        total += 1.0
        if len(text) > 100:  # suspiciously long heading
            total -= 0.5

    elif segment.type == 'image':
        if re.search(r'!\[.+\]', text):  # alt text present
            total += 1.0
        if 'data:image' not in text:     # local path beats base64 blob
            total += 0.5

    elif segment.type == 'list':
        bullets = re.findall(r'^[\s]*[-*+\d.]+\s+', text, re.MULTILINE)
        total += 0.3 * len(bullets)
        if re.search(r'^\s{2,}[-*+]', text, re.MULTILINE):
            total += 0.5  # nesting preserved

    elif segment.type == 'code':
        total += min(0.2 * len(text.split('\n')), 3.0)
        if re.match(r'^```\w+', text):
            total += 0.5  # language tag present

    else:  # paragraph
        word_count = len(text.split())
        total += min(0.05 * word_count, 2.0)
        if word_count < 5:
            total -= 0.5  # fragmentary paragraph

    return total
|
||||
|
||||
|
||||
def find_matching_segment(
    segment: 'Segment',
    candidates: 'list[Segment]',
    used_indices: set
) -> Optional[int]:
    """Locate the candidate most similar to *segment*.

    Only candidates of the same type are considered, and indices listed
    in *used_indices* are skipped. Similarity is word-level Jaccard on
    the heading text (headings, '#' markers stripped), the header row
    (tables), or the full content (everything else).

    Returns:
        Index of the most similar candidate, or None if nothing exceeds
        the 0.3 minimum similarity threshold.
    """
    best_idx = None
    best_sim = 0.3  # minimum threshold

    for idx, cand in enumerate(candidates):
        if idx in used_indices or cand.type != segment.type:
            continue

        if segment.type == 'heading':
            # Compare heading text with the leading '#' markers removed.
            left = re.sub(r'^#+\s*', '', segment.content).lower()
            right = re.sub(r'^#+\s*', '', cand.content).lower()
        elif segment.type == 'table':
            # Compare header rows only.
            left = segment.content.split('\n')[0] if segment.content else ''
            right = cand.content.split('\n')[0] if cand.content else ''
        else:
            left, right = segment.content, cand.content

        sim = _text_similarity(left, right)
        if sim > best_sim:
            best_sim = sim
            best_idx = idx

    return best_idx
|
||||
|
||||
|
||||
def _text_similarity(s1: str, s2: str) -> float:
|
||||
"""Calculate simple text similarity (Jaccard on words)."""
|
||||
if not s1 or not s2:
|
||||
return 0.0
|
||||
|
||||
words1 = set(s1.lower().split())
|
||||
words2 = set(s2.lower().split())
|
||||
|
||||
if not words1 or not words2:
|
||||
return 0.0
|
||||
|
||||
intersection = len(words1 & words2)
|
||||
union = len(words1 | words2)
|
||||
|
||||
return intersection / union if union > 0 else 0.0
|
||||
|
||||
|
||||
def merge_markdown_files(
    files: list[Path],
    source_names: Optional[list[str]] = None
) -> 'MergeResult':
    """Merge multiple markdown files by selecting the best-scoring segments.

    The first file provides the base document structure. For each base
    segment, matching segments from the other files are compared by
    heuristic score and the best version wins. Worthwhile segments that
    appear only in secondary files are appended at the end.

    Args:
        files: Markdown files to merge; the first is the base structure.
        source_names: Optional attribution labels (default: file stems).

    Returns:
        MergeResult with merged markdown, the source labels, and a
        segment-index -> source attribution map.
    """
    if not files:
        return MergeResult(markdown="", sources=[])

    if source_names is None:
        source_names = [f.stem for f in files]

    # Parse and score every file's segments.
    all_segments = []
    for i, file_path in enumerate(files):
        content = file_path.read_text(encoding="utf-8")
        segments = parse_segments(content)
        for seg in segments:
            seg.score = score_segment(seg)
        all_segments.append((source_names[i], segments))

    if len(all_segments) == 1:
        # Nothing to merge - return the single file verbatim.
        return MergeResult(
            markdown=files[0].read_text(encoding="utf-8"),
            sources=[source_names[0]]
        )

    # Use first file as base structure.
    base_name, base_segments = all_segments[0]
    merged_segments = []
    segment_sources = {}

    # Bug fix: track consumed candidate indices per source across the
    # whole pass. Previously a fresh empty set was passed on every
    # comparison, so one high-scoring secondary segment could be matched
    # against (and substituted for) several different base segments.
    used_by_source = {name: set() for name, _ in all_segments[1:]}

    for i, base_seg in enumerate(base_segments):
        best_segment = base_seg
        best_source = base_name

        # Find matching segments in other files.
        for other_name, other_segments in all_segments[1:]:
            match_idx = find_matching_segment(
                base_seg, other_segments, used_by_source[other_name]
            )
            if match_idx is not None:
                used_by_source[other_name].add(match_idx)
                other_seg = other_segments[match_idx]
                if other_seg.score > best_segment.score:
                    best_segment = other_seg
                    best_source = other_name

        merged_segments.append(best_segment)
        segment_sources[i] = best_source

    # Append worthwhile content that only appears in secondary sources.
    for other_name, other_segments in all_segments[1:]:
        for other_seg in other_segments:
            match_idx = find_matching_segment(other_seg, base_segments, set())
            if match_idx is None and other_seg.score > 0.5:
                merged_segments.append(other_seg)
                segment_sources[len(merged_segments) - 1] = other_name

    # Reconstruct markdown with a blank line between segments.
    merged_md = '\n\n'.join(seg.content for seg in merged_segments)

    return MergeResult(
        markdown=merged_md,
        sources=source_names,
        segment_sources=segment_sources
    )
|
||||
|
||||
|
||||
def merge_from_json(json_path: Path) -> 'MergeResult':
    """Merge tool outputs recorded in a JSON results file (from convert.py).

    Expects {"results": [{"tool": str, "success": bool, "markdown": str}, ...]}.
    Only successful results that carry markdown are merged; the first
    such result provides the base document structure.

    Args:
        json_path: Path to the JSON results file.

    Returns:
        MergeResult (empty markdown when there are no usable results).
    """
    with open(json_path, encoding="utf-8") as f:
        data = json.load(f)

    results = data.get('results', [])
    if not results:
        return MergeResult(markdown="", sources=[])

    # Filter successful results that actually produced markdown.
    successful = [r for r in results if r.get('success') and r.get('markdown')]
    if not successful:
        return MergeResult(markdown="", sources=[])

    if len(successful) == 1:
        return MergeResult(
            markdown=successful[0]['markdown'],
            sources=[successful[0]['tool']]
        )

    # Parse and score each tool's output.
    all_segments = []
    for result in successful:
        tool = result['tool']
        segments = parse_segments(result['markdown'])
        for seg in segments:
            seg.score = score_segment(seg)
        all_segments.append((tool, segments))

    # Same merge strategy as merge_markdown_files.
    base_name, base_segments = all_segments[0]
    merged_segments = []
    segment_sources = {}

    # Bug fix: accumulate consumed indices per tool. Previously an empty
    # set was passed on every call, so a single secondary segment could
    # win against multiple base segments.
    used_by_tool = {name: set() for name, _ in all_segments[1:]}

    for i, base_seg in enumerate(base_segments):
        best_segment = base_seg
        best_source = base_name

        for other_name, other_segments in all_segments[1:]:
            match_idx = find_matching_segment(
                base_seg, other_segments, used_by_tool[other_name]
            )
            if match_idx is not None:
                used_by_tool[other_name].add(match_idx)
                other_seg = other_segments[match_idx]
                if other_seg.score > best_segment.score:
                    best_segment = other_seg
                    best_source = other_name

        merged_segments.append(best_segment)
        segment_sources[i] = best_source

    merged_md = '\n\n'.join(seg.content for seg in merged_segments)

    return MergeResult(
        markdown=merged_md,
        sources=[r['tool'] for r in successful],
        segment_sources=segment_sources
    )
|
||||
|
||||
|
||||
def main():
    """CLI entry point: merge markdown inputs (or a JSON results file)."""
    arg_parser = argparse.ArgumentParser(
        description="Merge markdown outputs from multiple conversion tools"
    )
    arg_parser.add_argument(
        "inputs", nargs="*", type=Path,
        help="Input markdown files to merge"
    )
    arg_parser.add_argument(
        "-o", "--output", type=Path,
        help="Output merged markdown file"
    )
    arg_parser.add_argument(
        "--from-json", type=Path,
        help="Merge from JSON results file (from convert.py)"
    )
    arg_parser.add_argument(
        "--verbose", action="store_true",
        help="Show segment source attribution"
    )

    opts = arg_parser.parse_args()

    if opts.from_json:
        merged = merge_from_json(opts.from_json)
    elif opts.inputs:
        # Fail fast on the first missing input file.
        for candidate in opts.inputs:
            if not candidate.exists():
                print(f"Error: File not found: {candidate}", file=sys.stderr)
                sys.exit(1)
        merged = merge_markdown_files(opts.inputs)
    else:
        arg_parser.error("Either input files or --from-json is required")

    if not merged.markdown:
        print("Error: No content to merge", file=sys.stderr)
        sys.exit(1)

    # Write to the output file when given, otherwise dump to stdout.
    if opts.output:
        opts.output.parent.mkdir(parents=True, exist_ok=True)
        opts.output.write_text(merged.markdown)
        print(f"Merged output: {opts.output}")
        print(f"Sources: {', '.join(merged.sources)}")
    else:
        print(merged.markdown)

    # Attribution report goes to stderr so stdout stays clean markdown.
    if opts.verbose and merged.segment_sources:
        print("\n--- Segment Attribution ---", file=sys.stderr)
        for seg_idx, seg_source in merged.segment_sources.items():
            print(f"  Segment {seg_idx}: {seg_source}", file=sys.stderr)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user