claude-code-skills-reference/transcript-fixer/scripts/utils/diff_formats/text_splitter.py

#!/usr/bin/env python3
"""
Text splitter utility for word-level diff generation

SINGLE RESPONSIBILITY: Split text into words while preserving structure
"""

from __future__ import annotations

import re


def split_into_words(text: str) -> list[str]:
    """
    Tokenize text for word-level diffing of mixed Chinese/English content

    The input is scanned left to right and cut into four kinds of tokens:

    * a run of consecutive characters in the U+4E00..U+9FFF range
    * a run of consecutive ASCII letters
    * a run of consecutive ASCII digits
    * any other single character (whitespace, punctuation, underscore,
      accented letters, ...), emitted one character per token

    Because the last kind is the exact complement of the first three, no
    character is ever dropped: joining the returned tokens reproduces the
    input unchanged.

    Args:
        text: Input text to split

    Returns:
        Tokens in their original order; an empty list for empty input
    """
    # The fourth alternative is the negation of the first three classes, so
    # the scan never skips over a character.
    tokenizer = re.compile(
        r'[\u4e00-\u9fff]+|[a-zA-Z]+|[0-9]+|[^\u4e00-\u9fffa-zA-Z0-9]'
    )
    return [hit.group(0) for hit in tokenizer.finditer(text)]


def read_file(file_path: str) -> str:
    """
    Read a whole file as UTF-8 text

    Args:
        file_path: Path of the file to read

    Returns:
        The complete decoded contents of the file
    """
    # Exceptions raised by open() / read() are not caught here and
    # propagate to the caller.
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents