""" Tokenizer that preserves HTML tags as atomic units while splitting on whitespace. This tokenizer is specifically designed for HTML content where: - HTML tags should remain intact (e.g., '
', '')
- Whitespace tokens are preserved for accurate diff reconstruction
- Words are split on whitespace boundaries
"""
import re
from typing import List

def tokenize_words_and_html(text: str) -> List[str]:
"""
Split text into words and boundaries (spaces, HTML tags).
This tokenizer preserves HTML tags as atomic units while splitting on whitespace.
Useful for content that contains HTML markup.
Args:
text: Input text to tokenize
Returns:
List of tokens (words, spaces, HTML tags)
Examples:
>>> tokenize_words_and_html(" Hello world ', 'Hello', ' ', 'world', '