Files
changedetection.io/changedetectionio/diff/tokenizers/words_and_html.py
dgtlmoon 961994abcf refactor
2025-10-13 17:46:53 +02:00

62 lines
1.7 KiB
Python

"""
Tokenizer that preserves HTML tags as atomic units while splitting on whitespace.
This tokenizer is specifically designed for HTML content where:
- HTML tags should remain intact (e.g., '<p>', '<a href="...">')
- Whitespace tokens are preserved for accurate diff reconstruction
- Words are split on whitespace boundaries
"""
from typing import List
def tokenize_words_and_html(text: str) -> List[str]:
    """
    Split text into word, whitespace, and HTML-tag tokens.

    The input is scanned once, left to right:

    - ``<`` always starts a new tag token; any pending text is emitted first.
    - ``>`` closes the tag when one is open; the whole ``<...>`` span,
      including any whitespace inside it, becomes a single token.
    - Outside a tag, every whitespace character is emitted as its own
      one-character token, so runs of whitespace are never merged.
    - Everything else (including a ``>`` seen outside a tag) accumulates
      into the pending word or open tag.

    No character is dropped or altered, so ``''.join(tokens) == text``.
    A ``<`` that is never closed by ``>`` leaves the tokenizer in tag mode,
    so the text up to the next ``<`` (or the end of input) stays one token.

    Args:
        text: Input text to tokenize

    Returns:
        List of tokens (words, single whitespace characters, HTML tags)

    Examples:
        >>> tokenize_words_and_html("<p>Hello world</p>")
        ['<p>', 'Hello', ' ', 'world', '</p>']
        >>> tokenize_words_and_html("<a href='test.com'>link</a>")
        ["<a href='test.com'>", 'link', '</a>']
    """
    tokens: List[str] = []
    # Index in ``text`` where the pending (not yet emitted) token begins.
    # Slicing from here avoids rebuilding the token one character at a time.
    start = 0
    in_tag = False

    for pos, char in enumerate(text):
        if char == '<':
            # Start of HTML tag: flush whatever was pending, then begin the
            # tag token at this very character.
            if pos > start:
                tokens.append(text[start:pos])
            start = pos
            in_tag = True
        elif in_tag:
            # Inside a tag only '>' is significant; whitespace and all other
            # characters simply remain part of the tag token.
            if char == '>':
                tokens.append(text[start:pos + 1])
                start = pos + 1
                in_tag = False
        elif char.isspace():
            # Whitespace outside of a tag: emit the pending word (if any),
            # then the whitespace character as its own token.
            if pos > start:
                tokens.append(text[start:pos])
            tokens.append(char)
            start = pos + 1

    # Trailing word, or a tag that was never closed.
    if start < len(text):
        tokens.append(text[start:])

    return tokens