"""
|
|
Tokenizer that preserves HTML tags as atomic units while splitting on whitespace.
|
|
|
|
This tokenizer is specifically designed for HTML content where:
|
|
- HTML tags should remain intact (e.g., '<p>', '<a href="...">')
|
|
- Whitespace tokens are preserved for accurate diff reconstruction
|
|
- Words are split on whitespace boundaries
|
|
"""
|
|
|
|
from typing import List
|
|
|
|
|
|
def tokenize_words_and_html(text: str) -> List[str]:
    """
    Split text into words and boundaries (spaces, HTML tags).

    This tokenizer preserves HTML tags as atomic units while splitting on whitespace.
    Useful for content that contains HTML markup.

    Args:
        text: Input text to tokenize

    Returns:
        List of tokens (words, spaces, HTML tags)

    Examples:
        >>> tokenize_words_and_html("<p>Hello world</p>")
        ['<p>', 'Hello', ' ', 'world', '</p>']
        >>> tokenize_words_and_html("<a href='test.com'>link</a>")
        ["<a href='test.com'>", 'link', '</a>']
    """
    tokens = []
    current = ''
    in_tag = False

    for char in text:
        if char == '<':
            # Start of HTML tag: flush any pending token first
            if current:
                tokens.append(current)
                current = ''
            current = '<'
            in_tag = True
        elif char == '>' and in_tag:
            # End of HTML tag: emit the whole tag as one atomic token
            current += '>'
            tokens.append(current)
            current = ''
            in_tag = False
        elif char.isspace() and not in_tag:
            # Space outside of tag: flush the pending word, keep the space as its own token
            if current:
                tokens.append(current)
                current = ''
            tokens.append(char)
        else:
            current += char

    if current:
        tokens.append(current)
    return tokens
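

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the upstream module). It shows that
# the token stream round-trips losslessly via ''.join(), which is what makes a
# whitespace-preserving tokenizer suitable for diff reconstruction, and that a
# token-level diff keeps HTML tags intact. The difflib-based diff below is an
# assumption for demonstration only; the actual diff pipeline in
# changedetection.io may work differently.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import difflib

    before = '<p>Hello world</p>'
    after = '<p>Hello brave world</p>'

    old_tokens = tokenize_words_and_html(before)
    new_tokens = tokenize_words_and_html(after)

    # Tags, words and whitespace are all preserved, so joining reproduces the input.
    assert ''.join(old_tokens) == before
    assert ''.join(new_tokens) == after

    # A word-level diff over the token streams never splits an HTML tag.
    matcher = difflib.SequenceMatcher(a=old_tokens, b=new_tokens)
    for op, a1, a2, b1, b2 in matcher.get_opcodes():
        print(op, old_tokens[a1:a2], new_tokens[b1:b2])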