mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-12-17 13:35:50 +00:00
Some checks failed
Build and push containers / metadata (push) Has been cancelled
Build and push containers / build-push-containers (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/amd64 (alpine) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm64 (alpine) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/amd64 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm/v7 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm/v8 (main) (push) Has been cancelled
ChangeDetection.io Container Build Test / Build linux/arm64 (main) (push) Has been cancelled
ChangeDetection.io App Test / lint-code (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled
62 lines
1.7 KiB
Python
62 lines
1.7 KiB
Python
"""
|
|
Tokenizer that preserves HTML tags as atomic units while splitting on whitespace.
|
|
|
|
This tokenizer is specifically designed for HTML content where:
|
|
- HTML tags should remain intact (e.g., '<p>', '<a href="...">')
|
|
- Whitespace tokens are preserved for accurate diff reconstruction
|
|
- Words are split on whitespace boundaries
|
|
"""

from typing import List

def tokenize_words_and_html(text: str) -> List[str]:
    """
    Split text into words and boundaries (spaces, HTML tags).

    HTML tags are emitted as single atomic tokens, each run of non-space
    characters outside a tag becomes one word token, and every whitespace
    character outside a tag becomes its own token, so the original text can
    be reconstructed exactly by concatenating the returned tokens.

    Args:
        text: Input text to tokenize.

    Returns:
        List of tokens (words, spaces, HTML tags).

    Examples:
        >>> tokenize_words_and_html("<p>Hello world</p>")
        ['<p>', 'Hello', ' ', 'world', '</p>']
        >>> tokenize_words_and_html("<a href='test.com'>link</a>")
        ["<a href='test.com'>", 'link', '</a>']
    """
    tokens: List[str] = []
    buffer = ''          # characters accumulated for the token in progress
    inside_tag = False   # True while scanning between '<' and '>'

    for ch in text:
        if ch == '<':
            # A tag begins: flush whatever was being built (a word, or a
            # previous unterminated tag fragment) and start collecting it.
            if buffer:
                tokens.append(buffer)
            buffer = '<'
            inside_tag = True
        elif inside_tag and ch == '>':
            # Tag complete — emit it as one atomic token.
            tokens.append(buffer + '>')
            buffer = ''
            inside_tag = False
        elif not inside_tag and ch.isspace():
            # Whitespace outside a tag ends the current word; each whitespace
            # character is kept as its own token for exact reconstruction.
            if buffer:
                tokens.append(buffer)
                buffer = ''
            tokens.append(ch)
        else:
            # Ordinary character: part of a word, or part of a tag's interior
            # (spaces inside tags land here too, keeping attributes intact).
            buffer += ch

    # Flush a trailing word — or an unterminated tag fragment, emitted as-is.
    if buffer:
        tokens.append(buffer)
    return tokens