mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-05 09:04:55 +00:00
45 lines
1.0 KiB
Python
45 lines
1.0 KiB
Python
"""
|
|
Simple word tokenizer using whitespace boundaries.
|
|
|
|
This is a simpler tokenizer that treats all whitespace as token boundaries
|
|
without special handling for HTML tags or other markup.
|
|
"""
|
|
|
|
from typing import List
|
|
|
|
|
|
def tokenize_words(text: str) -> List[str]:
    """
    Split text into words using simple whitespace boundaries.

    Every whitespace character (as reported by ``str.isspace``) becomes its
    own single-character token, and each maximal run of non-whitespace
    characters becomes one word token. HTML tags and other markup receive no
    special handling; they are simply part of whichever word they touch.

    Because no character is dropped or merged, ``"".join(tokenize_words(s))``
    reproduces ``s`` exactly.

    Args:
        text: Input text to tokenize

    Returns:
        List of tokens (words and whitespace), in input order. An empty
        input yields an empty list.

    Examples:
        >>> tokenize_words("Hello world")
        ['Hello', ' ', 'world']
        >>> tokenize_words("one  two")
        ['one', ' ', ' ', 'two']
    """
    tokens: List[str] = []
    # Index where the word currently being scanned began; None while we are
    # between words. Tracking an index lets each word be cut out with a single
    # slice instead of being grown one character at a time.
    word_start = None

    for index, char in enumerate(text):
        if char.isspace():
            # Whitespace terminates any word in progress...
            if word_start is not None:
                tokens.append(text[word_start:index])
                word_start = None
            # ...and is itself emitted, one token per whitespace character,
            # so consecutive spaces are preserved individually.
            tokens.append(char)
        elif word_start is None:
            word_start = index

    # Flush a trailing word that was not followed by whitespace.
    if word_start is not None:
        tokens.append(text[word_start:])
    return tokens
|