changedetection.io/changedetectionio/diff/tokenizers/natural_text.py

"""
Simple word tokenizer using whitespace boundaries.

This is a simpler tokenizer that treats all whitespace as token boundaries
without special handling for HTML tags or other markup.
"""

from typing import List


def tokenize_words(text: str) -> List[str]:
    """
    Split text into words using simple whitespace boundaries.

    This is a simpler tokenizer that treats all whitespace as token boundaries
    without special handling for HTML tags.

    Args:
        text: Input text to tokenize

    Returns:
        List of tokens (words and whitespace)

    Examples:
        >>> tokenize_words("Hello world")
        ['Hello', ' ', 'world']
        >>> tokenize_words("one  two")
        ['one', ' ', ' ', 'two']
    """
    tokens = []
    current = ''

    for char in text:
        if char.isspace():
            if current:
                tokens.append(current)
                current = ''
            tokens.append(char)
        else:
            current += char

    if current:
        tokens.append(current)
    return tokens