dgtlmoon
2025-10-13 17:46:53 +02:00
parent 2709ba6772
commit 961994abcf
6 changed files with 171 additions and 43 deletions

View File

@@ -240,6 +240,9 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
     content = diff.render_diff(previous_version_file_contents=from_version_file_contents,
                                newest_version_file_contents=to_version_file_contents,
+                               # include_removed=diff_prefs.get('removed'),
+                               # include_added=diff_prefs.get('added'),
+                               # include_replaced=diff_prefs.get('replaced'),
                                html_colour=True,
                                ignore_junk=diff_prefs.get('ignoreWhitespace'),
                                include_equal=not diff_prefs.get('changesOnly'),
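
A minimal sketch of what this call site does, runnable outside Flask. The import path and the sample preference values are assumptions; the keyword arguments mirror the call above:

from changedetectionio import diff  # import path assumed

diff_prefs = {'ignoreWhitespace': True, 'changesOnly': False}  # sample prefs
content = diff.render_diff(
    previous_version_file_contents="price: 10 EUR\n",
    newest_version_file_contents="price: 12 EUR\n",
    html_colour=True,
    ignore_junk=diff_prefs.get('ignoreWhitespace'),
    include_equal=not diff_prefs.get('changesOnly'),
)
print(content)  # HTML with inline word-level <span> highlights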

View File

@@ -1,8 +1,17 @@
"""
Diff rendering module for change detection.
This module provides functions for rendering differences between text content,
with support for various output formats and tokenization strategies.
"""
import difflib import difflib
from typing import List, Iterator, Union from typing import List, Iterator, Union
import diff_match_patch as dmp_module import diff_match_patch as dmp_module
import re import re
from .tokenizers import TOKENIZERS, tokenize_words_and_html
# Remember! gmail, outlook etc dont support <style> must be inline. # Remember! gmail, outlook etc dont support <style> must be inline.
# Gmail: strips <ins> and <del> tags entirely. # Gmail: strips <ins> and <del> tags entirely.
REMOVED_STYLE = "background-color: #fadad7; color: #b30000;" REMOVED_STYLE = "background-color: #fadad7; color: #b30000;"
@@ -23,7 +32,8 @@ DIFF_HTML_LABEL_INSERTED = f'<span style="{ADDED_STYLE}" title="Inserted">{{cont
 # Compiled regex patterns for performance
 WHITESPACE_NORMALIZE_RE = re.compile(r'\s+')

-def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool = False, ignore_junk: bool = False, markdown_style: str = None) -> tuple[str, bool]:
+def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool = False, ignore_junk: bool = False, markdown_style: str = None, tokenizer: str = 'words_and_html') -> tuple[str, bool]:
     """
     Render word-level differences between two lines inline using diff-match-patch library.
@@ -33,6 +43,7 @@ def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool
         html_colour: Use HTML background colors for differences
         ignore_junk: Ignore whitespace-only changes
         markdown_style: Unused (kept for backwards compatibility)
+        tokenizer: Name of tokenizer to use from TOKENIZERS registry (default: 'words_and_html')

     Returns:
         tuple[str, bool]: (diff output with inline word-level highlighting, has_changes flag)
@@ -50,42 +61,12 @@ def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool
     # Strategy: Use linesToChars to treat words as atomic units
     dmp = dmp_module.diff_match_patch()

-    # Split into words while preserving boundaries
-    def tokenize_with_boundaries(text):
-        """Split text into words and boundaries (spaces, HTML tags)"""
-        tokens = []
-        current = ''
-        in_tag = False
-        for char in text:
-            if char == '<':
-                # Start of HTML tag
-                if current:
-                    tokens.append(current)
-                    current = ''
-                current = '<'
-                in_tag = True
-            elif char == '>' and in_tag:
-                # End of HTML tag
-                current += '>'
-                tokens.append(current)
-                current = ''
-                in_tag = False
-            elif char.isspace() and not in_tag:
-                # Space outside of tag
-                if current:
-                    tokens.append(current)
-                    current = ''
-                tokens.append(char)
-            else:
-                current += char
-        if current:
-            tokens.append(current)
-        return tokens
-
-    before_tokens = tokenize_with_boundaries(before_normalized)
-    after_tokens = tokenize_with_boundaries(after_normalized or ' ')
+    # Get the tokenizer function from the registry
+    tokenizer_func = TOKENIZERS.get(tokenizer, tokenize_words_and_html)
+
+    # Tokenize both lines using the selected tokenizer
+    before_tokens = tokenizer_func(before_normalized)
+    after_tokens = tokenizer_func(after_normalized or ' ')

     # Create mappings for linesToChars (using it for word-mode)
     # Join tokens with newline so each "line" is a token
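
The "linesToChars" strategy named above is a standard diff-match-patch idiom: join the tokens with a separator so that each token is treated as one atomic "line". A standalone sketch, assuming token lists contain no newlines (which the whitespace normalizer guarantees here):

import diff_match_patch as dmp_module

def word_level_diff(before_tokens, after_tokens):
    """Diff two token lists, treating each token as an atomic unit."""
    dmp = dmp_module.diff_match_patch()
    # Join tokens with '\n' so diff_linesToChars maps each token to one char
    chars1, chars2, token_array = dmp.diff_linesToChars(
        '\n'.join(before_tokens), '\n'.join(after_tokens))
    diffs = dmp.diff_main(chars1, chars2, False)
    dmp.diff_charsToLines(diffs, token_array)  # mutates diffs in place
    # Strip the '\n' separators again; the tokens themselves have no newlines
    return [(op, text.replace('\n', '')) for op, text in diffs]

print(word_level_diff(['Hello', ' ', 'world'], ['Hello', ' ', 'there']))
# roughly: [(0, 'Hello '), (-1, 'world'), (1, 'there')]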
@@ -166,7 +147,8 @@ def customSequenceMatcher(
     word_diff: bool = False,
     context_lines: int = 0,
     case_insensitive: bool = False,
-    ignore_junk: bool = False
+    ignore_junk: bool = False,
+    tokenizer: str = 'words_and_html'
 ) -> Iterator[List[str]]:
     """
     Compare two sequences and yield differences based on specified parameters.
@@ -180,10 +162,11 @@ def customSequenceMatcher(
         include_replaced (bool): Include replaced parts
         include_change_type_prefix (bool): Add prefixes to indicate change types
         html_colour (bool): Use HTML background colors for differences
-        word_diff (bool): Use word-level diffing for replaced lines
+        word_diff (bool): Use word-level diffing for replaced lines (controls inline rendering)
         context_lines (int): Number of unchanged lines to show around changes (like grep -C)
         case_insensitive (bool): Perform case-insensitive comparison
         ignore_junk (bool): Ignore whitespace-only changes
+        tokenizer (str): Name of tokenizer to use from TOKENIZERS registry (default: 'words_and_html')

     Yields:
         List[str]: Differences between sequences
@@ -250,7 +233,7 @@ def customSequenceMatcher(
             # Use word-level diff for single line replacements when enabled
             if word_diff and len(before_lines) == 1 and len(after_lines) == 1:
-                inline_diff, has_changes = render_inline_word_diff(before_lines[0], after_lines[0], html_colour, ignore_junk)
+                inline_diff, has_changes = render_inline_word_diff(before_lines[0], after_lines[0], html_colour, ignore_junk, tokenizer=tokenizer)
                 # Check if there are any actual changes (not just whitespace when ignore_junk is enabled)
                 if ignore_junk and not has_changes:
                     # No real changes, skip this line
@@ -284,7 +267,8 @@ def render_diff(
     word_diff: bool = True,
     context_lines: int = 0,
     case_insensitive: bool = False,
-    ignore_junk: bool = False
+    ignore_junk: bool = False,
+    tokenizer: str = 'words_and_html'
 ) -> str:
     """
     Render the difference between two file contents.
@@ -300,10 +284,11 @@ def render_diff(
         include_change_type_prefix (bool): Add prefixes to indicate change types
         patch_format (bool): Use patch format for output
         html_colour (bool): Use HTML background colors for differences
-        word_diff (bool): Use word-level diffing for replaced lines
+        word_diff (bool): Use word-level diffing for replaced lines (controls inline rendering)
         context_lines (int): Number of unchanged lines to show around changes (like grep -C)
         case_insensitive (bool): Perform case-insensitive comparison, By default the test_json_diff/process.py is case sensitive, so this follows same logic
         ignore_junk (bool): Ignore whitespace-only changes
+        tokenizer (str): Name of tokenizer to use from TOKENIZERS registry (default: 'words_and_html')

     Returns:
         str: Rendered difference
@@ -327,7 +312,8 @@ def render_diff(
         word_diff=word_diff,
         context_lines=context_lines,
         case_insensitive=case_insensitive,
-        ignore_junk=ignore_junk
+        ignore_junk=ignore_junk,
+        tokenizer=tokenizer
     )

     def flatten(lst: List[Union[str, List[str]]]) -> str:
@@ -339,4 +325,13 @@ def render_diff(
                 result.append(x)
         return line_feed_sep.join(result)

     return flatten(rendered_diff)
+
+
+# Export main public API
+__all__ = [
+    'render_diff',
+    'customSequenceMatcher',
+    'render_inline_word_diff',
+    'TOKENIZERS',
+]
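
With the parameter now threaded end to end, selecting a tokenizer is a one-argument change at the public API. A hypothetical call (import path assumed):

from changedetectionio.diff import render_diff  # import path assumed

html = render_diff(
    previous_version_file_contents="The quick brown fox\n",
    newest_version_file_contents="The quick red fox\n",
    html_colour=True,
    word_diff=True,
    tokenizer='words',  # plain-text tokenizer; 'words_and_html' is the default
)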

View File

@@ -0,0 +1,24 @@
"""
Tokenizers for diff operations.
This module provides various tokenization strategies for use with the diff system.
New tokenizers can be easily added by:
1. Creating a new module in this directory
2. Importing and registering it in the TOKENIZERS dictionary below
"""
from .natural_text import tokenize_words
from .words_and_html import tokenize_words_and_html
# Tokenizer registry - maps tokenizer names to functions
TOKENIZERS = {
'words': tokenize_words,
'words_and_html': tokenize_words_and_html,
'html_tags': tokenize_words_and_html, # Alias for backwards compatibility
}
__all__ = [
'tokenize_words',
'tokenize_words_and_html',
'TOKENIZERS',
]
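
Step 2 of the docstring in practice: a hypothetical character-level tokenizer registered alongside the built-ins. The 'chars' name and the import path are illustrative only:

from typing import List
from changedetectionio.diff.tokenizers import TOKENIZERS  # path assumed

def tokenize_chars(text: str) -> List[str]:
    """Treat every character as its own token (finest possible granularity)."""
    return list(text)

TOKENIZERS['chars'] = tokenize_chars
# now selectable upstream via render_diff(..., tokenizer='chars')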

View File

@@ -0,0 +1,44 @@
"""
Simple word tokenizer using whitespace boundaries.
This is a simpler tokenizer that treats all whitespace as token boundaries
without special handling for HTML tags or other markup.
"""
from typing import List
def tokenize_words(text: str) -> List[str]:
"""
Split text into words using simple whitespace boundaries.
This is a simpler tokenizer that treats all whitespace as token boundaries
without special handling for HTML tags.
Args:
text: Input text to tokenize
Returns:
List of tokens (words and whitespace)
Examples:
>>> tokenize_words("Hello world")
['Hello', ' ', 'world']
>>> tokenize_words("one two")
['one', ' ', ' ', 'two']
"""
tokens = []
current = ''
for char in text:
if char.isspace():
if current:
tokens.append(current)
current = ''
tokens.append(char)
else:
current += char
if current:
tokens.append(current)
return tokens
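
One property worth noting: the tokenizer is lossless, so joining the tokens reconstructs the input exactly, which is what lets the diff renderer rebuild lines faithfully. A quick check (import path assumed):

from changedetectionio.diff.tokenizers import tokenize_words  # path assumed

text = "one  two\tthree"
tokens = tokenize_words(text)
assert ''.join(tokens) == text  # round-trip is exact
print(tokens)  # ['one', ' ', ' ', 'two', '\t', 'three']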

View File

@@ -0,0 +1,61 @@
"""
Tokenizer that preserves HTML tags as atomic units while splitting on whitespace.
This tokenizer is specifically designed for HTML content where:
- HTML tags should remain intact (e.g., '<p>', '<a href="...">')
- Whitespace tokens are preserved for accurate diff reconstruction
- Words are split on whitespace boundaries
"""
from typing import List
def tokenize_words_and_html(text: str) -> List[str]:
"""
Split text into words and boundaries (spaces, HTML tags).
This tokenizer preserves HTML tags as atomic units while splitting on whitespace.
Useful for content that contains HTML markup.
Args:
text: Input text to tokenize
Returns:
List of tokens (words, spaces, HTML tags)
Examples:
>>> tokenize_words_and_html("<p>Hello world</p>")
['<p>', 'Hello', ' ', 'world', '</p>']
>>> tokenize_words_and_html("<a href='test.com'>link</a>")
['<a href=\\'test.com\\'>', 'link', '</a>']
"""
tokens = []
current = ''
in_tag = False
for char in text:
if char == '<':
# Start of HTML tag
if current:
tokens.append(current)
current = ''
current = '<'
in_tag = True
elif char == '>' and in_tag:
# End of HTML tag
current += '>'
tokens.append(current)
current = ''
in_tag = False
elif char.isspace() and not in_tag:
# Space outside of tag
if current:
tokens.append(current)
current = ''
tokens.append(char)
else:
current += char
if current:
tokens.append(current)
return tokens
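
The same lossless round-trip property holds here, with the addition that whitespace inside a tag never splits it. A quick check (import path assumed):

from changedetectionio.diff.tokenizers import tokenize_words_and_html  # path assumed

html = '<a href="index.html">go home</a>'
tokens = tokenize_words_and_html(html)
assert ''.join(tokens) == html  # tags stay atomic, nothing is lost
print(tokens)  # ['<a href="index.html">', 'go', ' ', 'home', '</a>']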

View File

@@ -48,6 +48,7 @@ def test_check_basic_change_detection_functionality_source(client, live_server,
     # With diff-match-patch, HTML tags are properly tokenized and excluded from diff spans
     # Only "modified" is shown as added, while <head> and <title> tags remain unchanged
     assert b'&lt;head&gt;&lt;title&gt;' in res.data
+    assert b'title="Added"' in res.data
     assert b'>modified<' in res.data
     assert b'head title&lt;/title&gt;&lt;/head&gt;' in res.data