mirror of https://github.com/dgtlmoon/changedetection.io.git (synced 2025-12-16 21:18:15 +00:00)
Commit: refactor
@@ -240,6 +240,9 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
     content = diff.render_diff(previous_version_file_contents=from_version_file_contents,
                                newest_version_file_contents=to_version_file_contents,
                                # include_removed=diff_prefs.get('removed'),
                                # include_added=diff_prefs.get('added'),
                                # include_replaced=diff_prefs.get('replaced'),
                                html_colour=True,
                                ignore_junk=diff_prefs.get('ignoreWhitespace'),
                                include_equal=not diff_prefs.get('changesOnly'),
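
Note: a minimal sketch of how this call site could be driven, assuming diff_prefs is a plain dict of the UI toggles named above (the import path and values here are illustrative, not part of the commit):

    from changedetectionio import diff

    diff_prefs = {'ignoreWhitespace': True, 'changesOnly': False}  # assumed shape
    content = diff.render_diff(
        previous_version_file_contents="Some old text",
        newest_version_file_contents="Some new text",
        html_colour=True,
        ignore_junk=diff_prefs.get('ignoreWhitespace'),
        include_equal=not diff_prefs.get('changesOnly'),
    )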
@@ -1,8 +1,17 @@
+"""
+Diff rendering module for change detection.
+
+This module provides functions for rendering differences between text content,
+with support for various output formats and tokenization strategies.
+"""
+
 import difflib
 from typing import List, Iterator, Union
 import diff_match_patch as dmp_module
 import re
 
+from .tokenizers import TOKENIZERS, tokenize_words_and_html
+
 # Remember! gmail, outlook etc dont support <style> must be inline.
 # Gmail: strips <ins> and <del> tags entirely.
 REMOVED_STYLE = "background-color: #fadad7; color: #b30000;"
@@ -23,7 +32,8 @@ DIFF_HTML_LABEL_INSERTED = f'<span style="{ADDED_STYLE}" title="Inserted">{{cont
 # Compiled regex patterns for performance
 WHITESPACE_NORMALIZE_RE = re.compile(r'\s+')
 
-def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool = False, ignore_junk: bool = False, markdown_style: str = None) -> tuple[str, bool]:
+def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool = False, ignore_junk: bool = False, markdown_style: str = None, tokenizer: str = 'words_and_html') -> tuple[str, bool]:
     """
     Render word-level differences between two lines inline using diff-match-patch library.
@@ -33,6 +43,7 @@ def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool
     html_colour: Use HTML background colors for differences
     ignore_junk: Ignore whitespace-only changes
     markdown_style: Unused (kept for backwards compatibility)
+    tokenizer: Name of tokenizer to use from TOKENIZERS registry (default: 'words_and_html')
 
     Returns:
         tuple[str, bool]: (diff output with inline word-level highlighting, has_changes flag)
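
Note: a hedged usage sketch based only on the signature and docstring above; the inputs are invented for illustration:

    # Word-level inline diff of two single lines; `tokenizer` names an entry
    # in the TOKENIZERS registry ('words_and_html' is the default).
    inline_html, has_changes = render_inline_word_diff(
        "Hello world",
        "Hello there",
        html_colour=True,
        tokenizer='words',
    )
    if has_changes:
        print(inline_html)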
@@ -50,42 +61,12 @@ def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool
     # Strategy: Use linesToChars to treat words as atomic units
     dmp = dmp_module.diff_match_patch()
 
-    # Split into words while preserving boundaries
-    def tokenize_with_boundaries(text):
-        """Split text into words and boundaries (spaces, HTML tags)"""
-        tokens = []
-        current = ''
-        in_tag = False
-
-        for char in text:
-            if char == '<':
-                # Start of HTML tag
-                if current:
-                    tokens.append(current)
-                    current = ''
-                current = '<'
-                in_tag = True
-            elif char == '>' and in_tag:
-                # End of HTML tag
-                current += '>'
-                tokens.append(current)
-                current = ''
-                in_tag = False
-            elif char.isspace() and not in_tag:
-                # Space outside of tag
-                if current:
-                    tokens.append(current)
-                    current = ''
-                tokens.append(char)
-            else:
-                current += char
-
-        if current:
-            tokens.append(current)
-        return tokens
-
-    before_tokens = tokenize_with_boundaries(before_normalized)
-    after_tokens = tokenize_with_boundaries(after_normalized or ' ')
+    # Get the tokenizer function from the registry
+    tokenizer_func = TOKENIZERS.get(tokenizer, tokenize_words_and_html)
+
+    # Tokenize both lines using the selected tokenizer
+    before_tokens = tokenizer_func(before_normalized)
+    after_tokens = tokenizer_func(after_normalized or ' ')
 
     # Create mappings for linesToChars (using it for word-mode)
     # Join tokens with newline so each "line" is a token
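
Note: TOKENIZERS.get(name, default) means an unrecognised tokenizer name degrades gracefully rather than raising. A quick sketch of that dispatch:

    # A typo such as 'wordz' falls back to the HTML-aware tokenizer
    # instead of raising KeyError.
    tokenizer_func = TOKENIZERS.get('wordz', tokenize_words_and_html)
    assert tokenizer_func is tokenize_words_and_html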
@@ -166,7 +147,8 @@ def customSequenceMatcher(
     word_diff: bool = False,
     context_lines: int = 0,
     case_insensitive: bool = False,
-    ignore_junk: bool = False
+    ignore_junk: bool = False,
+    tokenizer: str = 'words_and_html'
 ) -> Iterator[List[str]]:
     """
     Compare two sequences and yield differences based on specified parameters.
@@ -180,10 +162,11 @@ def customSequenceMatcher(
     include_replaced (bool): Include replaced parts
     include_change_type_prefix (bool): Add prefixes to indicate change types
     html_colour (bool): Use HTML background colors for differences
-    word_diff (bool): Use word-level diffing for replaced lines
+    word_diff (bool): Use word-level diffing for replaced lines (controls inline rendering)
     context_lines (int): Number of unchanged lines to show around changes (like grep -C)
     case_insensitive (bool): Perform case-insensitive comparison
     ignore_junk (bool): Ignore whitespace-only changes
+    tokenizer (str): Name of tokenizer to use from TOKENIZERS registry (default: 'words_and_html')
 
     Yields:
         List[str]: Differences between sequences
@@ -250,7 +233,7 @@ def customSequenceMatcher(
 
     # Use word-level diff for single line replacements when enabled
     if word_diff and len(before_lines) == 1 and len(after_lines) == 1:
-        inline_diff, has_changes = render_inline_word_diff(before_lines[0], after_lines[0], html_colour, ignore_junk)
+        inline_diff, has_changes = render_inline_word_diff(before_lines[0], after_lines[0], html_colour, ignore_junk, tokenizer=tokenizer)
         # Check if there are any actual changes (not just whitespace when ignore_junk is enabled)
         if ignore_junk and not has_changes:
             # No real changes, skip this line
@@ -284,7 +267,8 @@ def render_diff(
     word_diff: bool = True,
     context_lines: int = 0,
     case_insensitive: bool = False,
-    ignore_junk: bool = False
+    ignore_junk: bool = False,
+    tokenizer: str = 'words_and_html'
 ) -> str:
     """
     Render the difference between two file contents.
@@ -300,10 +284,11 @@ def render_diff(
     include_change_type_prefix (bool): Add prefixes to indicate change types
     patch_format (bool): Use patch format for output
     html_colour (bool): Use HTML background colors for differences
-    word_diff (bool): Use word-level diffing for replaced lines
+    word_diff (bool): Use word-level diffing for replaced lines (controls inline rendering)
     context_lines (int): Number of unchanged lines to show around changes (like grep -C)
     case_insensitive (bool): Perform case-insensitive comparison, By default the test_json_diff/process.py is case sensitive, so this follows same logic
     ignore_junk (bool): Ignore whitespace-only changes
+    tokenizer (str): Name of tokenizer to use from TOKENIZERS registry (default: 'words_and_html')
 
     Returns:
         str: Rendered difference
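
Note: a hedged example of selecting a tokenizer through the new render_diff parameter; the inputs are invented, and the keyword names come from the signature above and the call site at the top of this commit:

    # Plain-text watch: the simpler 'words' tokenizer avoids the HTML-tag
    # special-casing of the default 'words_and_html'.
    html = render_diff(
        previous_version_file_contents="price: 10 EUR",
        newest_version_file_contents="price: 12 EUR",
        html_colour=True,
        tokenizer='words',
    )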
@@ -327,7 +312,8 @@ def render_diff(
         word_diff=word_diff,
         context_lines=context_lines,
         case_insensitive=case_insensitive,
-        ignore_junk=ignore_junk
+        ignore_junk=ignore_junk,
+        tokenizer=tokenizer
     )
 
     def flatten(lst: List[Union[str, List[str]]]) -> str:
@@ -339,4 +325,13 @@ def render_diff(
             result.append(x)
         return line_feed_sep.join(result)
 
     return flatten(rendered_diff)
+
+
+# Export main public API
+__all__ = [
+    'render_diff',
+    'customSequenceMatcher',
+    'render_inline_word_diff',
+    'TOKENIZERS',
+]
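
Note: with the explicit __all__, the intended public surface of the module is importable directly, e.g.:

    # Public API pinned by __all__ above.
    from changedetectionio.diff import render_diff, customSequenceMatcher, render_inline_word_diff, TOKENIZERS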
changedetectionio/diff/tokenizers/__init__.py (new file, 24 lines)
@@ -0,0 +1,24 @@
"""
|
||||
Tokenizers for diff operations.
|
||||
|
||||
This module provides various tokenization strategies for use with the diff system.
|
||||
New tokenizers can be easily added by:
|
||||
1. Creating a new module in this directory
|
||||
2. Importing and registering it in the TOKENIZERS dictionary below
|
||||
"""
|
||||
|
||||
from .natural_text import tokenize_words
|
||||
from .words_and_html import tokenize_words_and_html
|
||||
|
||||
# Tokenizer registry - maps tokenizer names to functions
|
||||
TOKENIZERS = {
|
||||
'words': tokenize_words,
|
||||
'words_and_html': tokenize_words_and_html,
|
||||
'html_tags': tokenize_words_and_html, # Alias for backwards compatibility
|
||||
}
|
||||
|
||||
__all__ = [
|
||||
'tokenize_words',
|
||||
'tokenize_words_and_html',
|
||||
'TOKENIZERS',
|
||||
]
|
||||
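
Note: per the module docstring, adding a tokenizer is a registration in this dict. A hedged sketch with an illustrative (not shipped) sentence-level tokenizer:

    import re
    from typing import List

    def tokenize_sentences(text: str) -> List[str]:
        # Hypothetical example: split after ., ! or ? while keeping the
        # whitespace tokens, so ''.join(tokens) reconstructs the input.
        return [t for t in re.split(r'(?<=[.!?])(\s+)', text) if t]

    TOKENIZERS['sentences'] = tokenize_sentences  # name is illustrative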
changedetectionio/diff/tokenizers/natural_text.py (new file, 44 lines)
@@ -0,0 +1,44 @@
"""
|
||||
Simple word tokenizer using whitespace boundaries.
|
||||
|
||||
This is a simpler tokenizer that treats all whitespace as token boundaries
|
||||
without special handling for HTML tags or other markup.
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
|
||||
|
||||
def tokenize_words(text: str) -> List[str]:
|
||||
"""
|
||||
Split text into words using simple whitespace boundaries.
|
||||
|
||||
This is a simpler tokenizer that treats all whitespace as token boundaries
|
||||
without special handling for HTML tags.
|
||||
|
||||
Args:
|
||||
text: Input text to tokenize
|
||||
|
||||
Returns:
|
||||
List of tokens (words and whitespace)
|
||||
|
||||
Examples:
|
||||
>>> tokenize_words("Hello world")
|
||||
['Hello', ' ', 'world']
|
||||
>>> tokenize_words("one two")
|
||||
['one', ' ', ' ', 'two']
|
||||
"""
|
||||
tokens = []
|
||||
current = ''
|
||||
|
||||
for char in text:
|
||||
if char.isspace():
|
||||
if current:
|
||||
tokens.append(current)
|
||||
current = ''
|
||||
tokens.append(char)
|
||||
else:
|
||||
current += char
|
||||
|
||||
if current:
|
||||
tokens.append(current)
|
||||
return tokens
|
||||
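
Note: because whitespace characters are emitted as their own tokens, the tokenization is lossless, which the diff reconstruction relies on:

    text = "one  two\nthree"
    tokens = tokenize_words(text)
    assert tokens == ['one', ' ', ' ', 'two', '\n', 'three']
    assert ''.join(tokens) == text  # round-trips exactly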
changedetectionio/diff/tokenizers/words_and_html.py (new file, 61 lines)
@@ -0,0 +1,61 @@
"""
|
||||
Tokenizer that preserves HTML tags as atomic units while splitting on whitespace.
|
||||
|
||||
This tokenizer is specifically designed for HTML content where:
|
||||
- HTML tags should remain intact (e.g., '<p>', '<a href="...">')
|
||||
- Whitespace tokens are preserved for accurate diff reconstruction
|
||||
- Words are split on whitespace boundaries
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
|
||||
|
||||
def tokenize_words_and_html(text: str) -> List[str]:
|
||||
"""
|
||||
Split text into words and boundaries (spaces, HTML tags).
|
||||
|
||||
This tokenizer preserves HTML tags as atomic units while splitting on whitespace.
|
||||
Useful for content that contains HTML markup.
|
||||
|
||||
Args:
|
||||
text: Input text to tokenize
|
||||
|
||||
Returns:
|
||||
List of tokens (words, spaces, HTML tags)
|
||||
|
||||
Examples:
|
||||
>>> tokenize_words_and_html("<p>Hello world</p>")
|
||||
['<p>', 'Hello', ' ', 'world', '</p>']
|
||||
>>> tokenize_words_and_html("<a href='test.com'>link</a>")
|
||||
['<a href=\\'test.com\\'>', 'link', '</a>']
|
||||
"""
|
||||
tokens = []
|
||||
current = ''
|
||||
in_tag = False
|
||||
|
||||
for char in text:
|
||||
if char == '<':
|
||||
# Start of HTML tag
|
||||
if current:
|
||||
tokens.append(current)
|
||||
current = ''
|
||||
current = '<'
|
||||
in_tag = True
|
||||
elif char == '>' and in_tag:
|
||||
# End of HTML tag
|
||||
current += '>'
|
||||
tokens.append(current)
|
||||
current = ''
|
||||
in_tag = False
|
||||
elif char.isspace() and not in_tag:
|
||||
# Space outside of tag
|
||||
if current:
|
||||
tokens.append(current)
|
||||
current = ''
|
||||
tokens.append(char)
|
||||
else:
|
||||
current += char
|
||||
|
||||
if current:
|
||||
tokens.append(current)
|
||||
return tokens
|
||||
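
Note: the same losslessness holds here, and whitespace inside a tag does not split it; a quick check in the spirit of the doctests above:

    tokens = tokenize_words_and_html('<a href="test.com">link</a> here')
    assert tokens == ['<a href="test.com">', 'link', '</a>', ' ', 'here']
    assert ''.join(tokens) == '<a href="test.com">link</a> here'

One caveat of the character scan: a stray '<' that is never closed leaves in_tag set, so the remainder of the line becomes a single token; harmless for diffing, but worth knowing.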
@@ -48,6 +48,7 @@ def test_check_basic_change_detection_functionality_source(client, live_server,
     # With diff-match-patch, HTML tags are properly tokenized and excluded from diff spans
     # Only "modified" is shown as added, while <head> and <title> tags remain unchanged
     assert b'<head><title>' in res.data
+    assert b'title="Added"' in res.data
     assert b'>modified<' in res.data
     assert b'head title</title></head>' in res.data