dgtlmoon
2025-10-13 17:46:53 +02:00
parent 2709ba6772
commit 961994abcf
6 changed files with 171 additions and 43 deletions

View File

@@ -240,6 +240,9 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
     content = diff.render_diff(previous_version_file_contents=from_version_file_contents,
                                newest_version_file_contents=to_version_file_contents,
+                               # include_removed=diff_prefs.get('removed'),
+                               # include_added=diff_prefs.get('added'),
+                               # include_replaced=diff_prefs.get('replaced'),
                                html_colour=True,
                                ignore_junk=diff_prefs.get('ignoreWhitespace'),
                                include_equal=not diff_prefs.get('changesOnly'),
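
A minimal sketch of what this call site does, runnable outside Flask. The import path and the sample preference values are assumptions; the keyword arguments mirror the call above:

from changedetectionio import diff  # import path assumed

diff_prefs = {'ignoreWhitespace': True, 'changesOnly': False}  # sample prefs
content = diff.render_diff(
    previous_version_file_contents="price: 10 EUR\n",
    newest_version_file_contents="price: 12 EUR\n",
    html_colour=True,
    ignore_junk=diff_prefs.get('ignoreWhitespace'),
    include_equal=not diff_prefs.get('changesOnly'),
)
print(content)  # HTML with inline word-level <span> highlights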

View File

@@ -1,8 +1,17 @@
"""
Diff rendering module for change detection.
This module provides functions for rendering differences between text content,
with support for various output formats and tokenization strategies.
"""
import difflib import difflib
from typing import List, Iterator, Union from typing import List, Iterator, Union
import diff_match_patch as dmp_module import diff_match_patch as dmp_module
import re import re
from .tokenizers import TOKENIZERS, tokenize_words_and_html
# Remember! gmail, outlook etc dont support <style> must be inline. # Remember! gmail, outlook etc dont support <style> must be inline.
# Gmail: strips <ins> and <del> tags entirely. # Gmail: strips <ins> and <del> tags entirely.
REMOVED_STYLE = "background-color: #fadad7; color: #b30000;" REMOVED_STYLE = "background-color: #fadad7; color: #b30000;"
@@ -23,7 +32,8 @@ DIFF_HTML_LABEL_INSERTED = f'<span style="{ADDED_STYLE}" title="Inserted">{{cont
 # Compiled regex patterns for performance
 WHITESPACE_NORMALIZE_RE = re.compile(r'\s+')

-def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool = False, ignore_junk: bool = False, markdown_style: str = None) -> tuple[str, bool]:
+def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool = False, ignore_junk: bool = False, markdown_style: str = None, tokenizer: str = 'words_and_html') -> tuple[str, bool]:
     """
     Render word-level differences between two lines inline using diff-match-patch library.
@@ -33,6 +43,7 @@ def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool
         html_colour: Use HTML background colors for differences
         ignore_junk: Ignore whitespace-only changes
         markdown_style: Unused (kept for backwards compatibility)
+        tokenizer: Name of tokenizer to use from TOKENIZERS registry (default: 'words_and_html')

     Returns:
         tuple[str, bool]: (diff output with inline word-level highlighting, has_changes flag)
@@ -50,42 +61,12 @@ def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool
     # Strategy: Use linesToChars to treat words as atomic units
     dmp = dmp_module.diff_match_patch()

-    # Split into words while preserving boundaries
-    def tokenize_with_boundaries(text):
-        """Split text into words and boundaries (spaces, HTML tags)"""
-        tokens = []
-        current = ''
-        in_tag = False
-        for char in text:
-            if char == '<':
-                # Start of HTML tag
-                if current:
-                    tokens.append(current)
-                    current = ''
-                current = '<'
-                in_tag = True
-            elif char == '>' and in_tag:
-                # End of HTML tag
-                current += '>'
-                tokens.append(current)
-                current = ''
-                in_tag = False
-            elif char.isspace() and not in_tag:
-                # Space outside of tag
-                if current:
-                    tokens.append(current)
-                    current = ''
-                tokens.append(char)
-            else:
-                current += char
-        if current:
-            tokens.append(current)
-        return tokens
-
-    before_tokens = tokenize_with_boundaries(before_normalized)
-    after_tokens = tokenize_with_boundaries(after_normalized or ' ')
+    # Get the tokenizer function from the registry
+    tokenizer_func = TOKENIZERS.get(tokenizer, tokenize_words_and_html)
+
+    # Tokenize both lines using the selected tokenizer
+    before_tokens = tokenizer_func(before_normalized)
+    after_tokens = tokenizer_func(after_normalized or ' ')

     # Create mappings for linesToChars (using it for word-mode)
     # Join tokens with newline so each "line" is a token
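
The "linesToChars" strategy named above is a standard diff-match-patch idiom: join the tokens with a separator so that each token is treated as one atomic "line". A standalone sketch, assuming token lists contain no newlines (which the whitespace normalizer guarantees here):

import diff_match_patch as dmp_module

def word_level_diff(before_tokens, after_tokens):
    """Diff two token lists, treating each token as an atomic unit."""
    dmp = dmp_module.diff_match_patch()
    # Join tokens with '\n' so diff_linesToChars maps each token to one char
    chars1, chars2, token_array = dmp.diff_linesToChars(
        '\n'.join(before_tokens), '\n'.join(after_tokens))
    diffs = dmp.diff_main(chars1, chars2, False)
    dmp.diff_charsToLines(diffs, token_array)  # mutates diffs in place
    # Strip the '\n' separators again; the tokens themselves have no newlines
    return [(op, text.replace('\n', '')) for op, text in diffs]

print(word_level_diff(['Hello', ' ', 'world'], ['Hello', ' ', 'there']))
# roughly: [(0, 'Hello '), (-1, 'world'), (1, 'there')]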
@@ -166,7 +147,8 @@ def customSequenceMatcher(
     word_diff: bool = False,
     context_lines: int = 0,
     case_insensitive: bool = False,
-    ignore_junk: bool = False
+    ignore_junk: bool = False,
+    tokenizer: str = 'words_and_html'
 ) -> Iterator[List[str]]:
     """
     Compare two sequences and yield differences based on specified parameters.
@@ -180,10 +162,11 @@ def customSequenceMatcher(
         include_replaced (bool): Include replaced parts
         include_change_type_prefix (bool): Add prefixes to indicate change types
         html_colour (bool): Use HTML background colors for differences
-        word_diff (bool): Use word-level diffing for replaced lines
+        word_diff (bool): Use word-level diffing for replaced lines (controls inline rendering)
         context_lines (int): Number of unchanged lines to show around changes (like grep -C)
         case_insensitive (bool): Perform case-insensitive comparison
         ignore_junk (bool): Ignore whitespace-only changes
+        tokenizer (str): Name of tokenizer to use from TOKENIZERS registry (default: 'words_and_html')

     Yields:
         List[str]: Differences between sequences
@@ -250,7 +233,7 @@ def customSequenceMatcher(
             # Use word-level diff for single line replacements when enabled
             if word_diff and len(before_lines) == 1 and len(after_lines) == 1:
-                inline_diff, has_changes = render_inline_word_diff(before_lines[0], after_lines[0], html_colour, ignore_junk)
+                inline_diff, has_changes = render_inline_word_diff(before_lines[0], after_lines[0], html_colour, ignore_junk, tokenizer=tokenizer)
                 # Check if there are any actual changes (not just whitespace when ignore_junk is enabled)
                 if ignore_junk and not has_changes:
                     # No real changes, skip this line
@@ -284,7 +267,8 @@ def render_diff(
     word_diff: bool = True,
     context_lines: int = 0,
     case_insensitive: bool = False,
-    ignore_junk: bool = False
+    ignore_junk: bool = False,
+    tokenizer: str = 'words_and_html'
 ) -> str:
     """
     Render the difference between two file contents.
@@ -300,10 +284,11 @@ def render_diff(
         include_change_type_prefix (bool): Add prefixes to indicate change types
         patch_format (bool): Use patch format for output
         html_colour (bool): Use HTML background colors for differences
-        word_diff (bool): Use word-level diffing for replaced lines
+        word_diff (bool): Use word-level diffing for replaced lines (controls inline rendering)
         context_lines (int): Number of unchanged lines to show around changes (like grep -C)
         case_insensitive (bool): Perform case-insensitive comparison, By default the test_json_diff/process.py is case sensitive, so this follows same logic
         ignore_junk (bool): Ignore whitespace-only changes
+        tokenizer (str): Name of tokenizer to use from TOKENIZERS registry (default: 'words_and_html')

     Returns:
         str: Rendered difference
@@ -327,7 +312,8 @@ def render_diff(
         word_diff=word_diff,
         context_lines=context_lines,
         case_insensitive=case_insensitive,
-        ignore_junk=ignore_junk
+        ignore_junk=ignore_junk,
+        tokenizer=tokenizer
     )

     def flatten(lst: List[Union[str, List[str]]]) -> str:
@@ -339,4 +325,13 @@ def render_diff(
                 result.append(x)
         return line_feed_sep.join(result)

     return flatten(rendered_diff)
+
+
+# Export main public API
+__all__ = [
+    'render_diff',
+    'customSequenceMatcher',
+    'render_inline_word_diff',
+    'TOKENIZERS',
+]
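
With the parameter now threaded end to end, selecting a tokenizer is a one-argument change at the public API. A hypothetical call (import path assumed):

from changedetectionio.diff import render_diff  # import path assumed

html = render_diff(
    previous_version_file_contents="The quick brown fox\n",
    newest_version_file_contents="The quick red fox\n",
    html_colour=True,
    word_diff=True,
    tokenizer='words',  # plain-text tokenizer; 'words_and_html' is the default
)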

View File

@@ -0,0 +1,24 @@
"""
Tokenizers for diff operations.
This module provides various tokenization strategies for use with the diff system.
New tokenizers can be easily added by:
1. Creating a new module in this directory
2. Importing and registering it in the TOKENIZERS dictionary below
"""
from .natural_text import tokenize_words
from .words_and_html import tokenize_words_and_html
# Tokenizer registry - maps tokenizer names to functions
TOKENIZERS = {
'words': tokenize_words,
'words_and_html': tokenize_words_and_html,
'html_tags': tokenize_words_and_html, # Alias for backwards compatibility
}
__all__ = [
'tokenize_words',
'tokenize_words_and_html',
'TOKENIZERS',
]
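
Step 2 of the docstring in practice: a hypothetical character-level tokenizer registered alongside the built-ins. The 'chars' name and the import path are illustrative only:

from typing import List
from changedetectionio.diff.tokenizers import TOKENIZERS  # path assumed

def tokenize_chars(text: str) -> List[str]:
    """Treat every character as its own token (finest possible granularity)."""
    return list(text)

TOKENIZERS['chars'] = tokenize_chars
# now selectable upstream via render_diff(..., tokenizer='chars')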

View File

@@ -0,0 +1,44 @@
"""
Simple word tokenizer using whitespace boundaries.
This is a simpler tokenizer that treats all whitespace as token boundaries
without special handling for HTML tags or other markup.
"""
from typing import List
def tokenize_words(text: str) -> List[str]:
"""
Split text into words using simple whitespace boundaries.
This is a simpler tokenizer that treats all whitespace as token boundaries
without special handling for HTML tags.
Args:
text: Input text to tokenize
Returns:
List of tokens (words and whitespace)
Examples:
>>> tokenize_words("Hello world")
['Hello', ' ', 'world']
>>> tokenize_words("one two")
['one', ' ', ' ', 'two']
"""
tokens = []
current = ''
for char in text:
if char.isspace():
if current:
tokens.append(current)
current = ''
tokens.append(char)
else:
current += char
if current:
tokens.append(current)
return tokens
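
One property worth noting: the tokenizer is lossless, so joining the tokens reconstructs the input exactly, which is what lets the diff renderer rebuild lines faithfully. A quick check (import path assumed):

from changedetectionio.diff.tokenizers import tokenize_words  # path assumed

text = "one  two\tthree"
tokens = tokenize_words(text)
assert ''.join(tokens) == text  # round-trip is exact
print(tokens)  # ['one', ' ', ' ', 'two', '\t', 'three']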

View File

@@ -0,0 +1,61 @@
"""
Tokenizer that preserves HTML tags as atomic units while splitting on whitespace.
This tokenizer is specifically designed for HTML content where:
- HTML tags should remain intact (e.g., '<p>', '<a href="...">')
- Whitespace tokens are preserved for accurate diff reconstruction
- Words are split on whitespace boundaries
"""
from typing import List
def tokenize_words_and_html(text: str) -> List[str]:
"""
Split text into words and boundaries (spaces, HTML tags).
This tokenizer preserves HTML tags as atomic units while splitting on whitespace.
Useful for content that contains HTML markup.
Args:
text: Input text to tokenize
Returns:
List of tokens (words, spaces, HTML tags)
Examples:
>>> tokenize_words_and_html("<p>Hello world</p>")
['<p>', 'Hello', ' ', 'world', '</p>']
>>> tokenize_words_and_html("<a href='test.com'>link</a>")
['<a href=\\'test.com\\'>', 'link', '</a>']
"""
tokens = []
current = ''
in_tag = False
for char in text:
if char == '<':
# Start of HTML tag
if current:
tokens.append(current)
current = ''
current = '<'
in_tag = True
elif char == '>' and in_tag:
# End of HTML tag
current += '>'
tokens.append(current)
current = ''
in_tag = False
elif char.isspace() and not in_tag:
# Space outside of tag
if current:
tokens.append(current)
current = ''
tokens.append(char)
else:
current += char
if current:
tokens.append(current)
return tokens
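
The same lossless round-trip property holds here, with the addition that whitespace inside a tag never splits it. A quick check (import path assumed):

from changedetectionio.diff.tokenizers import tokenize_words_and_html  # path assumed

html = '<a href="index.html">go home</a>'
tokens = tokenize_words_and_html(html)
assert ''.join(tokens) == html  # tags stay atomic, nothing is lost
print(tokens)  # ['<a href="index.html">', 'go', ' ', 'home', '</a>']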

View File

@@ -48,6 +48,7 @@ def test_check_basic_change_detection_functionality_source(client, live_server,
     # With diff-match-patch, HTML tags are properly tokenized and excluded from diff spans
     # Only "modified" is shown as added, while <head> and <title> tags remain unchanged
     assert b'&lt;head&gt;&lt;title&gt;' in res.data
+    assert b'title="Added"' in res.data
     assert b'>modified<' in res.data
     assert b'head title&lt;/title&gt;&lt;/head&gt;' in res.data