Mirror of https://github.com/dgtlmoon/changedetection.io.git (synced 2025-12-16 21:18:15 +00:00)

Commit: refactor
@@ -240,6 +240,9 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
     content = diff.render_diff(previous_version_file_contents=from_version_file_contents,
                                newest_version_file_contents=to_version_file_contents,
+                               # include_removed=diff_prefs.get('removed'),
+                               # include_added=diff_prefs.get('added'),
+                               # include_replaced=diff_prefs.get('replaced'),
                                html_colour=True,
                                ignore_junk=diff_prefs.get('ignoreWhitespace'),
                                include_equal=not diff_prefs.get('changesOnly'),
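Note: after this refactor the blueprint could also select a tokenizer by name at this call site. A minimal sketch, assuming the same diff_prefs dict as above; the tokenizer kwarg and its accepted values come from the render_diff hunks further down:

    content = diff.render_diff(previous_version_file_contents=from_version_file_contents,
                               newest_version_file_contents=to_version_file_contents,
                               html_colour=True,
                               ignore_junk=diff_prefs.get('ignoreWhitespace'),
                               include_equal=not diff_prefs.get('changesOnly'),
                               tokenizer='words_and_html')  # or 'words' for the plain-text tokenizer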
@@ -1,8 +1,17 @@
+"""
+Diff rendering module for change detection.
+
+This module provides functions for rendering differences between text content,
+with support for various output formats and tokenization strategies.
+"""
+
 import difflib
 from typing import List, Iterator, Union
 import diff_match_patch as dmp_module
 import re
 
+from .tokenizers import TOKENIZERS, tokenize_words_and_html
+
 # Remember! gmail, outlook etc dont support <style> must be inline.
 # Gmail: strips <ins> and <del> tags entirely.
 REMOVED_STYLE = "background-color: #fadad7; color: #b30000;"
@@ -23,7 +32,8 @@ DIFF_HTML_LABEL_INSERTED = f'<span style="{ADDED_STYLE}" title="Inserted">{{cont
 # Compiled regex patterns for performance
 WHITESPACE_NORMALIZE_RE = re.compile(r'\s+')
 
-def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool = False, ignore_junk: bool = False, markdown_style: str = None) -> tuple[str, bool]:
+
+def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool = False, ignore_junk: bool = False, markdown_style: str = None, tokenizer: str = 'words_and_html') -> tuple[str, bool]:
     """
     Render word-level differences between two lines inline using diff-match-patch library.
 
@@ -33,6 +43,7 @@ def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool
         html_colour: Use HTML background colors for differences
         ignore_junk: Ignore whitespace-only changes
         markdown_style: Unused (kept for backwards compatibility)
+        tokenizer: Name of tokenizer to use from TOKENIZERS registry (default: 'words_and_html')
 
     Returns:
         tuple[str, bool]: (diff output with inline word-level highlighting, has_changes flag)
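Note: a minimal usage sketch of the changed signature. The two input lines are made up; the return shape (rendered string plus has_changes flag) and the tokenizer names come from the hunks above:

    html, has_changes = render_inline_word_diff(
        'Price: 10 EUR',
        'Price: 12 EUR',
        html_colour=True,
        tokenizer='words',  # simple whitespace tokenizer from the registry
    )
    if has_changes:
        print(html)  # the line with inline <span> highlighting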
@@ -50,42 +61,12 @@ def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool
     # Strategy: Use linesToChars to treat words as atomic units
     dmp = dmp_module.diff_match_patch()
 
-    # Split into words while preserving boundaries
-    def tokenize_with_boundaries(text):
-        """Split text into words and boundaries (spaces, HTML tags)"""
-        tokens = []
-        current = ''
-        in_tag = False
-
-        for char in text:
-            if char == '<':
-                # Start of HTML tag
-                if current:
-                    tokens.append(current)
-                    current = ''
-                current = '<'
-                in_tag = True
-            elif char == '>' and in_tag:
-                # End of HTML tag
-                current += '>'
-                tokens.append(current)
-                current = ''
-                in_tag = False
-            elif char.isspace() and not in_tag:
-                # Space outside of tag
-                if current:
-                    tokens.append(current)
-                    current = ''
-                tokens.append(char)
-            else:
-                current += char
-
-        if current:
-            tokens.append(current)
-        return tokens
-
-    before_tokens = tokenize_with_boundaries(before_normalized)
-    after_tokens = tokenize_with_boundaries(after_normalized or ' ')
+    # Get the tokenizer function from the registry
+    tokenizer_func = TOKENIZERS.get(tokenizer, tokenize_words_and_html)
+
+    # Tokenize both lines using the selected tokenizer
+    before_tokens = tokenizer_func(before_normalized)
+    after_tokens = tokenizer_func(after_normalized or ' ')
 
     # Create mappings for linesToChars (using it for word-mode)
     # Join tokens with newline so each "line" is a token
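Note: the "words as atomic units" strategy above leans on diff-match-patch's line-mode machinery. A sketch of the idea, assuming tokens joined with newlines so that each token is treated as one "line" (diff_linesToChars, diff_main and diff_charsToLines are the library's real methods; the helper name is hypothetical):

    import diff_match_patch as dmp_module

    def word_mode_diff(before_tokens, after_tokens):
        dmp = dmp_module.diff_match_patch()
        # Each token becomes one "line", so the diff operates on whole tokens.
        chars1, chars2, token_array = dmp.diff_linesToChars(
            '\n'.join(before_tokens), '\n'.join(after_tokens))
        diffs = dmp.diff_main(chars1, chars2, False)
        dmp.diff_charsToLines(diffs, token_array)  # rewrites diffs in place
        # Strip the artificial newlines to recover contiguous text segments.
        return [(op, segment.replace('\n', '')) for op, segment in diffs]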
@@ -166,7 +147,8 @@ def customSequenceMatcher(
     word_diff: bool = False,
     context_lines: int = 0,
     case_insensitive: bool = False,
-    ignore_junk: bool = False
+    ignore_junk: bool = False,
+    tokenizer: str = 'words_and_html'
 ) -> Iterator[List[str]]:
     """
     Compare two sequences and yield differences based on specified parameters.
@@ -180,10 +162,11 @@ def customSequenceMatcher(
         include_replaced (bool): Include replaced parts
         include_change_type_prefix (bool): Add prefixes to indicate change types
         html_colour (bool): Use HTML background colors for differences
-        word_diff (bool): Use word-level diffing for replaced lines
+        word_diff (bool): Use word-level diffing for replaced lines (controls inline rendering)
         context_lines (int): Number of unchanged lines to show around changes (like grep -C)
         case_insensitive (bool): Perform case-insensitive comparison
        ignore_junk (bool): Ignore whitespace-only changes
+        tokenizer (str): Name of tokenizer to use from TOKENIZERS registry (default: 'words_and_html')
 
     Yields:
         List[str]: Differences between sequences
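Note: a hypothetical call showing how the new parameter threads through customSequenceMatcher. The before/after argument names are assumptions; the remaining kwargs mirror the docstring above:

    for chunk in customSequenceMatcher(before=before_lines,
                                       after=after_lines,
                                       include_equal=False,
                                       word_diff=True,
                                       ignore_junk=True,
                                       tokenizer='words_and_html'):
        print('\n'.join(chunk))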
@@ -250,7 +233,7 @@ def customSequenceMatcher(
 
             # Use word-level diff for single line replacements when enabled
             if word_diff and len(before_lines) == 1 and len(after_lines) == 1:
-                inline_diff, has_changes = render_inline_word_diff(before_lines[0], after_lines[0], html_colour, ignore_junk)
+                inline_diff, has_changes = render_inline_word_diff(before_lines[0], after_lines[0], html_colour, ignore_junk, tokenizer=tokenizer)
                 # Check if there are any actual changes (not just whitespace when ignore_junk is enabled)
                 if ignore_junk and not has_changes:
                     # No real changes, skip this line
@@ -284,7 +267,8 @@ def render_diff(
     word_diff: bool = True,
     context_lines: int = 0,
     case_insensitive: bool = False,
-    ignore_junk: bool = False
+    ignore_junk: bool = False,
+    tokenizer: str = 'words_and_html'
 ) -> str:
     """
     Render the difference between two file contents.
@@ -300,10 +284,11 @@ def render_diff(
         include_change_type_prefix (bool): Add prefixes to indicate change types
         patch_format (bool): Use patch format for output
         html_colour (bool): Use HTML background colors for differences
-        word_diff (bool): Use word-level diffing for replaced lines
+        word_diff (bool): Use word-level diffing for replaced lines (controls inline rendering)
         context_lines (int): Number of unchanged lines to show around changes (like grep -C)
         case_insensitive (bool): Perform case-insensitive comparison, By default the test_json_diff/process.py is case sensitive, so this follows same logic
         ignore_junk (bool): Ignore whitespace-only changes
+        tokenizer (str): Name of tokenizer to use from TOKENIZERS registry (default: 'words_and_html')
 
     Returns:
         str: Rendered difference
@@ -327,7 +312,8 @@ def render_diff(
         word_diff=word_diff,
         context_lines=context_lines,
         case_insensitive=case_insensitive,
-        ignore_junk=ignore_junk
+        ignore_junk=ignore_junk,
+        tokenizer=tokenizer
     )
 
     def flatten(lst: List[Union[str, List[str]]]) -> str:
@@ -339,4 +325,13 @@ def render_diff(
             result.append(x)
         return line_feed_sep.join(result)
 
     return flatten(rendered_diff)
+
+
+# Export main public API
+__all__ = [
+    'render_diff',
+    'customSequenceMatcher',
+    'render_inline_word_diff',
+    'TOKENIZERS',
+]
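Note: with the export list above, downstream code can import the public API directly. A small sketch; the package path follows from the relative imports in this diff:

    from changedetectionio.diff import render_diff, TOKENIZERS

    print(sorted(TOKENIZERS))  # ['html_tags', 'words', 'words_and_html']
    out = render_diff(previous_version_file_contents='a b c',
                      newest_version_file_contents='a B c',
                      tokenizer='words')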
changedetectionio/diff/tokenizers/__init__.py (new file, 24 lines)
@@ -0,0 +1,24 @@
+"""
+Tokenizers for diff operations.
+
+This module provides various tokenization strategies for use with the diff system.
+New tokenizers can be easily added by:
+1. Creating a new module in this directory
+2. Importing and registering it in the TOKENIZERS dictionary below
+"""
+
+from .natural_text import tokenize_words
+from .words_and_html import tokenize_words_and_html
+
+# Tokenizer registry - maps tokenizer names to functions
+TOKENIZERS = {
+    'words': tokenize_words,
+    'words_and_html': tokenize_words_and_html,
+    'html_tags': tokenize_words_and_html,  # Alias for backwards compatibility
+}
+
+__all__ = [
+    'tokenize_words',
+    'tokenize_words_and_html',
+    'TOKENIZERS',
+]
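Note: following the two steps in the docstring above, a hypothetical third tokenizer could be added like this (the module name, regex and registry key are all made up for illustration; a real tokenizer should preserve every input character so the token stream reconstructs the text):

    # changedetectionio/diff/tokenizers/sentences.py (hypothetical)
    import re
    from typing import List

    _SENTENCE_RE = re.compile(r'[^.!?]+[.!?]?\s*')

    def tokenize_sentences(text: str) -> List[str]:
        """Treat whole sentences as atomic diff tokens."""
        return _SENTENCE_RE.findall(text)

    # ...then in tokenizers/__init__.py:
    #   from .sentences import tokenize_sentences
    #   TOKENIZERS['sentences'] = tokenize_sentences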
changedetectionio/diff/tokenizers/natural_text.py (new file, 44 lines)
@@ -0,0 +1,44 @@
+"""
+Simple word tokenizer using whitespace boundaries.
+
+This is a simpler tokenizer that treats all whitespace as token boundaries
+without special handling for HTML tags or other markup.
+"""
+
+from typing import List
+
+
+def tokenize_words(text: str) -> List[str]:
+    """
+    Split text into words using simple whitespace boundaries.
+
+    This is a simpler tokenizer that treats all whitespace as token boundaries
+    without special handling for HTML tags.
+
+    Args:
+        text: Input text to tokenize
+
+    Returns:
+        List of tokens (words and whitespace)
+
+    Examples:
+        >>> tokenize_words("Hello world")
+        ['Hello', ' ', 'world']
+        >>> tokenize_words("one  two")
+        ['one', ' ', ' ', 'two']
+    """
+    tokens = []
+    current = ''
+
+    for char in text:
+        if char.isspace():
+            if current:
+                tokens.append(current)
+                current = ''
+            tokens.append(char)
+        else:
+            current += char
+
+    if current:
+        tokens.append(current)
+    return tokens
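Note: this tokenizer keeps every input character (each whitespace character becomes its own token), so joining the tokens reproduces the original text exactly, which the diff renderer relies on when reassembling highlighted lines:

    assert ''.join(tokenize_words('one  two\nthree')) == 'one  two\nthree'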
changedetectionio/diff/tokenizers/words_and_html.py (new file, 61 lines)
@@ -0,0 +1,61 @@
+"""
+Tokenizer that preserves HTML tags as atomic units while splitting on whitespace.
+
+This tokenizer is specifically designed for HTML content where:
+- HTML tags should remain intact (e.g., '<p>', '<a href="...">')
+- Whitespace tokens are preserved for accurate diff reconstruction
+- Words are split on whitespace boundaries
+"""
+
+from typing import List
+
+
+def tokenize_words_and_html(text: str) -> List[str]:
+    """
+    Split text into words and boundaries (spaces, HTML tags).
+
+    This tokenizer preserves HTML tags as atomic units while splitting on whitespace.
+    Useful for content that contains HTML markup.
+
+    Args:
+        text: Input text to tokenize
+
+    Returns:
+        List of tokens (words, spaces, HTML tags)
+
+    Examples:
+        >>> tokenize_words_and_html("<p>Hello world</p>")
+        ['<p>', 'Hello', ' ', 'world', '</p>']
+        >>> tokenize_words_and_html("<a href='test.com'>link</a>")
+        ['<a href=\'test.com\'>', 'link', '</a>']
+    """
+    tokens = []
+    current = ''
+    in_tag = False
+
+    for char in text:
+        if char == '<':
+            # Start of HTML tag
+            if current:
+                tokens.append(current)
+                current = ''
+            current = '<'
+            in_tag = True
+        elif char == '>' and in_tag:
+            # End of HTML tag
+            current += '>'
+            tokens.append(current)
+            current = ''
+            in_tag = False
+        elif char.isspace() and not in_tag:
+            # Space outside of tag
+            if current:
+                tokens.append(current)
+                current = ''
+            tokens.append(char)
+        else:
+            current += char
+
+    if current:
+        tokens.append(current)
+    return tokens
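Note: two edge cases that follow directly from the loop above, useful when reasoning about malformed markup (assertions written here for illustration only):

    # Whitespace inside a tag stays within the tag token.
    assert tokenize_words_and_html('<a href="x">hi</a>') == ['<a href="x">', 'hi', '</a>']
    # A stray '<' that is never closed by '>' simply accumulates into the current token.
    assert tokenize_words_and_html('a < b') == ['a', ' ', '< b']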
@@ -48,6 +48,7 @@ def test_check_basic_change_detection_functionality_source(client, live_server,
     # With diff-match-patch, HTML tags are properly tokenized and excluded from diff spans
     # Only "modified" is shown as added, while <head> and <title> tags remain unchanged
     assert b'<head><title>' in res.data
+
     assert b'title="Added"' in res.data
     assert b'>modified<' in res.data
     assert b'head title</title></head>' in res.data