Files
changedetection.io/changedetectionio/diff/__init__.py
dgtlmoon 961994abcf refactor
2025-10-13 17:46:53 +02:00

338 lines
15 KiB
Python

"""
Diff rendering module for change detection.
This module provides functions for rendering differences between text content,
with support for various output formats and tokenization strategies.
"""
import difflib
from typing import List, Iterator, Union
import diff_match_patch as dmp_module
import re
from .tokenizers import TOKENIZERS, tokenize_words_and_html
# Remember! gmail, outlook etc dont support <style> must be inline.
# Gmail: strips <ins> and <del> tags entirely.
REMOVED_STYLE = "background-color: #fadad7; color: #b30000;"
ADDED_STYLE = "background-color: #eaf2c2; color: #406619;"
# Diff label text formats (use {content} as placeholder)
DIFF_LABEL_TEXT_ADDED = '(added) {content}'
DIFF_LABEL_TEXT_REMOVED = '(removed) {content}'
DIFF_LABEL_TEXT_CHANGED = '(changed) {content}'
DIFF_LABEL_TEXT_INTO = '(into) {content}'
# Diff HTML label formats (use {content} as placeholder)
DIFF_HTML_LABEL_REMOVED = f'<span style="{REMOVED_STYLE}" title="Removed">{{content}}</span>'
DIFF_HTML_LABEL_ADDED = f'<span style="{ADDED_STYLE}" title="Added">{{content}}</span>'
DIFF_HTML_LABEL_REPLACED = f'<span style="{ADDED_STYLE}" title="Replaced">{{content}}</span>'
DIFF_HTML_LABEL_INSERTED = f'<span style="{ADDED_STYLE}" title="Inserted">{{content}}</span>'
# Compiled regex patterns for performance
WHITESPACE_NORMALIZE_RE = re.compile(r'\s+')
def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool = False, ignore_junk: bool = False, markdown_style: str = None, tokenizer: str = 'words_and_html') -> tuple[str, bool]:
"""
Render word-level differences between two lines inline using diff-match-patch library.
Args:
before_line: Original line text
after_line: Modified line text
html_colour: Use HTML background colors for differences
ignore_junk: Ignore whitespace-only changes
markdown_style: Unused (kept for backwards compatibility)
tokenizer: Name of tokenizer to use from TOKENIZERS registry (default: 'words_and_html')
Returns:
tuple[str, bool]: (diff output with inline word-level highlighting, has_changes flag)
"""
# Normalize whitespace if ignore_junk is enabled
if ignore_junk:
# Normalize whitespace: replace multiple spaces/tabs with single space
before_normalized = WHITESPACE_NORMALIZE_RE.sub(' ', before_line)
after_normalized = WHITESPACE_NORMALIZE_RE.sub(' ', after_line)
else:
before_normalized = before_line
after_normalized = after_line
# Use diff-match-patch with word-level tokenization
# Strategy: Use linesToChars to treat words as atomic units
dmp = dmp_module.diff_match_patch()
# Get the tokenizer function from the registry
tokenizer_func = TOKENIZERS.get(tokenizer, tokenize_words_and_html)
# Tokenize both lines using the selected tokenizer
before_tokens = tokenizer_func(before_normalized)
after_tokens = tokenizer_func(after_normalized or ' ')
# Create mappings for linesToChars (using it for word-mode)
# Join tokens with newline so each "line" is a token
before_text = '\n'.join(before_tokens)
after_text = '\n'.join(after_tokens)
# Use linesToChars for word-mode diffing
lines_result = dmp.diff_linesToChars(before_text, after_text)
line_before, line_after, line_array = lines_result
# Perform diff on the encoded strings
diffs = dmp.diff_main(line_before, line_after, False)
# Convert back to original text
dmp.diff_charsToLines(diffs, line_array)
# Remove the newlines we added for tokenization
diffs = [(op, text.replace('\n', '')) for op, text in diffs]
# Apply semantic cleanup for more human-readable diffs
dmp.diff_cleanupSemantic(diffs)
# Check if there are any changes
has_changes = any(op != 0 for op, _ in diffs)
if ignore_junk and not has_changes:
return after_line, False
# Check if the whole line is replaced (no unchanged content)
whole_line_replaced = not any(op == 0 and text.strip() for op, text in diffs)
# Build the output
result_parts = []
for op, text in diffs:
if op == 0: # Equal
result_parts.append(text)
elif op == 1: # Insertion
if html_colour:
content = text.rstrip()
trailing = text[len(content):] if len(text) > len(content) else ''
line_break = '\n' if whole_line_replaced else ''
result_parts.append(f'{DIFF_HTML_LABEL_ADDED.format(content=content)}{trailing}{line_break}')
else:
content = text.rstrip()
trailing = text[len(content):] if len(text) > len(content) else ''
line_break = '\n' if whole_line_replaced else ''
label = DIFF_LABEL_TEXT_INTO if whole_line_replaced else DIFF_LABEL_TEXT_ADDED
result_parts.append(f'{label.format(content=content)}{trailing}{line_break}')
elif op == -1: # Deletion
if html_colour:
content = text.rstrip()
trailing = text[len(content):] if len(text) > len(content) else ''
line_break = '\n' if whole_line_replaced else ''
result_parts.append(f'{DIFF_HTML_LABEL_REMOVED.format(content=content)}{trailing}{line_break}')
else:
content = text.rstrip()
trailing = text[len(content):] if len(text) > len(content) else ''
line_break = '\n' if whole_line_replaced else ''
label = DIFF_LABEL_TEXT_CHANGED if whole_line_replaced else DIFF_LABEL_TEXT_REMOVED
result_parts.append(f'{label.format(content=content)}{trailing}{line_break}')
return ''.join(result_parts), has_changes
def same_slicer(lst: List[str], start: int, end: int) -> List[str]:
"""Return a slice of the list, or a single element if start == end."""
return lst[start:end] if start != end else [lst[start]]
def customSequenceMatcher(
before: List[str],
after: List[str],
include_equal: bool = False,
include_removed: bool = True,
include_added: bool = True,
include_replaced: bool = True,
include_change_type_prefix: bool = True,
html_colour: bool = False,
word_diff: bool = False,
context_lines: int = 0,
case_insensitive: bool = False,
ignore_junk: bool = False,
tokenizer: str = 'words_and_html'
) -> Iterator[List[str]]:
"""
Compare two sequences and yield differences based on specified parameters.
Args:
before (List[str]): Original sequence
after (List[str]): Modified sequence
include_equal (bool): Include unchanged parts
include_removed (bool): Include removed parts
include_added (bool): Include added parts
include_replaced (bool): Include replaced parts
include_change_type_prefix (bool): Add prefixes to indicate change types
html_colour (bool): Use HTML background colors for differences
word_diff (bool): Use word-level diffing for replaced lines (controls inline rendering)
context_lines (int): Number of unchanged lines to show around changes (like grep -C)
case_insensitive (bool): Perform case-insensitive comparison
ignore_junk (bool): Ignore whitespace-only changes
tokenizer (str): Name of tokenizer to use from TOKENIZERS registry (default: 'words_and_html')
Yields:
List[str]: Differences between sequences
"""
# Prepare sequences for comparison (lowercase if case-insensitive, normalize whitespace if ignore_junk)
def prepare_line(line):
if case_insensitive:
line = line.lower()
if ignore_junk:
# Normalize whitespace: replace multiple spaces/tabs with single space
line = WHITESPACE_NORMALIZE_RE.sub(' ', line)
return line
compare_before = [prepare_line(line) for line in before]
compare_after = [prepare_line(line) for line in after]
cruncher = difflib.SequenceMatcher(isjunk=lambda x: x in " \t", a=compare_before, b=compare_after)
# When context_lines is set and include_equal is False, we need to track which equal lines to include
if context_lines > 0 and not include_equal:
opcodes = list(cruncher.get_opcodes())
# Mark equal ranges that should be included based on context
included_equal_ranges = set()
for i, (tag, alo, ahi, blo, bhi) in enumerate(opcodes):
if tag != 'equal':
# Include context lines before this change
for j in range(max(0, i - 1), i):
if opcodes[j][0] == 'equal':
prev_alo, prev_ahi = opcodes[j][1], opcodes[j][2]
# Include last N lines of the previous equal block
context_start = max(prev_alo, prev_ahi - context_lines)
for line_num in range(context_start, prev_ahi):
included_equal_ranges.add(line_num)
# Include context lines after this change
for j in range(i + 1, min(len(opcodes), i + 2)):
if opcodes[j][0] == 'equal':
next_alo, next_ahi = opcodes[j][1], opcodes[j][2]
# Include first N lines of the next equal block
context_end = min(next_ahi, next_alo + context_lines)
for line_num in range(next_alo, context_end):
included_equal_ranges.add(line_num)
# Remember! gmail, outlook etc dont support <style> must be inline.
# Gmail: strips <ins> and <del> tags entirely.
for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
if tag == 'equal':
if include_equal:
yield before[alo:ahi]
elif context_lines > 0:
# Only include equal lines that are in the context range
context_lines_to_include = [before[i] for i in range(alo, ahi) if i in included_equal_ranges]
if context_lines_to_include:
yield context_lines_to_include
elif include_removed and tag == 'delete':
if html_colour:
yield [DIFF_HTML_LABEL_REMOVED.format(content=line) for line in same_slicer(before, alo, ahi)]
else:
yield [DIFF_LABEL_TEXT_REMOVED.format(content=line) for line in same_slicer(before, alo, ahi)] if include_change_type_prefix else same_slicer(before, alo, ahi)
elif include_replaced and tag == 'replace':
before_lines = same_slicer(before, alo, ahi)
after_lines = same_slicer(after, blo, bhi)
# Use word-level diff for single line replacements when enabled
if word_diff and len(before_lines) == 1 and len(after_lines) == 1:
inline_diff, has_changes = render_inline_word_diff(before_lines[0], after_lines[0], html_colour, ignore_junk, tokenizer=tokenizer)
# Check if there are any actual changes (not just whitespace when ignore_junk is enabled)
if ignore_junk and not has_changes:
# No real changes, skip this line
continue
yield [inline_diff]
else:
# Fall back to line-level diff for multi-line changes or when word_diff disabled
if html_colour:
yield [DIFF_HTML_LABEL_REMOVED.format(content=line) for line in before_lines] + \
[DIFF_HTML_LABEL_REPLACED.format(content=line) for line in after_lines]
else:
yield [DIFF_LABEL_TEXT_CHANGED.format(content=line) for line in before_lines] + \
[DIFF_LABEL_TEXT_INTO.format(content=line) for line in after_lines] if include_change_type_prefix else before_lines + after_lines
elif include_added and tag == 'insert':
if html_colour:
yield [DIFF_HTML_LABEL_INSERTED.format(content=line) for line in same_slicer(after, blo, bhi)]
else:
yield [DIFF_LABEL_TEXT_ADDED.format(content=line) for line in same_slicer(after, blo, bhi)] if include_change_type_prefix else same_slicer(after, blo, bhi)
def render_diff(
previous_version_file_contents: str,
newest_version_file_contents: str,
include_equal: bool = False,
include_removed: bool = True,
include_added: bool = True,
include_replaced: bool = True,
line_feed_sep: str = "\n",
include_change_type_prefix: bool = True,
patch_format: bool = False,
html_colour: bool = False,
word_diff: bool = True,
context_lines: int = 0,
case_insensitive: bool = False,
ignore_junk: bool = False,
tokenizer: str = 'words_and_html'
) -> str:
"""
Render the difference between two file contents.
Args:
previous_version_file_contents (str): Original file contents
newest_version_file_contents (str): Modified file contents
include_equal (bool): Include unchanged parts
include_removed (bool): Include removed parts
include_added (bool): Include added parts
include_replaced (bool): Include replaced parts
line_feed_sep (str): Separator for lines in output
include_change_type_prefix (bool): Add prefixes to indicate change types
patch_format (bool): Use patch format for output
html_colour (bool): Use HTML background colors for differences
word_diff (bool): Use word-level diffing for replaced lines (controls inline rendering)
context_lines (int): Number of unchanged lines to show around changes (like grep -C)
case_insensitive (bool): Perform case-insensitive comparison, By default the test_json_diff/process.py is case sensitive, so this follows same logic
ignore_junk (bool): Ignore whitespace-only changes
tokenizer (str): Name of tokenizer to use from TOKENIZERS registry (default: 'words_and_html')
Returns:
str: Rendered difference
"""
newest_lines = [line.rstrip() for line in newest_version_file_contents.splitlines()]
previous_lines = [line.rstrip() for line in previous_version_file_contents.splitlines()] if previous_version_file_contents else []
if patch_format:
patch = difflib.unified_diff(previous_lines, newest_lines)
return line_feed_sep.join(patch)
rendered_diff = customSequenceMatcher(
before=previous_lines,
after=newest_lines,
include_equal=include_equal,
include_removed=include_removed,
include_added=include_added,
include_replaced=include_replaced,
include_change_type_prefix=include_change_type_prefix,
html_colour=html_colour,
word_diff=word_diff,
context_lines=context_lines,
case_insensitive=case_insensitive,
ignore_junk=ignore_junk,
tokenizer=tokenizer
)
def flatten(lst: List[Union[str, List[str]]]) -> str:
result = []
for x in lst:
if isinstance(x, list):
result.extend(x)
else:
result.append(x)
return line_feed_sep.join(result)
return flatten(rendered_diff)
# Export main public API
__all__ = [
'render_diff',
'customSequenceMatcher',
'render_inline_word_diff',
'TOKENIZERS',
]