changedetection.io/changedetectionio/diff.py

import difflib
from typing import List, Iterator, Union
import diff_match_patch as dmp_module
import re

# Remember! gmail, outlook etc dont support <style> must be inline.
# Gmail: strips <ins> and <del> tags entirely.
REMOVED_STYLE = "background-color: #fadad7; color: #b30000;"
ADDED_STYLE = "background-color: #eaf2c2; color: #406619;"

# Diff label text formats (use {content} as placeholder)
DIFF_LABEL_TEXT_ADDED = '(added) {content}'
DIFF_LABEL_TEXT_REMOVED = '(removed) {content}'
DIFF_LABEL_TEXT_CHANGED = '(changed) {content}'
DIFF_LABEL_TEXT_INTO = '(into) {content}'

# Diff HTML label formats (use {content} as placeholder)
DIFF_HTML_LABEL_REMOVED = f'<span style="{REMOVED_STYLE}" title="Removed">{{content}}</span>'
DIFF_HTML_LABEL_ADDED = f'<span style="{ADDED_STYLE}" title="Added">{{content}}</span>'
DIFF_HTML_LABEL_REPLACED = f'<span style="{ADDED_STYLE}" title="Replaced">{{content}}</span>'
DIFF_HTML_LABEL_INSERTED = f'<span style="{ADDED_STYLE}" title="Inserted">{{content}}</span>'

# Compiled regex patterns for performance
WHITESPACE_NORMALIZE_RE = re.compile(r'\s+')

def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool = False, ignore_junk: bool = False, markdown_style: str = None) -> tuple[str, bool]:
    """
    Render word-level differences between two lines inline using diff-match-patch library.

    Args:
        before_line: Original line text
        after_line: Modified line text
        html_colour: Use HTML background colors for differences
        ignore_junk: Ignore whitespace-only changes
        markdown_style: Unused (kept for backwards compatibility)

    Returns:
        tuple[str, bool]: (diff output with inline word-level highlighting, has_changes flag)
    """
    # Normalize whitespace if ignore_junk is enabled
    if ignore_junk:
        # Normalize whitespace: replace multiple spaces/tabs with single space
        before_normalized = WHITESPACE_NORMALIZE_RE.sub(' ', before_line)
        after_normalized = WHITESPACE_NORMALIZE_RE.sub(' ', after_line)
    else:
        before_normalized = before_line
        after_normalized = after_line

    # Use diff-match-patch with word-level tokenization
    # Strategy: Use linesToChars to treat words as atomic units
    dmp = dmp_module.diff_match_patch()

    # Split into words while preserving boundaries
    def tokenize_with_boundaries(text):
        """Split text into words and boundaries (spaces, HTML tags)"""
        tokens = []
        current = ''
        in_tag = False

        for char in text:
            if char == '<':
                # Start of HTML tag
                if current:
                    tokens.append(current)
                    current = ''
                current = '<'
                in_tag = True
            elif char == '>' and in_tag:
                # End of HTML tag
                current += '>'
                tokens.append(current)
                current = ''
                in_tag = False
            elif char.isspace() and not in_tag:
                # Space outside of tag
                if current:
                    tokens.append(current)
                    current = ''
                tokens.append(char)
            else:
                current += char

        if current:
            tokens.append(current)
        return tokens

    before_tokens = tokenize_with_boundaries(before_normalized)
    after_tokens = tokenize_with_boundaries(after_normalized or ' ')

    # Create mappings for linesToChars (using it for word-mode)
    # Join tokens with newline so each "line" is a token
    before_text = '\n'.join(before_tokens)
    after_text = '\n'.join(after_tokens)

    # Use linesToChars for word-mode diffing
    lines_result = dmp.diff_linesToChars(before_text, after_text)
    line_before, line_after, line_array = lines_result

    # Perform diff on the encoded strings
    diffs = dmp.diff_main(line_before, line_after, False)

    # Convert back to original text
    dmp.diff_charsToLines(diffs, line_array)

    # Remove the newlines we added for tokenization
    diffs = [(op, text.replace('\n', '')) for op, text in diffs]

    # Apply semantic cleanup for more human-readable diffs
    dmp.diff_cleanupSemantic(diffs)

    # Check if there are any changes
    has_changes = any(op != 0 for op, _ in diffs)

    if ignore_junk and not has_changes:
        return after_line, False

    # Check if the whole line is replaced (no unchanged content)
    whole_line_replaced = not any(op == 0 and text.strip() for op, text in diffs)

    # Build the output
    result_parts = []

    for op, text in diffs:
        if op == 0:  # Equal
            result_parts.append(text)
        elif op == 1:  # Insertion
            if html_colour:
                content = text.rstrip()
                trailing = text[len(content):] if len(text) > len(content) else ''
                line_break = '\n' if whole_line_replaced else ''
                result_parts.append(f'{DIFF_HTML_LABEL_ADDED.format(content=content)}{trailing}{line_break}')
            else:
                content = text.rstrip()
                trailing = text[len(content):] if len(text) > len(content) else ''
                line_break = '\n' if whole_line_replaced else ''
                label = DIFF_LABEL_TEXT_INTO if whole_line_replaced else DIFF_LABEL_TEXT_ADDED
                result_parts.append(f'{label.format(content=content)}{trailing}{line_break}')
        elif op == -1:  # Deletion
            if html_colour:
                content = text.rstrip()
                trailing = text[len(content):] if len(text) > len(content) else ''
                line_break = '\n' if whole_line_replaced else ''
                result_parts.append(f'{DIFF_HTML_LABEL_REMOVED.format(content=content)}{trailing}{line_break}')
            else:
                content = text.rstrip()
                trailing = text[len(content):] if len(text) > len(content) else ''
                line_break = '\n' if whole_line_replaced else ''
                label = DIFF_LABEL_TEXT_CHANGED if whole_line_replaced else DIFF_LABEL_TEXT_REMOVED
                result_parts.append(f'{label.format(content=content)}{trailing}{line_break}')

    return ''.join(result_parts), has_changes

def same_slicer(lst: List[str], start: int, end: int) -> List[str]:
    """Return a slice of the list, or a single element if start == end."""
    return lst[start:end] if start != end else [lst[start]]

def customSequenceMatcher(
    before: List[str],
    after: List[str],
    include_equal: bool = False,
    include_removed: bool = True,
    include_added: bool = True,
    include_replaced: bool = True,
    include_change_type_prefix: bool = True,
    html_colour: bool = False,
    word_diff: bool = False,
    context_lines: int = 0,
    case_insensitive: bool = False,
    ignore_junk: bool = False
) -> Iterator[List[str]]:
    """
    Compare two sequences and yield differences based on specified parameters.

    Args:
        before (List[str]): Original sequence
        after (List[str]): Modified sequence
        include_equal (bool): Include unchanged parts
        include_removed (bool): Include removed parts
        include_added (bool): Include added parts
        include_replaced (bool): Include replaced parts
        include_change_type_prefix (bool): Add prefixes to indicate change types
        html_colour (bool): Use HTML background colors for differences
        word_diff (bool): Use word-level diffing for replaced lines
        context_lines (int): Number of unchanged lines to show around changes (like grep -C)
        case_insensitive (bool): Perform case-insensitive comparison
        ignore_junk (bool): Ignore whitespace-only changes

    Yields:
        List[str]: Differences between sequences
    """
    # Prepare sequences for comparison (lowercase if case-insensitive, normalize whitespace if ignore_junk)
    def prepare_line(line):
        if case_insensitive:
            line = line.lower()
        if ignore_junk:
            # Normalize whitespace: replace multiple spaces/tabs with single space
            line = WHITESPACE_NORMALIZE_RE.sub(' ', line)
        return line

    compare_before = [prepare_line(line) for line in before]
    compare_after = [prepare_line(line) for line in after]

    cruncher = difflib.SequenceMatcher(isjunk=lambda x: x in " \t", a=compare_before, b=compare_after)

    # When context_lines is set and include_equal is False, we need to track which equal lines to include
    if context_lines > 0 and not include_equal:
        opcodes = list(cruncher.get_opcodes())
        # Mark equal ranges that should be included based on context
        included_equal_ranges = set()

        for i, (tag, alo, ahi, blo, bhi) in enumerate(opcodes):
            if tag != 'equal':
                # Include context lines before this change
                for j in range(max(0, i - 1), i):
                    if opcodes[j][0] == 'equal':
                        prev_alo, prev_ahi = opcodes[j][1], opcodes[j][2]
                        # Include last N lines of the previous equal block
                        context_start = max(prev_alo, prev_ahi - context_lines)
                        for line_num in range(context_start, prev_ahi):
                            included_equal_ranges.add(line_num)

                # Include context lines after this change
                for j in range(i + 1, min(len(opcodes), i + 2)):
                    if opcodes[j][0] == 'equal':
                        next_alo, next_ahi = opcodes[j][1], opcodes[j][2]
                        # Include first N lines of the next equal block
                        context_end = min(next_ahi, next_alo + context_lines)
                        for line_num in range(next_alo, context_end):
                            included_equal_ranges.add(line_num)

    # Remember! gmail, outlook etc dont support <style> must be inline.
    # Gmail: strips <ins> and <del> tags entirely.
    for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
        if tag == 'equal':
            if include_equal:
                yield before[alo:ahi]
            elif context_lines > 0:
                # Only include equal lines that are in the context range
                context_lines_to_include = [before[i] for i in range(alo, ahi) if i in included_equal_ranges]
                if context_lines_to_include:
                    yield context_lines_to_include
        elif include_removed and tag == 'delete':
            if html_colour:
                yield [DIFF_HTML_LABEL_REMOVED.format(content=line) for line in same_slicer(before, alo, ahi)]
            else:
                yield [DIFF_LABEL_TEXT_REMOVED.format(content=line) for line in same_slicer(before, alo, ahi)] if include_change_type_prefix else same_slicer(before, alo, ahi)
        elif include_replaced and tag == 'replace':
            before_lines = same_slicer(before, alo, ahi)
            after_lines = same_slicer(after, blo, bhi)

            # Use word-level diff for single line replacements when enabled
            if word_diff and len(before_lines) == 1 and len(after_lines) == 1:
                inline_diff, has_changes = render_inline_word_diff(before_lines[0], after_lines[0], html_colour, ignore_junk)
                # Check if there are any actual changes (not just whitespace when ignore_junk is enabled)
                if ignore_junk and not has_changes:
                    # No real changes, skip this line
                    continue
                yield [inline_diff]
            else:
                # Fall back to line-level diff for multi-line changes or when word_diff disabled
                if html_colour:
                    yield [DIFF_HTML_LABEL_REMOVED.format(content=line) for line in before_lines] + \
                          [DIFF_HTML_LABEL_REPLACED.format(content=line) for line in after_lines]
                else:
                    yield [DIFF_LABEL_TEXT_CHANGED.format(content=line) for line in before_lines] + \
                          [DIFF_LABEL_TEXT_INTO.format(content=line) for line in after_lines] if include_change_type_prefix else before_lines + after_lines
        elif include_added and tag == 'insert':
            if html_colour:
                yield [DIFF_HTML_LABEL_INSERTED.format(content=line) for line in same_slicer(after, blo, bhi)]
            else:
                yield [DIFF_LABEL_TEXT_ADDED.format(content=line) for line in same_slicer(after, blo, bhi)] if include_change_type_prefix else same_slicer(after, blo, bhi)

def render_diff(
    previous_version_file_contents: str,
    newest_version_file_contents: str,
    include_equal: bool = False,
    include_removed: bool = True,
    include_added: bool = True,
    include_replaced: bool = True,
    line_feed_sep: str = "\n",
    include_change_type_prefix: bool = True,
    patch_format: bool = False,
    html_colour: bool = False,
    word_diff: bool = True,
    context_lines: int = 0,
    case_insensitive: bool = False,
    ignore_junk: bool = False
) -> str:
    """
    Render the difference between two file contents.

    Args:
        previous_version_file_contents (str): Original file contents
        newest_version_file_contents (str): Modified file contents
        include_equal (bool): Include unchanged parts
        include_removed (bool): Include removed parts
        include_added (bool): Include added parts
        include_replaced (bool): Include replaced parts
        line_feed_sep (str): Separator for lines in output
        include_change_type_prefix (bool): Add prefixes to indicate change types
        patch_format (bool): Use patch format for output
        html_colour (bool): Use HTML background colors for differences
        word_diff (bool): Use word-level diffing for replaced lines
        context_lines (int): Number of unchanged lines to show around changes (like grep -C)
        case_insensitive (bool): Perform case-insensitive comparison, By default the test_json_diff/process.py is case sensitive, so this follows same logic
        ignore_junk (bool): Ignore whitespace-only changes

    Returns:
        str: Rendered difference
    """
    newest_lines = [line.rstrip() for line in newest_version_file_contents.splitlines()]
    previous_lines = [line.rstrip() for line in previous_version_file_contents.splitlines()] if previous_version_file_contents else []

    if patch_format:
        patch = difflib.unified_diff(previous_lines, newest_lines)
        return line_feed_sep.join(patch)

    rendered_diff = customSequenceMatcher(
        before=previous_lines,
        after=newest_lines,
        include_equal=include_equal,
        include_removed=include_removed,
        include_added=include_added,
        include_replaced=include_replaced,
        include_change_type_prefix=include_change_type_prefix,
        html_colour=html_colour,
        word_diff=word_diff,
        context_lines=context_lines,
        case_insensitive=case_insensitive,
        ignore_junk=ignore_junk
    )

    def flatten(lst: List[Union[str, List[str]]]) -> str:
        result = []
        for x in lst:
            if isinstance(x, list):
                result.extend(x)
            else:
                result.append(x)
        return line_feed_sep.join(result)

    return flatten(rendered_diff)