From a57d046b0ca080dfdd16bb2fd80873fbab1a8911 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 3 Oct 2025 10:55:56 +0200 Subject: [PATCH] Option to ignore junk/whitespace etc --- changedetectionio/blueprint/ui/views.py | 2 +- changedetectionio/diff.py | 114 +++++++++++++++-- changedetectionio/notification_service.py | 12 +- .../tests/unit/test_notification_diff.py | 119 ++++++++++++++++++ 4 files changed, 227 insertions(+), 20 deletions(-) diff --git a/changedetectionio/blueprint/ui/views.py b/changedetectionio/blueprint/ui/views.py index f6afb748..8191c4f7 100644 --- a/changedetectionio/blueprint/ui/views.py +++ b/changedetectionio/blueprint/ui/views.py @@ -224,7 +224,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe to_version_file_contents, include_equal=True, html_colour=True, - case_insensitive=datastore.data['settings']['application'].get('ignore_whitespace', False) + case_insensitive=datastore.data['settings']['application'].get('ignore_whitespace', False), ) return render_template("diff.html", diff --git a/changedetectionio/diff.py b/changedetectionio/diff.py index 75395fdc..e6e98680 100644 --- a/changedetectionio/diff.py +++ b/changedetectionio/diff.py @@ -6,7 +6,7 @@ from typing import List, Iterator, Union REMOVED_STYLE = "background-color: #fadad7; color: #b30000;" ADDED_STYLE = "background-color: #eaf2c2; color: #406619;" -def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool = False) -> str: +def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool = False, ignore_junk: bool = False) -> str: """ Render word-level differences between two lines inline. @@ -14,6 +14,7 @@ def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool before_line: Original line text after_line: Modified line text html_colour: Use HTML background colors for differences + ignore_junk: Ignore whitespace-only changes Returns: str: Single line with inline word-level highlighting @@ -30,7 +31,9 @@ def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool after_tokens = tokenize(after_line) # Use SequenceMatcher to find word-level differences - matcher = difflib.SequenceMatcher(None, before_tokens, after_tokens) + # If ignore_junk is True, treat whitespace tokens as junk + isjunk = (lambda x: x.strip() == '') if ignore_junk else None + matcher = difflib.SequenceMatcher(isjunk, before_tokens, after_tokens) if html_colour: result = [] @@ -39,15 +42,45 @@ def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool result.append(''.join(before_tokens[i1:i2])) elif tag == 'delete': deleted = ''.join(before_tokens[i1:i2]) + # If only whitespace and ignore_junk is enabled, preserve whitespace without marking + if ignore_junk and deleted.strip() == '': + result.append(deleted) + continue result.append(f'{deleted}') elif tag == 'insert': inserted = ''.join(after_tokens[j1:j2]) + # If only whitespace and ignore_junk is enabled, preserve whitespace without marking + if ignore_junk and inserted.strip() == '': + result.append(inserted) + continue result.append(f'{inserted}') elif tag == 'replace': deleted = ''.join(before_tokens[i1:i2]) inserted = ''.join(after_tokens[j1:j2]) - result.append(f'{deleted}') - result.append(f'{inserted}') + # If both are only whitespace and ignore_junk is enabled, use the after version + if ignore_junk and deleted.strip() == '' and inserted.strip() == '': + result.append(inserted) + continue + # When ignore_junk is enabled, filter out whitespace-only tokens from replace operations + if ignore_junk: + deleted_parts = [] + inserted_parts = [] + for token in before_tokens[i1:i2]: + if token.strip() != '': + deleted_parts.append(token) + for token in after_tokens[j1:j2]: + if token.strip() != '': + inserted_parts.append(token) + # Add a single space between words (normalized whitespace) + if deleted_parts or inserted_parts: + result.append(' ') + if deleted_parts: + result.append(f'{"".join(deleted_parts)}') + if inserted_parts: + result.append(f'{"".join(inserted_parts)}') + else: + result.append(f'{deleted}') + result.append(f'{inserted}') return ''.join(result) else: # Plain text format with markers @@ -57,14 +90,45 @@ def render_inline_word_diff(before_line: str, after_line: str, html_colour: bool result.append(''.join(before_tokens[i1:i2])) elif tag == 'delete': deleted = ''.join(before_tokens[i1:i2]) + # If only whitespace and ignore_junk is enabled, preserve whitespace without marking + if ignore_junk and deleted.strip() == '': + result.append(deleted) + continue result.append(f'[-{deleted}-]') elif tag == 'insert': inserted = ''.join(after_tokens[j1:j2]) + # If only whitespace and ignore_junk is enabled, preserve whitespace without marking + if ignore_junk and inserted.strip() == '': + result.append(inserted) + continue result.append(f'[+{inserted}+]') elif tag == 'replace': deleted = ''.join(before_tokens[i1:i2]) inserted = ''.join(after_tokens[j1:j2]) - result.append(f'[-{deleted}-][+{inserted}+]') + # If both are only whitespace and ignore_junk is enabled, use the after version + if ignore_junk and deleted.strip() == '' and inserted.strip() == '': + result.append(inserted) + continue + # When ignore_junk is enabled, filter out whitespace-only tokens from replace operations + if ignore_junk: + deleted_parts = [] + inserted_parts = [] + for token in before_tokens[i1:i2]: + if token.strip() != '': + deleted_parts.append(token) + for token in after_tokens[j1:j2]: + if token.strip() != '': + inserted_parts.append(token) + # Add a single space between words (normalized whitespace) + if deleted_parts or inserted_parts: + result.append(' ') + if deleted_parts: + result.append(f'[-{"".join(deleted_parts)}-]') + if inserted_parts: + result.append(f'[+{"".join(inserted_parts)}+]') + else: + result.append(f'[-{deleted}-]') + result.append(f'[+{inserted}+]') return ''.join(result) def same_slicer(lst: List[str], start: int, end: int) -> List[str]: @@ -82,7 +146,8 @@ def customSequenceMatcher( html_colour: bool = False, word_diff: bool = False, context_lines: int = 0, - case_insensitive: bool = False + case_insensitive: bool = False, + ignore_junk: bool = False ) -> Iterator[List[str]]: """ Compare two sequences and yield differences based on specified parameters. @@ -99,13 +164,23 @@ def customSequenceMatcher( word_diff (bool): Use word-level diffing for replaced lines context_lines (int): Number of unchanged lines to show around changes (like grep -C) case_insensitive (bool): Perform case-insensitive comparison + ignore_junk (bool): Ignore whitespace-only changes Yields: List[str]: Differences between sequences """ - # Prepare sequences for comparison (lowercase if case-insensitive) - compare_before = [line.lower() for line in before] if case_insensitive else before - compare_after = [line.lower() for line in after] if case_insensitive else after + # Prepare sequences for comparison (lowercase if case-insensitive, normalize whitespace if ignore_junk) + import re + def prepare_line(line): + if case_insensitive: + line = line.lower() + if ignore_junk: + # Normalize whitespace: replace multiple spaces/tabs with single space + line = re.sub(r'\s+', ' ', line) + return line + + compare_before = [prepare_line(line) for line in before] + compare_after = [prepare_line(line) for line in after] cruncher = difflib.SequenceMatcher(isjunk=lambda x: x in " \t", a=compare_before, b=compare_after) @@ -157,7 +232,17 @@ def customSequenceMatcher( # Use word-level diff for single line replacements when enabled if word_diff and len(before_lines) == 1 and len(after_lines) == 1: - inline_diff = render_inline_word_diff(before_lines[0], after_lines[0], html_colour) + inline_diff = render_inline_word_diff(before_lines[0], after_lines[0], html_colour, ignore_junk) + # Check if there are any actual changes (not just whitespace when ignore_junk is enabled) + if ignore_junk: + # Check if the output contains any change markers + if html_colour: + has_changes = ' str: """ Render the difference between two file contents. @@ -204,7 +290,8 @@ def render_diff( html_colour (bool): Use HTML background colors for differences word_diff (bool): Use word-level diffing for replaced lines context_lines (int): Number of unchanged lines to show around changes (like grep -C) - case_insensitive (bool): Perform case-insensitive comparison + case_insensitive (bool): Perform case-insensitive comparison, By default the test_json_diff/process.py is case sensitive, so this follows same logic + ignore_junk (bool): Ignore whitespace-only changes Returns: str: Rendered difference @@ -227,7 +314,8 @@ def render_diff( html_colour=html_colour, word_diff=word_diff, context_lines=context_lines, - case_insensitive=case_insensitive + case_insensitive=case_insensitive, + ignore_junk=ignore_junk ) def flatten(lst: List[Union[str, List[str]]]) -> str: diff --git a/changedetectionio/notification_service.py b/changedetectionio/notification_service.py index eb6bc721..cd4216eb 100644 --- a/changedetectionio/notification_service.py +++ b/changedetectionio/notification_service.py @@ -76,15 +76,15 @@ class NotificationService: prev_snapshot = watch.get_history_snapshot(dates[-2]) current_snapshot = watch.get_history_snapshot(dates[-1]) - case_insensitive=self. datastore.data['settings']['application'].get('ignore_whitespace', False) + ignore_junk = self.datastore.data['settings']['application'].get('ignore_whitespace', False) n_object.update({ 'current_snapshot': snapshot_contents, - 'diff': diff.render_diff(prev_snapshot, current_snapshot, line_feed_sep=line_feed_sep, html_colour=html_colour_enable, case_insensitive=case_insensitive), - 'diff_added': diff.render_diff(prev_snapshot, current_snapshot, include_removed=False, line_feed_sep=line_feed_sep, case_insensitive=case_insensitive), - 'diff_full': diff.render_diff(prev_snapshot, current_snapshot, include_equal=True, line_feed_sep=line_feed_sep, html_colour=html_colour_enable, case_insensitive=case_insensitive), - 'diff_patch': diff.render_diff(prev_snapshot, current_snapshot, line_feed_sep=line_feed_sep, patch_format=True, case_insensitive=case_insensitive), - 'diff_removed': diff.render_diff(prev_snapshot, current_snapshot, include_added=False, line_feed_sep=line_feed_sep, case_insensitive=case_insensitive), + 'diff': diff.render_diff(prev_snapshot, current_snapshot, line_feed_sep=line_feed_sep, html_colour=html_colour_enable, ignore_junk=ignore_junk), + 'diff_added': diff.render_diff(prev_snapshot, current_snapshot, include_removed=False, line_feed_sep=line_feed_sep, ignore_junk=ignore_junk), + 'diff_full': diff.render_diff(prev_snapshot, current_snapshot, include_equal=True, line_feed_sep=line_feed_sep, html_colour=html_colour_enable, ignore_junk=ignore_junk), + 'diff_patch': diff.render_diff(prev_snapshot, current_snapshot, line_feed_sep=line_feed_sep, patch_format=True, ignore_junk=ignore_junk), + 'diff_removed': diff.render_diff(prev_snapshot, current_snapshot, include_added=False, line_feed_sep=line_feed_sep, ignore_junk=ignore_junk), 'notification_timestamp': now, 'screenshot': watch.get_screenshot() if watch and watch.get('notification_screenshot') else None, 'triggered_text': triggered_text, diff --git a/changedetectionio/tests/unit/test_notification_diff.py b/changedetectionio/tests/unit/test_notification_diff.py index 477fcd57..bfd7ee1f 100755 --- a/changedetectionio/tests/unit/test_notification_diff.py +++ b/changedetectionio/tests/unit/test_notification_diff.py @@ -228,5 +228,124 @@ Line 4""" self.assertIn('200', output) self.assertIn('background-color', output) + def test_ignore_junk_word_diff_enabled(self): + """Test ignore_junk with word_diff=True""" + before = "The quick brown fox" + after = "The quick brown fox" + + # Without ignore_junk, should detect whitespace changes + output = diff.render_diff(before, after, include_equal=False, word_diff=True, ignore_junk=False) + # Should show some difference (whitespace changes) + self.assertTrue(len(output.strip()) > 0, "Should detect whitespace changes when ignore_junk=False") + + # With ignore_junk, should ignore whitespace-only changes + output = diff.render_diff(before, after, include_equal=False, word_diff=True, ignore_junk=True) + lines = [l for l in output.split("\n") if l.strip()] + self.assertEqual(len(lines), 0, "Should ignore whitespace-only changes when ignore_junk=True") + + def test_ignore_junk_word_diff_disabled(self): + """Test ignore_junk with word_diff=False""" + before = "Hello World" + after = "Hello World" + + # Without ignore_junk, should detect line change + output = diff.render_diff(before, after, include_equal=False, word_diff=False, ignore_junk=False) + self.assertIn('(changed)', output) + self.assertIn('(into)', output) + + # With ignore_junk enabled and word_diff disabled + # When ignore_junk is enabled, whitespace is normalized at line level so lines match + output = diff.render_diff(before, after, include_equal=False, word_diff=False, ignore_junk=True) + # Lines should be treated as equal + lines = [l for l in output.split("\n") if l.strip()] + self.assertEqual(len(lines), 0, "Should ignore whitespace differences at line level") + + def test_ignore_junk_with_real_changes(self): + """Test ignore_junk doesn't ignore actual word changes""" + before = "The quick brown fox" + after = "The quick brown cat" + + output = diff.render_diff(before, after, include_equal=False, word_diff=True, ignore_junk=True) + + # Should still detect the word change (fox -> cat) + self.assertIn('[-fox-]', output) + self.assertIn('[+cat+]', output) + # But shouldn't highlight whitespace differences + + def test_ignore_junk_tabs_vs_spaces(self): + """Test ignore_junk treats tabs and spaces as equivalent""" + before = "Column1\tColumn2\tColumn3" + after = "Column1 Column2 Column3" + + # Without ignore_junk, should detect difference + output = diff.render_diff(before, after, include_equal=False, word_diff=True, ignore_junk=False) + self.assertTrue(len(output.strip()) > 0, "Should detect tab vs space differences") + + # With ignore_junk, should ignore tab/space differences + output = diff.render_diff(before, after, include_equal=False, word_diff=True, ignore_junk=True) + lines = [l for l in output.split("\n") if l.strip()] + self.assertEqual(len(lines), 0, "Should ignore tab vs space differences when ignore_junk=True") + + def test_ignore_junk_html_output(self): + """Test ignore_junk with HTML coloring""" + before = "Value: 100 points" + after = "Value: 200 points" + + output = diff.render_diff(before, after, include_equal=False, word_diff=True, html_colour=True, ignore_junk=True) + + # Should only highlight the actual value change + self.assertIn('100', output) + self.assertIn('200', output) + self.assertIn('background-color', output) + # Should not create separate spans for whitespace changes + + def test_ignore_junk_case_insensitive_combination(self): + """Test ignore_junk combined with case_insensitive""" + before = "The QUICK Brown Fox" + after = "The quick brown FOX" + + # Both enabled: should ignore case and whitespace + output = diff.render_diff(before, after, include_equal=False, word_diff=True, + case_insensitive=True, ignore_junk=True) + lines = [l for l in output.split("\n") if l.strip()] + self.assertEqual(len(lines), 0, "Should ignore both case and whitespace differences") + + # Only case_insensitive: should detect whitespace changes + output = diff.render_diff(before, after, include_equal=False, word_diff=True, + case_insensitive=True, ignore_junk=False) + self.assertTrue(len(output.strip()) > 0, "Should detect whitespace changes") + + # Only ignore_junk: should detect case changes + output = diff.render_diff(before, after, include_equal=False, word_diff=True, + case_insensitive=False, ignore_junk=True) + # Should detect case differences + self.assertIn('QUICK', output) + self.assertIn('quick', output) + self.assertIn('Brown', output) + self.assertIn('brown', output) + # Should show changes (though may be grouped together) + self.assertTrue('[-' in output and '-]' in output, "Should show removed text") + self.assertTrue('[+' in output and '+]' in output, "Should show added text") + + def test_ignore_junk_multiline(self): + """Test ignore_junk with multiple lines""" + before = """Line 1 with spaces +Line 2 unchanged +Line 3 with tabs and spaces""" + + after = """Line 1 with spaces +Line 2 unchanged +Line 3 with tabs and spaces""" + + # With ignore_junk, should only show unchanged line when include_equal=True + output = diff.render_diff(before, after, include_equal=False, word_diff=True, ignore_junk=True) + lines = [l for l in output.split("\n") if l.strip()] + # Should be empty since only whitespace changed + self.assertEqual(len(lines), 0, "Should ignore whitespace changes across multiple lines") + + # Verify Line 2 is not shown as changed + self.assertNotIn('[-Line 2-]', output) + self.assertNotIn('[+Line 2+]', output) + if __name__ == '__main__': unittest.main()