Filters - Support multi line regex (#2889)

2026-07-08 00:10:46 +00:00 · 2025-04-09 15:06:08 +02:00
parent 1378b5b2ff
commit 4e6e680d79
2 changed files with 74 additions and 14 deletions
@@ -366,22 +366,41 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
 # wordlist - list of regex's (str) or words (str)
 # Preserves all linefeeds and other whitespacing, its not the job of this to remove that
 def strip_ignore_text(content, wordlist, mode="content"):
-    i = 0
-    output = []
    ignore_text = []
    ignore_regex = []
-    ignored_line_numbers = []
+    ignore_regex_multiline = []
+    ignored_lines = []

    for k in wordlist:
        # Is it a regex?
        res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE)
        if res:
-            ignore_regex.append(re.compile(perl_style_slash_enclosed_regex_to_options(k)))
+            res = re.compile(perl_style_slash_enclosed_regex_to_options(k))
+            if res.flags & re.DOTALL or res.flags & re.MULTILINE:
+                ignore_regex_multiline.append(res)
+            else:
+                ignore_regex.append(res)
        else:
            ignore_text.append(k.strip())

-    for line in content.splitlines(keepends=True):
-        i += 1
+    for r in ignore_regex_multiline:
+        for match in r.finditer(content):
+            content_lines = content[:match.end()].splitlines(keepends=True)
+            match_lines = content[match.start():match.end()].splitlines(keepends=True)
+
+            end_line = len(content_lines)
+            start_line = end_line - len(match_lines)
+
+            if end_line - start_line <= 1:
+                # Match is empty or in the middle of the line
+                ignored_lines.append(start_line)
+            else:
+                for i in range(start_line, end_line):
+                    ignored_lines.append(i)
+
+    line_index = 0
+    lines = content.splitlines(keepends=True)
+    for line in lines:
        # Always ignore blank lines in this mode. (when this function gets called)
        got_match = False
        for l in ignore_text:
@@ -393,17 +412,19 @@ def strip_ignore_text(content, wordlist, mode="content"):
                if r.search(line):
                    got_match = True

-        if not got_match:
-            # Not ignored, and should preserve "keepends"
-            output.append(line)
-        else:
-            ignored_line_numbers.append(i)
+        if got_match:
+            ignored_lines.append(line_index)
+
+        line_index += 1
+
+    ignored_lines = set([i for i in ignored_lines if i >= 0 and i < len(lines)])

    # Used for finding out what to highlight
    if mode == "line numbers":
-        return ignored_line_numbers
+        return [i + 1 for i in ignored_lines]

-    return ''.join(output)
+    output_lines = set(range(len(lines))) - ignored_lines
+    return ''.join([lines[i] for i in output_lines])

 def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
    from xml.sax.saxutils import escape as xml_escape
@@ -32,7 +32,6 @@ def test_strip_regex_text_func():
    ]

    stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)
-
    assert "but 1 lines" in stripped_content
    assert "igNORe-cAse text" not in stripped_content
    assert "but 1234 lines" not in stripped_content
@@ -42,6 +41,46 @@ def test_strip_regex_text_func():
    # Check line number reporting
    stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers")
    assert stripped_content == [2, 5, 6, 7, 8, 10]
+    
+    stripped_content = html_tools.strip_ignore_text(test_content, ['/but 1.+5 lines/s'])
+    assert "but 1 lines" not in stripped_content
+    assert "skip 5 lines" not in stripped_content
+    
+    stripped_content = html_tools.strip_ignore_text(test_content, ['/but 1.+5 lines/s'], mode="line numbers")
+    assert stripped_content == [4, 5]
+    
+    stripped_content = html_tools.strip_ignore_text(test_content, ['/.+/s'])
+    assert stripped_content == ""
+    
+    stripped_content = html_tools.strip_ignore_text(test_content, ['/.+/s'], mode="line numbers")
+    assert stripped_content == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
+
+    stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+but.+\\n.+lines$/m'])
+    assert "but 1 lines" not in stripped_content
+    assert "skip 5 lines" not in stripped_content
+
+    stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+but.+\\n.+lines$/m'], mode="line numbers")
+    assert stripped_content == [4, 5]
+
+    stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+?\.$/m'])
+    assert "but sometimes we want to remove the lines." not in stripped_content
+    assert "but not always." not in stripped_content
+
+    stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+?\.$/m'], mode="line numbers")
+    assert stripped_content == [2, 11]
+
+    stripped_content = html_tools.strip_ignore_text(test_content, ['/but.+?but/ms'])
+    assert "but sometimes we want to remove the lines." not in stripped_content
+    assert "but 1 lines" not in stripped_content
+    assert "but 1234 lines" not in stripped_content
+    assert "igNORe-cAse text we dont want to keep" not in stripped_content
+    assert "but not always." not in stripped_content
+
+    stripped_content = html_tools.strip_ignore_text(test_content, ['/but.+?but/ms'], mode="line numbers")
+    assert stripped_content == [2, 3, 4, 9, 10, 11]
+
+    stripped_content = html_tools.strip_ignore_text("\n\ntext\n\ntext\n\n", ['/^$/ms'], mode="line numbers")
+    assert stripped_content == [1, 2, 4, 6]

    # Check that linefeeds are preserved when there are is no matching ignores
    content = "some text\n\nand other text\n"