changedetection.io/changedetectionio/tests/test_extract_regex.py

#!/usr/bin/env python3

import time
from flask import url_for
from .util import live_server_setup, wait_for_all_checks, delete_all_watches
import os

from ..html_tools import *


def set_original_response(datastore_path):
    test_return_data = """<html>
       <body>
     Some initial text<br>
     <p>Which is across multiple lines</p>
     <br>
     So let's see what happens.  <br>
     <div id="sametext">Some text thats the same</div>
     <div class="changetext">Some text that will change</div>
     </body>
     </html>
    """

    with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
        f.write(test_return_data)
    return None


def set_modified_response(datastore_path):
    test_return_data = """<html>
       <body>
     Some initial text<br>
     <p>which has this one new line</p>
     <br>
     So let's see what happens.  <br>
     <div id="sametext">Some text thats the same</div>
     <div class="changetext">Some text that did change ( 1000 online <br> 80 guests<br>  2000 online )</div>
     <div class="changetext">SomeCase insensitive 3456</div>
     </body>
     </html>
    """

    with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
        f.write(test_return_data)

    return None


def set_multiline_response(datastore_path):
    test_return_data = """<html>
       <body>

     <p>Something <br>
        across 6 billion multiple<br>
        lines
     </p>

     <div>aaand something lines</div>
     <br>
     <div>and this should be</div>
     </body>
     </html>
    """

    with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
        f.write(test_return_data)

    return None


# def test_setup(client, live_server, measure_memory_usage, datastore_path):
   #  live_server_setup(live_server) # Setup on conftest per function

def test_check_filter_multiline(client, live_server, measure_memory_usage, datastore_path):
   ##  live_server_setup(live_server) # Setup on conftest per function
    set_multiline_response(datastore_path=datastore_path)

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)

    wait_for_all_checks(client)

    # Goto the edit page, add our ignore text
    # Add our URL to the import page
    res = client.post(
        url_for("ui.ui_edit.edit_page", uuid="first"),
        data={"include_filters": '',
              # Test a regex and a plaintext
              'extract_text': '/something.+?6 billion.+?lines/si\r\nand this should be',
              "url": test_url,
              "tags": "",
              "headers": "",
              'fetch_backend': "html_requests",
              "time_between_check_use_default": "y"
              },
        follow_redirects=True
    )

    assert b"Updated watch." in res.data
    wait_for_all_checks(client)

    res = client.get(url_for("watchlist.index"))

    # Issue 1828
    assert b'not at the start of the expression' not in res.data

    res = client.get(
        url_for("ui.ui_preview.preview_page", uuid="first"),
        follow_redirects=True
    )
    # Plaintext that doesnt look like a regex should match also
    assert b'and this should be' in res.data

    assert b'Something' in res.data
    assert b'across 6 billion multiple' in res.data
    assert b'lines' in res.data

    # but the last one, which also says 'lines' shouldnt be here (non-greedy match checking)
    assert b'aaand something lines' not in res.data

def test_check_filter_and_regex_extract(client, live_server, measure_memory_usage, datastore_path):

    include_filters = ".changetext"

    set_original_response(datastore_path=datastore_path)

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    wait_for_all_checks(client)

    # Goto the edit page, add our ignore text
    # Add our URL to the import page
    res = client.post(
        url_for("ui.ui_edit.edit_page", uuid="first"),
        data={"include_filters": include_filters,
              'extract_text': '/\d+ online/\r\n/\d+ guests/\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i\r\n/issue1828.+?2022/i',
              "url": test_url,
              "tags": "",
              "headers": "",
              'fetch_backend': "html_requests",
              "time_between_check_use_default": "y"
              },
        follow_redirects=True
    )

    assert b"Updated watch." in res.data


    # Give the thread time to pick it up
    wait_for_all_checks(client)

    res = client.get(url_for("watchlist.index"))
    #issue 1828
    assert b'not at the start of the expression' not in res.data

    #  Make a change
    set_modified_response(datastore_path=datastore_path)

    # Trigger a check
    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
    # Give the thread time to pick it up
    wait_for_all_checks(client)

    # It should have 'has-unread-changes' still
    # Because it should be looking at only that 'sametext' id
    res = client.get(url_for("watchlist.index"))
    assert b'has-unread-changes' in res.data

    # Check HTML conversion detected and workd
    res = client.get(
        url_for("ui.ui_preview.preview_page", uuid="first"),
        follow_redirects=True
    )

    assert b'1000 online' in res.data

    # All regex matching should be here
    assert b'2000 online' in res.data

    # Both regexs should be here
    assert b'80 guests' in res.data

    # Regex with flag handling should be here
    assert b'SomeCase insensitive 3456' in res.data

    # Singular group from /somecase insensitive (345\d)/i
    assert b'3456' in res.data

    # Regex with multiline flag handling should be here

    # Should not be here
    assert b'Some text that did change' not in res.data


def test_regex_error_handling(client, live_server, measure_memory_usage, datastore_path):


    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
    time.sleep(0.2)
    ### test regex error handling
    res = client.post(
        url_for("ui.ui_edit.edit_page", uuid=uuid),
        data={"extract_text": '/something bad\d{3/XYZ',
              "url": test_url,
              "fetch_backend": "html_requests",
              "time_between_check_use_default": "y"},
        follow_redirects=True
    )

    assert b'is not a valid regular expression.' in res.data

    delete_all_watches(client)


def test_extract_lines_containing(client, live_server, measure_memory_usage, datastore_path):
    """Test the 'extract_lines_containing' filter keeps only lines with matching substrings."""

    test_return_data = """<html>
       <body>
         <p>Current temperature: 21 celsius</p>
         <p>Humidity: 55%</p>
         <p>Wind speed: 10 km/h</p>
         <p>Feels like: 19 celsius</p>
         <p>UV index: 3</p>
       </body>
       </html>
    """
    with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
        f.write(test_return_data)

    test_url = url_for('test_endpoint', _external=True)
    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)

    res = client.post(
        url_for("ui.ui_edit.edit_page", uuid=uuid),
        data={
            'extract_lines_containing': 'celsius',
            "url": test_url,
            "tags": "",
            "headers": "",
            'fetch_backend': "html_requests",
            "time_between_check_use_default": "y"
        },
        follow_redirects=True
    )
    assert b"Updated watch." in res.data
    wait_for_all_checks(client)

    res = client.get(url_for("ui.ui_preview.preview_page", uuid=uuid), follow_redirects=True)

    # Lines containing 'celsius' should be present
    assert b'celsius' in res.data
    # Lines without 'celsius' should be excluded
    assert b'Humidity' not in res.data
    assert b'Wind speed' not in res.data
    assert b'UV index' not in res.data

    delete_all_watches(client)


def test_extract_lines_containing_case_insensitive(client, live_server, measure_memory_usage, datastore_path):
    """Test that extract_lines_containing is case-insensitive."""

    test_return_data = """<html>
       <body>
         <p>PRICE: $99.99</p>
         <p>Price drops to $79.99</p>
         <p>Stock: Available</p>
         <p>price history shows decline</p>
       </body>
       </html>
    """
    with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
        f.write(test_return_data)

    test_url = url_for('test_endpoint', _external=True)
    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)

    res = client.post(
        url_for("ui.ui_edit.edit_page", uuid=uuid),
        data={
            'extract_lines_containing': 'price',
            "url": test_url,
            "tags": "",
            "headers": "",
            'fetch_backend': "html_requests",
            "time_between_check_use_default": "y"
        },
        follow_redirects=True
    )
    assert b"Updated watch." in res.data
    wait_for_all_checks(client)

    res = client.get(url_for("ui.ui_preview.preview_page", uuid=uuid), follow_redirects=True)

    # All three price lines (different cases) should match
    assert b'$99.99' in res.data
    assert b'$79.99' in res.data
    assert b'price history' in res.data
    # Non-price line should be excluded
    assert b'Stock' not in res.data

    delete_all_watches(client)


def test_extract_lines_containing_multiple_terms(client, live_server, measure_memory_usage, datastore_path):
    """Test that multiple extract_lines_containing entries act as OR (keep line if any term matches)."""

    test_return_data = """<html>
       <body>
         <p>Temperature: 21 celsius</p>
         <p>Humidity: 55%</p>
         <p>Wind speed: 10 km/h</p>
         <p>Rain chance: 20%</p>
       </body>
       </html>
    """
    with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
        f.write(test_return_data)

    test_url = url_for('test_endpoint', _external=True)
    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)

    res = client.post(
        url_for("ui.ui_edit.edit_page", uuid=uuid),
        data={
            'extract_lines_containing': 'celsius\r\nhumidity',
            "url": test_url,
            "tags": "",
            "headers": "",
            'fetch_backend': "html_requests",
            "time_between_check_use_default": "y"
        },
        follow_redirects=True
    )
    assert b"Updated watch." in res.data
    wait_for_all_checks(client)

    res = client.get(url_for("ui.ui_preview.preview_page", uuid=uuid), follow_redirects=True)

    assert b'celsius' in res.data
    assert b'Humidity' in res.data
    # Wind and Rain lines should be excluded
    assert b'Wind speed' not in res.data
    assert b'Rain chance' not in res.data

    delete_all_watches(client)


def test_extract_lines_containing_with_ignore_text(client, live_server, measure_memory_usage, datastore_path):
    """
    extract_lines_containing narrows to matching lines; ignore_text then suppresses specific
    lines from triggering change detection (they remain visible but don't affect the checksum).

    Filters are set BEFORE the first check so the filtered+ignored checksum is the baseline
    from the very start — no race between a forced-recheck and the next content write.
    """
    initial_data = """<html><body>
      <p>Temperature: 21 celsius</p>
      <p>Feels like: 19 celsius</p>
      <p>Humidity: 55%</p>
    </body></html>"""

    with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
        f.write(initial_data)

    test_url = url_for('test_endpoint', _external=True)
    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url, extras={'paused': True})

    # Set filters BEFORE the first check so the baseline is always filtered+ignored.
    # (Setting them after an initial unfiltered check creates a race: the forced recheck
    #  that updates previous_md5 must complete before the next content write, which is
    #  timing-sensitive and fails intermittently on slower systems / Python 3.14.)
    res = client.post(
        url_for("ui.ui_edit.edit_page", uuid=uuid, unpause_on_save=1),
        data={
            'extract_lines_containing': 'celsius',
            'ignore_text': 'Feels like',
            "url": test_url,
            "tags": "",
            "headers": "",
            'fetch_backend': "html_requests",
            "time_between_check_use_default": "y"
        },
        follow_redirects=True
    )
    assert b"unpaused" in res.data

    # First check — establishes filtered+ignored baseline. previous_md5 was False so
    # a change is always detected here; mark_all_viewed clears it before we assert.
    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)

    # Sanity: preview should only show celsius lines
    res = client.get(url_for("ui.ui_preview.preview_page", uuid=uuid), follow_redirects=True)
    assert b'celsius' in res.data
    assert b'Humidity' not in res.data

    # Change ONLY the ignored "Feels like" line — should NOT trigger a change
    changed_data = """<html><body>
      <p>Temperature: 21 celsius</p>
      <p>Feels like: 17 celsius</p>
      <p>Humidity: 55%</p>
    </body></html>"""

    with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
        f.write(changed_data)

    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)

    res = client.get(url_for("watchlist.index"))
    assert b'has-unread-changes' not in res.data, "Changing an ignored line should not trigger a change notification"

    client.get(url_for("ui.mark_all_viewed"), follow_redirects=True)
    time.sleep(1)

    # Change the non-ignored celsius line — SHOULD trigger
    triggered_data = """<html><body>
      <p>Temperature: 30 celsius</p>
      <p>Feels like: 17 celsius</p>
      <p>Humidity: 55%</p>
    </body></html>"""

    with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
        f.write(triggered_data)

    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)

    res = client.get(url_for("watchlist.index"))
    assert b'has-unread-changes' in res.data,  "Changing a non-ignored line should trigger a change notification"

    delete_all_watches(client)


def test_extract_lines_containing_with_extract_text_regex(client, live_server, measure_memory_usage, datastore_path):
    """
    extract_lines_containing first narrows to relevant lines, then extract_text regex
    pulls specific tokens from those lines — verifying correct pipeline ordering.
    """
    test_return_data = """<html><body>
      <p>Widget price: $49.99 each</p>
      <p>Gadget price: $129.00 each</p>
      <p>Latest news: price index up 2%</p>
      <p>Stock count: 150 units</p>
      <p>Shipping cost: $5.99</p>
    </body></html>"""

    with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
        f.write(test_return_data)

    test_url = url_for('test_endpoint', _external=True)
    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)

    res = client.post(
        url_for("ui.ui_edit.edit_page", uuid=uuid),
        data={
            # Step 1: keep lines containing "price" (excludes Stock count and Shipping cost)
            'extract_lines_containing': 'price',
            # Step 2: from those lines extract only dollar amounts
            'extract_text': r'/\$[\d.]+/',
            "url": test_url,
            "tags": "",
            "headers": "",
            'fetch_backend': "html_requests",
            "time_between_check_use_default": "y"
        },
        follow_redirects=True
    )
    assert b"Updated watch." in res.data
    wait_for_all_checks(client)

    res = client.get(url_for("ui.ui_preview.preview_page", uuid=uuid), follow_redirects=True)

    # Dollar amounts from price lines should be extracted
    assert b'$49.99' in res.data
    assert b'$129.00' in res.data
    # "price index up 2%" has no dollar amount — nothing extracted from that line
    # "Shipping cost" line was excluded by extract_lines_containing before regex ran
    assert b'$5.99' not in res.data
    # Raw line text should not appear — regex replaced it with just the match
    assert b'Widget' not in res.data
    assert b'Stock count' not in res.data

    delete_all_watches(client)


def test_extract_lines_containing_with_include_filters_css(client, live_server, measure_memory_usage, datastore_path):
    """
    CSS include_filters narrows the HTML first; extract_lines_containing then filters
    within that already-reduced text — verifying correct pipeline ordering.
    """
    test_return_data = """<html><body>
      <div class="weather">
        <p>Temperature: 21 celsius</p>
        <p>Humidity: 60%</p>
        <p>Wind: 15 km/h</p>
      </div>
      <div class="news">
        <p>Local forecast: warm celsius weather ahead</p>
        <p>Markets closed early</p>
      </div>
    </body></html>"""

    with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
        f.write(test_return_data)

    test_url = url_for('test_endpoint', _external=True)
    uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)

    res = client.post(
        url_for("ui.ui_edit.edit_page", uuid=uuid),
        data={
            # CSS filter: only look inside the weather div
            'include_filters': 'div.weather',
            # Then keep only celsius lines from that section
            'extract_lines_containing': 'celsius',
            "url": test_url,
            "tags": "",
            "headers": "",
            'fetch_backend': "html_requests",
            "time_between_check_use_default": "y"
        },
        follow_redirects=True
    )
    assert b"Updated watch." in res.data
    wait_for_all_checks(client)

    res = client.get(url_for("ui.ui_preview.preview_page", uuid=uuid), follow_redirects=True)

    # Only the celsius line from the weather div should survive both filters
    assert b'celsius' in res.data
    # Other weather lines excluded by extract_lines_containing
    assert b'Humidity' not in res.data
    assert b'Wind' not in res.data
    # News div content excluded entirely by CSS filter (even though it contains "celsius")
    assert b'Markets' not in res.data
    assert b'forecast' not in res.data

    delete_all_watches(client)