Files
changedetection.io/changedetectionio/tests/test_history_consistency.py
T
dgtlmoon b6d3d63372
Build and push containers / metadata (push) Has been cancelled
Build and push containers / build-push-containers (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled
ChangeDetection.io App Test / lint-code (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled
Avoid reprocessing if the page was the same (#3867)
2026-02-14 21:24:28 +01:00

275 lines
12 KiB
Python

#!/usr/bin/env python3
import time
import os
import json
from flask import url_for
from loguru import logger
from .. import strtobool
from .util import wait_for_all_checks, delete_all_watches
import brotli
def test_consistent_history(client, live_server, measure_memory_usage, datastore_path):
uuids = set()
sys_fetch_workers = int(os.getenv("FETCH_WORKERS", 10))
workers = range(1, sys_fetch_workers)
now = time.time()
for one in workers:
if strtobool(os.getenv("TEST_WITH_BROTLI")):
# A very long string that WILL trigger Brotli compression of the snapshot
# BROTLI_COMPRESS_SIZE_THRESHOLD should be set to say 200
from ..model.Watch import BROTLI_COMPRESS_SIZE_THRESHOLD
content = str(one) + "x" + str(one) * (BROTLI_COMPRESS_SIZE_THRESHOLD + 10)
else:
# Just enough to test datastore
content = str(one)+'x'
test_url = url_for('test_endpoint', content_type="text/html", content=content, _external=True)
uuids.add(client.application.config.get('DATASTORE').add_watch(url=test_url, extras={'title': str(one)}))
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
duration = time.time() - now
per_worker = duration/sys_fetch_workers
if sys_fetch_workers < 20:
per_worker_threshold=0.6
elif sys_fetch_workers < 50:
per_worker_threshold = 0.8
else:
per_worker_threshold = 1.5
logger.debug(f"All fetched in {duration:.2f}s, {per_worker}s per worker")
# Problematic on github
#assert per_worker < per_worker_threshold, f"If concurrency is working good, no blocking async problems, each worker ({sys_fetch_workers} workers) should have done his job in under {per_worker_threshold}s, got {per_worker:.2f}s per worker, total duration was {duration:.2f}s"
# Essentially just triggers the DB write/update
res = client.post(
url_for("settings.settings_page"),
data={"application-empty_pages_are_a_change": "",
"requests-time_between_check-minutes": 180,
'application-fetch_backend': "html_requests"},
follow_redirects=True
)
assert b"Settings updated." in res.data
# Wait for the sync DB save to happen
time.sleep(2)
# Check which format is being used
datastore_path = live_server.app.config['DATASTORE'].datastore_path
changedetection_json = os.path.join(datastore_path, 'changedetection.json')
url_watches_json = os.path.join(datastore_path, 'url-watches.json')
json_obj = {'watching': {}}
if os.path.exists(changedetection_json):
# New format: individual watch.json files
logger.info("Testing with new format (changedetection.json + individual watch.json)")
# Load each watch.json file
for uuid in live_server.app.config['DATASTORE'].data['watching'].keys():
watch_json_file = os.path.join(datastore_path, uuid, 'watch.json')
assert os.path.isfile(watch_json_file), f"watch.json should exist at {watch_json_file}"
with open(watch_json_file, 'r', encoding='utf-8') as f:
json_obj['watching'][uuid] = json.load(f)
else:
# Legacy format: url-watches.json
logger.info("Testing with legacy format (url-watches.json)")
with open(url_watches_json, 'r', encoding='utf-8') as f:
json_obj = json.load(f)
# assert the right amount of watches was found in the JSON
assert len(json_obj['watching']) == len(workers), "Correct number of watches was found in the JSON"
i = 0
# each one should have a history.txt containing just one line
for w in json_obj['watching'].keys():
i += 1
history_txt_index_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, 'history.txt')
assert os.path.isfile(history_txt_index_file), f"History.txt should exist where I expect it at {history_txt_index_file}"
# Should be no errors (could be from brotli etc)
assert not live_server.app.config['DATASTORE'].data['watching'][w].get('last_error')
# Same like in model.Watch
with open(history_txt_index_file, "r") as f:
tmp_history = dict(i.strip().split(',', 2) for i in f.readlines())
assert len(tmp_history) == 1, "History.txt should contain 1 line"
# Should be two files,. the history.txt , and the snapshot.txt
files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w))
# Find the snapshot one
for fname in files_in_watch_dir:
if fname != 'history.txt' and fname != 'watch.json' and fname != 'last-checksum.txt' and 'html' not in fname:
if strtobool(os.getenv("TEST_WITH_BROTLI")):
assert fname.endswith('.br'), "Forced TEST_WITH_BROTLI then it should be a .br filename"
full_snapshot_history_path = os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, fname)
# contents should match what we requested as content returned from the test url
if fname.endswith('.br'):
with open(full_snapshot_history_path, 'rb') as f:
contents = brotli.decompress(f.read()).decode('utf-8')
else:
with open(full_snapshot_history_path, 'r') as snapshot_f:
contents = snapshot_f.read()
watch_title = json_obj['watching'][w]['title']
assert json_obj['watching'][w]['title'], "Watch should have a title set"
assert contents.startswith(watch_title + "x"), f"Snapshot contents in file {fname} should start with '{watch_title}x', got '{contents}'"
# With new format, we have watch.json, so 4 files minimum
# Note: last-checksum.txt may or may not exist - it gets cleared by settings changes,
# and this test changes settings before checking files
# This assertion should be AFTER the loop, not inside it
if os.path.exists(changedetection_json):
# 4 required files: watch.json, html.br, history.txt, extracted text snapshot
# last-checksum.txt is optional (cleared by settings changes in this test)
assert len(files_in_watch_dir) >= 4 and len(files_in_watch_dir) <= 5, f"Should be 4-5 files in the dir with new format (last-checksum.txt is optional). Found {len(files_in_watch_dir)}: {files_in_watch_dir}"
else:
# 3 required files: html.br, history.txt, extracted text snapshot
# last-checksum.txt is optional
assert len(files_in_watch_dir) >= 3 and len(files_in_watch_dir) <= 4, f"Should be 3-4 files in the dir with legacy format (last-checksum.txt is optional). Found {len(files_in_watch_dir)}: {files_in_watch_dir}"
# Check that 'default' Watch vars aren't accidentally being saved
if os.path.exists(changedetection_json):
# New format: check all individual watch.json files
for uuid in json_obj['watching'].keys():
watch_json_file = os.path.join(datastore_path, uuid, 'watch.json')
with open(watch_json_file, 'r', encoding='utf-8') as f:
assert '"default"' not in f.read(), f"'default' probably shouldnt be here in {watch_json_file}, it came from when the 'default' Watch vars were accidently being saved"
else:
# Legacy format: check url-watches.json
with open(url_watches_json, 'r', encoding='utf-8') as f:
assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"
delete_all_watches(client)
def test_check_text_history_view(client, live_server, measure_memory_usage, datastore_path):
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write("<html>test-one</html>")
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
wait_for_all_checks(client)
# Set second version, Make a change
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write("<html>test-two</html>")
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(url_for("ui.ui_diff.diff_history_page", uuid=uuid))
assert b'test-one' in res.data
assert b'test-two' in res.data
# Set third version, Make a change
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write("<html>test-three</html>")
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
# It should remember the last viewed time, so the first difference is not shown
res = client.get(url_for("ui.ui_diff.diff_history_page", uuid="first"))
assert b'test-three' in res.data
assert b'test-two' in res.data
assert b'test-one' not in res.data
delete_all_watches(client)
def test_history_trim_global_only(client, live_server, measure_memory_usage, datastore_path):
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
uuid = None
limit = 3
for i in range(0, 10):
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(f"<html>test {i}</html>")
if not uuid:
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
if i ==8:
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
history_n = len(list(watch.history.keys()))
logger.debug(f"History length should be at limit {limit} and it is {history_n}")
assert history_n == limit
if i == 6:
res = client.post(
url_for("settings.settings_page"),
data={"application-history_snapshot_max_length": limit},
follow_redirects=True
)
# It will need to detect one more change to start trimming it, which is really at 'start of 7'
assert b'Settings updated' in res.data
delete_all_watches(client)
def test_history_trim_global_override_in_watch(client, live_server, measure_memory_usage, datastore_path):
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
uuid = None
limit = 3
res = client.post(
url_for("settings.settings_page"),
data={"application-history_snapshot_max_length": 10000},
follow_redirects=True
)
# It will need to detect one more change to start trimming it, which is really at 'start of 7'
assert b'Settings updated' in res.data
for i in range(0, 10):
with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
f.write(f"<html>test {i}</html>")
if not uuid:
uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
res = client.post(
url_for("ui.ui_edit.edit_page", uuid="first"),
data={"include_filters": "", "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests",
"time_between_check_use_default": "y", "history_snapshot_max_length": str(limit)},
follow_redirects=True
)
assert b"Updated watch." in res.data
wait_for_all_checks(client)
client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
if i == 8:
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
history_n = len(list(watch.history.keys()))
logger.debug(f"History length should be at limit {limit} and it is {history_n}")
assert history_n == limit
if i == 6:
res = client.post(
url_for("settings.settings_page"),
data={"application-history_snapshot_max_length": limit},
follow_redirects=True
)
# It will need to detect one more change to start trimming it, which is really at 'start of 7'
assert b'Settings updated' in res.data
delete_all_watches(client)