Compare commits


1 Commit

Author    SHA1        Message                                    Date
dgtlmoon  8100a809ca  Datastore - Use orjson for faster saves    2025-11-05 14:55:58 +01:00
19 changed files with 157 additions and 112 deletions
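The commit message describes switching the datastore's JSON save path to orjson, although the datastore hunk itself is not captured in the extract below (only an orjson version-pin change in what looks like the requirements file is visible). As a rough sketch of the general pattern only, the function and path names here are illustrative and not the project's actual identifiers:

import orjson

def save_json(path, data):
    # orjson.dumps() is typically several times faster than json.dumps()
    # and returns bytes, so the file must be opened in binary mode.
    payload = orjson.dumps(data, option=orjson.OPT_INDENT_2)
    with open(path, "wb") as f:
        f.write(payload)

def load_json(path):
    # orjson.loads() accepts bytes directly, no decode step needed.
    with open(path, "rb") as f:
        return orjson.loads(f.read())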

View File

@@ -64,7 +64,7 @@ def count_words_in_history(watch):
return 0
latest_key = list(watch.history.keys())[-1]
- latest_content = watch.get_history_snapshot(timestamp=latest_key)
+ latest_content = watch.get_history_snapshot(latest_key)
return len(latest_content.split())
except Exception as e:
logger.error(f"Error counting words: {str(e)}")

View File

@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
- __version__ = '0.50.42'
+ __version__ = '0.50.39'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError

View File

@@ -175,7 +175,7 @@ class WatchSingleHistory(Resource):
response = make_response("No content found", 404)
response.mimetype = "text/plain"
else:
- content = watch.get_history_snapshot(timestamp=timestamp)
+ content = watch.get_history_snapshot(timestamp)
response = make_response(content, 200)
response.mimetype = "text/plain"

View File

@@ -353,15 +353,12 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore):
count = watch.get('check_count', 0) + 1
# Always record page title (used in notifications, and can change even when the content is the same)
- if update_obj.get('content-type') and 'html' in update_obj.get('content-type'):
- try:
- page_title = html_tools.extract_title(data=update_handler.fetcher.content)
- if page_title:
- page_title = page_title.strip()[:2000]
- logger.debug(f"UUID: {uuid} Page <title> is '{page_title}'")
- datastore.update_watch(uuid=uuid, update_obj={'page_title': page_title})
- except Exception as e:
- logger.warning(f"UUID: {uuid} Exception when extracting <title> - {str(e)}")
+ try:
+ page_title = html_tools.extract_title(data=update_handler.fetcher.content)
+ logger.debug(f"UUID: {uuid} Page <title> is '{page_title}'")
+ datastore.update_watch(uuid=uuid, update_obj={'page_title': page_title})
+ except Exception as e:
+ logger.warning(f"UUID: {uuid} Exception when extracting <title> - {str(e)}")
# Record server header
try:

View File

@@ -118,8 +118,8 @@ def construct_blueprint(datastore: ChangeDetectionStore):
fe.title(title=watch_label)
try:
- html_diff = diff.render_diff(previous_version_file_contents=watch.get_history_snapshot(timestamp=dates[-2]),
- newest_version_file_contents=watch.get_history_snapshot(timestamp=dates[-1]),
+ html_diff = diff.render_diff(previous_version_file_contents=watch.get_history_snapshot(dates[-2]),
+ newest_version_file_contents=watch.get_history_snapshot(dates[-1]),
include_equal=False,
line_feed_sep="<br>"
)

View File

@@ -106,7 +106,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
trigger_text = watch.get('trigger_text', [])
# Add text that was triggered
if len(dates):
- snapshot_contents = watch.get_history_snapshot(timestamp=dates[-1])
+ snapshot_contents = watch.get_history_snapshot(dates[-1])
else:
snapshot_contents = "No snapshot/history available, the watch should fetch atleast once."
@@ -123,8 +123,8 @@ def construct_blueprint(datastore: ChangeDetectionStore):
if len(dates) > 1:
- prev_snapshot = watch.get_history_snapshot(timestamp=dates[-2])
- current_snapshot = watch.get_history_snapshot(timestamp=dates[-1])
+ prev_snapshot = watch.get_history_snapshot(dates[-2])
+ current_snapshot = watch.get_history_snapshot(dates[-1])
n_object.update(set_basic_notification_vars(snapshot_contents=snapshot_contents,
current_snapshot=current_snapshot,

View File

@@ -47,7 +47,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
try:
versions = list(watch.history.keys())
- content = watch.get_history_snapshot(timestamp=timestamp)
+ content = watch.get_history_snapshot(timestamp)
triggered_line_numbers = html_tools.strip_ignore_text(content=content,
wordlist=watch['trigger_text'],

View File

@@ -14,7 +14,7 @@ def count_words_in_history(watch, incoming_text=None):
elif watch.history.keys():
# When called from UI extras to count latest snapshot
latest_key = list(watch.history.keys())[-1]
- latest_content = watch.get_history_snapshot(timestamp=latest_key)
+ latest_content = watch.get_history_snapshot(latest_key)
return len(latest_content.split())
return 0
except Exception as e:

View File

@@ -503,9 +503,7 @@ class ValidateJinja2Template(object):
jinja2_env = create_jinja_env(loader=BaseLoader)
# Add notification tokens for validation
- static_token_placeholders = NotificationContextData()
- static_token_placeholders.set_random_for_validation()
- jinja2_env.globals.update(static_token_placeholders)
+ jinja2_env.globals.update(NotificationContextData())
if hasattr(field, 'extra_notification_tokens'):
jinja2_env.globals.update(field.extra_notification_tokens)

View File

@@ -276,17 +276,9 @@ class model(watch_base):
# When the 'last viewed' timestamp is less than the oldest snapshot, return oldest
return sorted_keys[-1]
- def get_history_snapshot(self, timestamp=None, filepath=None):
- """
- Accepts either timestamp or filepath
- :param timestamp:
- :param filepath:
- :return:
- """
+ def get_history_snapshot(self, timestamp):
import brotli
- if not filepath:
- filepath = self.history[timestamp]
+ filepath = self.history[timestamp]
# See if a brotli versions exists and switch to that
if not filepath.endswith('.br') and os.path.isfile(f"{filepath}.br"):
@@ -390,7 +382,7 @@ class model(watch_base):
# Compare each lines (set) against each history text file (set) looking for something new..
existing_history = set({})
for k, v in self.history.items():
- content = self.get_history_snapshot(filepath=v)
+ content = self.get_history_snapshot(k)
if ignore_whitespace:
alist = set([line.translate(TRANSLATE_WHITESPACE_TABLE).lower() for line in content.splitlines()])
@@ -647,7 +639,7 @@ class model(watch_base):
for k, fname in self.history.items():
if os.path.isfile(fname):
if True:
- contents = self.get_history_snapshot(timestamp=k)
+ contents = self.get_history_snapshot(k)
res = re.findall(regex, contents, re.MULTILINE)
if res:
if not csv_writer:
@@ -740,7 +732,7 @@ class model(watch_base):
# If a previous attempt doesnt yet exist, just snarf the previous snapshot instead
dates = list(self.history.keys())
if len(dates):
- return self.get_history_snapshot(timestamp=dates[-1])
+ return self.get_history_snapshot(dates[-1])
else:
return ''
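Many of the hunks in this compare are call-site fallout from the get_history_snapshot() signature change shown just above: one side takes timestamp=/filepath= keywords, the other a single positional timestamp. Pieced together from the hunk, a sketch of the single-argument variant looks roughly like this; the file-reading and decoding details beyond the lines shown are assumptions, and the brotli fallback mirrors the hunk:

import os
import brotli

def get_history_snapshot(watch, timestamp):
    # 'watch.history' maps snapshot timestamps to file paths on disk.
    filepath = watch.history[timestamp]

    # Prefer a brotli-compressed sibling if one exists (mirrors the hunk above).
    if not filepath.endswith('.br') and os.path.isfile(f"{filepath}.br"):
        filepath = f"{filepath}.br"

    if filepath.endswith('.br'):
        with open(filepath, 'rb') as f:
            return brotli.decompress(f.read()).decode('utf-8')

    with open(filepath, 'r', encoding='utf-8') as f:
        return f.read()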

View File

@@ -133,7 +133,7 @@ class NotificationService:
# Add text that was triggered
if len(dates):
- snapshot_contents = watch.get_history_snapshot(timestamp=dates[-1])
+ snapshot_contents = watch.get_history_snapshot(dates[-1])
else:
snapshot_contents = "No snapshot/history available, the watch should fetch atleast once."
@@ -154,8 +154,8 @@ class NotificationService:
current_snapshot = "Example text: example test\nExample text: change detection is fantastic\nExample text: even more examples\nExample text: a lot more examples"
if len(dates) > 1:
- prev_snapshot = watch.get_history_snapshot(timestamp=dates[-2])
- current_snapshot = watch.get_history_snapshot(timestamp=dates[-1])
+ prev_snapshot = watch.get_history_snapshot(dates[-2])
+ current_snapshot = watch.get_history_snapshot(dates[-1])
n_object.update(set_basic_notification_vars(snapshot_contents=snapshot_contents,

View File

@@ -37,6 +37,18 @@ class SignalHandler:
notification_event_signal.connect(self.handle_notification_event, weak=False)
logger.info("SignalHandler: Connected to notification_event signal")
+ # Create and start the queue update thread using standard threading
+ import threading
+ self.polling_emitter_thread = threading.Thread(
+ target=self.polling_emit_running_or_queued_watches_threaded,
+ daemon=True
+ )
+ self.polling_emitter_thread.start()
+ logger.info("Started polling thread using threading (eventlet-free)")
+ # Store the thread reference in socketio for clean shutdown
+ self.socketio_instance.polling_emitter_thread = self.polling_emitter_thread
def handle_signal(self, *args, **kwargs):
logger.trace(f"SignalHandler: Signal received with {len(args)} args and {len(kwargs)} kwargs")
# Safely extract the watch UUID from kwargs
@@ -112,6 +124,74 @@ class SignalHandler:
except Exception as e:
logger.error(f"Socket.IO error in handle_notification_event: {str(e)}")
+ def polling_emit_running_or_queued_watches_threaded(self):
+ """Threading version of polling for Windows compatibility"""
+ import time
+ import threading
+ logger.info("Queue update thread started (threading mode)")
+ # Import here to avoid circular imports
+ from changedetectionio.flask_app import app
+ from changedetectionio import worker_handler
+ watch_check_update = signal('watch_check_update')
+ # Track previous state to avoid unnecessary emissions
+ previous_running_uuids = set()
+ # Run until app shutdown - check exit flag more frequently for fast shutdown
+ exit_event = getattr(app.config, 'exit', threading.Event())
+ while not exit_event.is_set():
+ try:
+ # Get current running UUIDs from async workers
+ running_uuids = set(worker_handler.get_running_uuids())
+ # Only send updates for UUIDs that changed state
+ newly_running = running_uuids - previous_running_uuids
+ no_longer_running = previous_running_uuids - running_uuids
+ # Send updates for newly running UUIDs (but exit fast if shutdown requested)
+ for uuid in newly_running:
+ if exit_event.is_set():
+ break
+ logger.trace(f"Threading polling: UUID {uuid} started processing")
+ with app.app_context():
+ watch_check_update.send(app_context=app, watch_uuid=uuid)
+ time.sleep(0.01) # Small yield
+ # Send updates for UUIDs that finished processing (but exit fast if shutdown requested)
+ if not exit_event.is_set():
+ for uuid in no_longer_running:
+ if exit_event.is_set():
+ break
+ logger.trace(f"Threading polling: UUID {uuid} finished processing")
+ with app.app_context():
+ watch_check_update.send(app_context=app, watch_uuid=uuid)
+ time.sleep(0.01) # Small yield
+ # Update tracking for next iteration
+ previous_running_uuids = running_uuids
+ # Sleep between polling cycles, but check exit flag every 0.5 seconds for fast shutdown
+ for _ in range(20): # 20 * 0.5 = 10 seconds total
+ if exit_event.is_set():
+ break
+ time.sleep(0.5)
+ except Exception as e:
+ logger.error(f"Error in threading polling: {str(e)}")
+ # Even during error recovery, check for exit quickly
+ for _ in range(1): # 1 * 0.5 = 0.5 seconds
+ if exit_event.is_set():
+ break
+ time.sleep(0.5)
+ # Check if we're in pytest environment - if so, be more gentle with logging
+ import sys
+ in_pytest = "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ
+ if not in_pytest:
+ logger.info("Queue update thread stopped (threading mode)")
def handle_watch_update(socketio, **kwargs):
@@ -303,6 +383,19 @@ def init_socketio(app, datastore):
"""Shutdown the SocketIO server fast and aggressively"""
try:
logger.info("Socket.IO: Fast shutdown initiated...")
+ # For threading mode, give the thread a very short time to exit gracefully
+ if hasattr(socketio, 'polling_emitter_thread'):
+ if socketio.polling_emitter_thread.is_alive():
+ logger.info("Socket.IO: Waiting 1 second for polling thread to stop...")
+ socketio.polling_emitter_thread.join(timeout=1.0) # Only 1 second timeout
+ if socketio.polling_emitter_thread.is_alive():
+ logger.info("Socket.IO: Polling thread still running after timeout - continuing with shutdown")
+ else:
+ logger.info("Socket.IO: Polling thread stopped quickly")
+ else:
+ logger.info("Socket.IO: Polling thread already stopped")
logger.info("Socket.IO: Fast shutdown complete")
except Exception as e:
logger.error(f"Socket.IO error during shutdown: {str(e)}")

View File

@@ -19,9 +19,18 @@ def test_inscriptus():
def test_check_basic_change_detection_functionality(client, live_server, measure_memory_usage, datastore_path):
set_original_response(datastore_path=datastore_path)
# live_server_setup(live_server) # Setup on conftest per function
- uuid = client.application.config.get('DATASTORE').add_watch(url=url_for('test_endpoint', _external=True))
+ # Add our URL to the import page
+ res = client.post(
+ url_for("imports.import_page"),
+ data={"urls": url_for('test_endpoint', _external=True)},
+ follow_redirects=True
+ )
+ assert b"1 Imported" in res.data
+ wait_for_all_checks(client)
# Do this a few times.. ensures we dont accidently set the status
for n in range(3):
@@ -106,6 +115,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
# It should report nothing found (no new 'has-unread-changes' class)
res = client.get(url_for("watchlist.index"))
assert b'has-unread-changes' not in res.data
assert b'class="has-unread-changes' not in res.data
assert b'head title' in res.data # Should be ON by default
@@ -119,6 +129,23 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
res = client.get(url_for("watchlist.index"))
assert b'head title and more' in res.data
+ # disable <title> pickup
+ res = client.post(
+ url_for("settings.settings_page"),
+ data={"application-ui-use_page_title_in_list": "", "requests-time_between_check-minutes": 180,
+ 'application-fetch_backend': "html_requests"},
+ follow_redirects=True
+ )
+ client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
+ wait_for_all_checks(client)
+ res = client.get(url_for("watchlist.index"))
+ assert b'has-unread-changes' in res.data
+ assert b'class="has-unread-changes' in res.data
+ assert b'head title' not in res.data # should now be off
# Be sure the last_viewed is going to be greater than the last snapshot
time.sleep(1)
@@ -139,63 +166,6 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
# Cleanup everything
delete_all_watches(client)
- def test_title_scraper(client, live_server, measure_memory_usage, datastore_path):
- set_original_response(datastore_path=datastore_path)
- uuid = client.application.config.get('DATASTORE').add_watch(url=url_for('test_endpoint', _external=True))
- client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
- wait_for_all_checks()
- # It should report nothing found (no new 'has-unread-changes' class)
- res = client.get(url_for("watchlist.index"))
- assert b'head title' in res.data # Should be ON by default
- # Recheck it but only with a title change, content wasnt changed
- set_original_response(datastore_path=datastore_path, extra_title=" and more")
- client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
- wait_for_all_checks(client)
- res = client.get(url_for("watchlist.index"))
- assert b'head title and more' in res.data
- # disable <title> pickup
- res = client.post(
- url_for("settings.settings_page"),
- data={"application-ui-use_page_title_in_list": "",
- "requests-time_between_check-minutes": 180,
- 'application-fetch_backend': "html_requests"},
- follow_redirects=True
- )
- set_original_response(datastore_path=datastore_path, extra_title=" SHOULD NOT APPEAR")
- client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
- wait_for_all_checks(client)
- res = client.get(url_for("watchlist.index"))
- assert b'SHOULD NOT APPEAR' not in res.data
- delete_all_watches(client)
- def test_title_scraper_html_only(client, live_server, measure_memory_usage, datastore_path):
- with open(os.path.join(datastore_path, "endpoint-content.txt"), "w") as f:
- f.write('"My text document\nWhere I talk about <title>\nwhich should not get registered\n</title>')
- test_url = url_for('test_endpoint', content_type="text/plain", _external=True)
- uuid = client.application.config.get('DATASTORE').add_watch(test_url)
- client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
- wait_for_all_checks()
- # It should report nothing found (no new 'has-unread-changes' class)
- res = client.get(url_for("watchlist.index"))
- assert b'which should not get registered' not in res.data # Should be ON by default
- assert not live_server.app.config['DATASTORE'].data['watching'][uuid].get('title')
- # Server says its plaintext, we should always treat it as plaintext, and then if they have a filter, try to apply that
def test_requests_timeout(client, live_server, measure_memory_usage, datastore_path):

View File

@@ -353,7 +353,7 @@ def check_json_ext_filter(json_filter, client, live_server, datastore_path):
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
dates = list(watch.history.keys())
- snapshot_contents = watch.get_history_snapshot(timestamp=dates[0])
+ snapshot_contents = watch.get_history_snapshot(dates[0])
assert snapshot_contents[0] == '['
@@ -439,7 +439,7 @@ def test_correct_header_detect(client, live_server, measure_memory_usage, datast
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
dates = list(watch.history.keys())
- snapshot_contents = watch.get_history_snapshot(timestamp=dates[0])
+ snapshot_contents = watch.get_history_snapshot(dates[0])
assert b'&#34;hello&#34;: 123,' in res.data # properly html escaped in the front end

View File

@@ -302,20 +302,15 @@ def test_notification_urls_jinja2_apprise_integration(client, live_server, measu
data={
"application-fetch_backend": "html_requests",
"application-minutes_between_check": 180,
"application-notification_body": '{ "url" : "{{ watch_url }}", "secret": 444, "somebug": "网站监测 内容更新了", "another": "{{diff|truncate(1500)}}" }',
"application-notification_body": '{ "url" : "{{ watch_url }}", "secret": 444, "somebug": "网站监测 内容更新了" }',
"application-notification_format": default_notification_format,
"application-notification_urls": test_notification_url,
# https://github.com/caronc/apprise/wiki/Notify_Custom_JSON#get-parameter-manipulation
"application-notification_title": "New ChangeDetection.io Notification - {{ watch_url }} {{diff|truncate(200)}} ",
"application-notification_title": "New ChangeDetection.io Notification - {{ watch_url }} ",
},
follow_redirects=True
)
assert b'Settings updated' in res.data
assert '网站监测'.encode() in res.data
- assert b'{{diff|truncate(1500)}}' in res.data
- assert b'{{diff|truncate(200)}}' in res.data
def test_notification_custom_endpoint_and_jinja2(client, live_server, measure_memory_usage, datastore_path):

View File

@@ -22,7 +22,7 @@ def test_fetch_pdf(client, live_server, measure_memory_usage, datastore_path):
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
dates = list(watch.history.keys())
- snapshot_contents = watch.get_history_snapshot(timestamp=dates[0])
+ snapshot_contents = watch.get_history_snapshot(dates[0])
# PDF header should not be there (it was converted to text)
assert 'PDF' not in snapshot_contents
@@ -75,7 +75,7 @@ def test_fetch_pdf(client, live_server, measure_memory_usage, datastore_path):
dates = list(watch.history.keys())
# new snapshot was also OK, no HTML
- snapshot_contents = watch.get_history_snapshot(timestamp=dates[1])
+ snapshot_contents = watch.get_history_snapshot(dates[1])
assert 'html' not in snapshot_contents.lower()
assert f'Original file size - {os.path.getsize(os.path.join(datastore_path, "endpoint-test.pdf"))}' in snapshot_contents
assert f'here is a change' in snapshot_contents

View File

@@ -65,7 +65,7 @@ def test_rss_reader_mode(client, live_server, measure_memory_usage, datastore_pa
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
dates = list(watch.history.keys())
- snapshot_contents = watch.get_history_snapshot(timestamp=dates[0])
+ snapshot_contents = watch.get_history_snapshot(dates[0])
assert 'Wet noodles escape' in snapshot_contents
assert '<br>' not in snapshot_contents
assert '&lt;' not in snapshot_contents
@@ -91,7 +91,7 @@ def test_rss_reader_mode_with_css_filters(client, live_server, measure_memory_us
watch = live_server.app.config['DATASTORE'].data['watching'][uuid]
dates = list(watch.history.keys())
- snapshot_contents = watch.get_history_snapshot(timestamp=dates[0])
+ snapshot_contents = watch.get_history_snapshot(dates[0])
assert 'Wet noodles escape' not in snapshot_contents
assert '<br>' not in snapshot_contents
assert '&lt;' not in snapshot_contents

View File

@@ -55,8 +55,8 @@ class TestTriggerConditions(unittest.TestCase):
self.assertEqual(len(history), 2)
# Retrieve and check snapshots
- #snapshot1 = watch.get_history_snapshot(timestamp=str(timestamp1))
- #snapshot2 = watch.get_history_snapshot(timestamp=str(timestamp2))
+ #snapshot1 = watch.get_history_snapshot(str(timestamp1))
+ #snapshot2 = watch.get_history_snapshot(str(timestamp2))
self.store.data['watching'][self.watch_uuid].update(
{

View File

@@ -34,7 +34,7 @@ wtforms~=3.2
jsonpath-ng~=1.7.0
# Fast JSON serialization for better performance
- orjson~=3.11
+ orjson~=3.10
# dnspython - Used by paho-mqtt for MQTT broker resolution
# Version pin removed since eventlet (which required the specific 2.6.1 pin) has been eliminated
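For reference, ~= in a pip requirements file is PEP 440's compatible-release operator: orjson~=3.10 accepts any 3.x release at or above 3.10 but excludes 4.0. A quick way to check what a pin admits, using the separate packaging library (not part of this diff):

from packaging.specifiers import SpecifierSet

spec = SpecifierSet("~=3.10")
print("3.11.3" in spec)  # True  - later 3.x releases satisfy the pin
print("4.0" in spec)     # False - a new major version does not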