mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-02-19 04:36:04 +00:00
Compare commits
20 Commits
restock-va
...
0.53.4
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bca35f680e | ||
|
|
fafea1b5c6 | ||
|
|
93630e188d | ||
|
|
7e99d748b9 | ||
|
|
352c91c619 | ||
|
|
a6e55aaba9 | ||
|
|
25a17bd49d | ||
|
|
954582a581 | ||
|
|
d8ef86a8b5 | ||
|
|
8711d29861 | ||
|
|
2343ddd88a | ||
|
|
c6d6ef0e0c | ||
|
|
23063ad8a1 | ||
|
|
27b8a2d178 | ||
|
|
a53f2a784d | ||
|
|
7558ca5fda | ||
|
|
383c3b427f | ||
|
|
b01ba5d8a1 | ||
|
|
86e5184cef | ||
|
|
1dbf1f5db5 |
33
.github/nginx-reverse-proxy-test.conf
vendored
Normal file
33
.github/nginx-reverse-proxy-test.conf
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
server {
|
||||
listen 80;
|
||||
server_name localhost;
|
||||
|
||||
# Test basic reverse proxy to changedetection.io
|
||||
location / {
|
||||
proxy_pass http://changedet-app:5000;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
|
||||
# WebSocket support
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Upgrade $http_upgrade;
|
||||
proxy_set_header Connection "upgrade";
|
||||
}
|
||||
|
||||
# Test subpath deployment with X-Forwarded-Prefix
|
||||
location /changedet-sub/ {
|
||||
proxy_pass http://changedet-app:5000/;
|
||||
proxy_set_header X-Forwarded-Prefix /changedet-sub;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
|
||||
# WebSocket support
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Upgrade $http_upgrade;
|
||||
proxy_set_header Connection "upgrade";
|
||||
}
|
||||
}
|
||||
169
.github/workflows/test-stack-reusable-workflow.yml
vendored
169
.github/workflows/test-stack-reusable-workflow.yml
vendored
@@ -324,6 +324,175 @@ jobs:
|
||||
run: |
|
||||
docker run --rm --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/smtp/test_notification_smtp.py'
|
||||
|
||||
nginx-reverse-proxy:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
timeout-minutes: 10
|
||||
env:
|
||||
PYTHON_VERSION: ${{ inputs.python-version }}
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Download Docker image artifact
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: test-changedetectionio-${{ env.PYTHON_VERSION }}
|
||||
path: /tmp
|
||||
|
||||
- name: Load Docker image
|
||||
run: |
|
||||
docker load -i /tmp/test-changedetectionio.tar
|
||||
|
||||
- name: Spin up services
|
||||
run: |
|
||||
docker network create changedet-network
|
||||
|
||||
# Start changedetection.io container with X-Forwarded headers support
|
||||
docker run --name changedet-app --hostname changedet-app --network changedet-network \
|
||||
-e USE_X_SETTINGS=true \
|
||||
-d test-changedetectionio
|
||||
sleep 3
|
||||
|
||||
- name: Start nginx reverse proxy
|
||||
run: |
|
||||
# Start nginx with our test configuration
|
||||
docker run --name nginx-proxy --network changedet-network -d -p 8080:80 --rm \
|
||||
-v ${{ github.workspace }}/.github/nginx-reverse-proxy-test.conf:/etc/nginx/conf.d/default.conf:ro \
|
||||
nginx:alpine
|
||||
sleep 2
|
||||
|
||||
- name: Test reverse proxy - root path
|
||||
run: |
|
||||
echo "=== Testing nginx reverse proxy at root path ==="
|
||||
curl --retry-connrefused --retry 6 -s http://localhost:8080/ > /tmp/nginx-test-root.html
|
||||
|
||||
# Check for changedetection.io UI elements
|
||||
if grep -q "checkbox-uuid" /tmp/nginx-test-root.html; then
|
||||
echo "✓ Found checkbox-uuid in response"
|
||||
else
|
||||
echo "ERROR: checkbox-uuid not found in response"
|
||||
cat /tmp/nginx-test-root.html
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Check for watchlist content
|
||||
if grep -q -i "watch" /tmp/nginx-test-root.html; then
|
||||
echo "✓ Found watch/watchlist content in response"
|
||||
else
|
||||
echo "ERROR: watchlist content not found"
|
||||
cat /tmp/nginx-test-root.html
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "✓ Root path reverse proxy working correctly"
|
||||
|
||||
- name: Test reverse proxy - subpath with X-Forwarded-Prefix
|
||||
run: |
|
||||
echo "=== Testing nginx reverse proxy at subpath /changedet-sub/ ==="
|
||||
curl --retry-connrefused --retry 6 -s http://localhost:8080/changedet-sub/ > /tmp/nginx-test-subpath.html
|
||||
|
||||
# Check for changedetection.io UI elements
|
||||
if grep -q "checkbox-uuid" /tmp/nginx-test-subpath.html; then
|
||||
echo "✓ Found checkbox-uuid in subpath response"
|
||||
else
|
||||
echo "ERROR: checkbox-uuid not found in subpath response"
|
||||
cat /tmp/nginx-test-subpath.html
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "✓ Subpath reverse proxy working correctly"
|
||||
|
||||
- name: Test API through reverse proxy subpath
|
||||
run: |
|
||||
echo "=== Testing API endpoints through nginx subpath /changedet-sub/ ==="
|
||||
|
||||
# Extract API key from the changedetection.io datastore
|
||||
API_KEY=$(docker exec changedet-app cat /datastore/changedetection.json | grep -o '"api_access_token": *"[^"]*"' | cut -d'"' -f4)
|
||||
|
||||
if [ -z "$API_KEY" ]; then
|
||||
echo "ERROR: Could not extract API key from datastore"
|
||||
docker exec changedet-app cat /datastore/changedetection.json
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "✓ Extracted API key: ${API_KEY:0:8}..."
|
||||
|
||||
# Create a watch via API through nginx proxy subpath
|
||||
echo "Creating watch via POST to /changedet-sub/api/v1/watch"
|
||||
RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "http://localhost:8080/changedet-sub/api/v1/watch" \
|
||||
-H "x-api-key: ${API_KEY}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com/test-nginx-proxy",
|
||||
"tag": "nginx-test"
|
||||
}')
|
||||
|
||||
HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
|
||||
BODY=$(echo "$RESPONSE" | head -n-1)
|
||||
|
||||
if [ "$HTTP_CODE" != "201" ]; then
|
||||
echo "ERROR: Expected HTTP 201, got $HTTP_CODE"
|
||||
echo "Response: $BODY"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "✓ Watch created successfully (HTTP 201)"
|
||||
|
||||
# Extract the watch UUID from response
|
||||
WATCH_UUID=$(echo "$BODY" | grep -o '"uuid": *"[^"]*"' | cut -d'"' -f4)
|
||||
echo "✓ Watch UUID: $WATCH_UUID"
|
||||
|
||||
# Update the watch via PUT through nginx proxy subpath
|
||||
echo "Updating watch via PUT to /changedet-sub/api/v1/watch/${WATCH_UUID}"
|
||||
RESPONSE=$(curl -s -w "\n%{http_code}" -X PUT "http://localhost:8080/changedet-sub/api/v1/watch/${WATCH_UUID}" \
|
||||
-H "x-api-key: ${API_KEY}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"paused": true
|
||||
}')
|
||||
|
||||
HTTP_CODE=$(echo "$RESPONSE" | tail -n1)
|
||||
BODY=$(echo "$RESPONSE" | head -n-1)
|
||||
|
||||
if [ "$HTTP_CODE" != "200" ]; then
|
||||
echo "ERROR: Expected HTTP 200, got $HTTP_CODE"
|
||||
echo "Response: $BODY"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if echo "$BODY" | grep -q 'OK'; then
|
||||
echo "✓ Watch updated successfully (HTTP 200, response: OK)"
|
||||
else
|
||||
echo "ERROR: Expected response 'OK', got: $BODY"
|
||||
echo "Response: $BODY"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Verify the watch is paused via GET
|
||||
echo "Verifying watch is paused via GET"
|
||||
RESPONSE=$(curl -s "http://localhost:8080/changedet-sub/api/v1/watch/${WATCH_UUID}" \
|
||||
-H "x-api-key: ${API_KEY}")
|
||||
|
||||
if echo "$RESPONSE" | grep -q '"paused": *true'; then
|
||||
echo "✓ Watch is paused as expected"
|
||||
else
|
||||
echo "ERROR: Watch paused state not confirmed"
|
||||
echo "Response: $RESPONSE"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "✓ API tests through nginx subpath completed successfully"
|
||||
|
||||
- name: Cleanup nginx test
|
||||
if: always()
|
||||
run: |
|
||||
docker logs nginx-proxy || true
|
||||
docker logs changedet-app || true
|
||||
docker stop nginx-proxy changedet-app || true
|
||||
docker rm nginx-proxy changedet-app || true
|
||||
|
||||
|
||||
|
||||
# Proxy tests
|
||||
proxy-tests:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
|
||||
# Semver means never use .01, or 00. Should be .1.
|
||||
__version__ = '0.53.1'
|
||||
__version__ = '0.53.4'
|
||||
|
||||
from changedetectionio.strtobool import strtobool
|
||||
from json.decoder import JSONDecodeError
|
||||
@@ -610,7 +610,7 @@ def main():
|
||||
|
||||
@app.context_processor
|
||||
def inject_template_globals():
|
||||
return dict(right_sticky="v{}".format(datastore.data['version_tag']),
|
||||
return dict(right_sticky="v"+__version__,
|
||||
new_version_available=app.config['NEW_VERSION_AVAILABLE'],
|
||||
has_password=datastore.data['settings']['application']['password'] != False,
|
||||
socket_io_enabled=datastore.data['settings']['application'].get('ui', {}).get('socket_io_enabled', True),
|
||||
|
||||
@@ -97,17 +97,6 @@ class Tag(Resource):
|
||||
# Delete the tag, and any tag reference
|
||||
del self.datastore.data['settings']['application']['tags'][uuid]
|
||||
|
||||
# Delete tag.json file if it exists
|
||||
import os
|
||||
tag_dir = os.path.join(self.datastore.datastore_path, uuid)
|
||||
tag_json = os.path.join(tag_dir, "tag.json")
|
||||
if os.path.exists(tag_json):
|
||||
try:
|
||||
os.unlink(tag_json)
|
||||
logger.info(f"Deleted tag.json for tag {uuid}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete tag.json for tag {uuid}: {e}")
|
||||
|
||||
# Remove tag from all watches
|
||||
for watch_uuid, watch in self.datastore.data['watching'].items():
|
||||
if watch.get('tags') and uuid in watch['tags']:
|
||||
|
||||
@@ -103,6 +103,7 @@ def validate_openapi_request(operation_id):
|
||||
if request.method.upper() != 'GET':
|
||||
# Lazy import - only loaded when actually validating a request
|
||||
from openapi_core.contrib.flask import FlaskOpenAPIRequest
|
||||
from openapi_core.templating.paths.exceptions import ServerNotFound, PathNotFound, PathError
|
||||
|
||||
spec = get_openapi_spec()
|
||||
openapi_request = FlaskOpenAPIRequest(request)
|
||||
@@ -110,6 +111,16 @@ def validate_openapi_request(operation_id):
|
||||
if result.errors:
|
||||
error_details = []
|
||||
for error in result.errors:
|
||||
# Skip path/server validation errors for reverse proxy compatibility
|
||||
# Flask routing already validates that endpoints exist (returns 404 if not).
|
||||
# OpenAPI validation here is primarily for request body schema validation.
|
||||
# When behind nginx/reverse proxy, URLs may have path prefixes that don't
|
||||
# match the OpenAPI server definitions, causing false positives.
|
||||
if isinstance(error, PathError):
|
||||
logger.debug(f"API Call - Skipping path/server validation (delegated to Flask): {error}")
|
||||
continue
|
||||
|
||||
error_str = str(error)
|
||||
# Extract detailed schema errors from __cause__
|
||||
if hasattr(error, '__cause__') and hasattr(error.__cause__, 'schema_errors'):
|
||||
for schema_error in error.__cause__.schema_errors:
|
||||
@@ -117,9 +128,12 @@ def validate_openapi_request(operation_id):
|
||||
msg = schema_error.message if hasattr(schema_error, 'message') else str(schema_error)
|
||||
error_details.append(f"{field}: {msg}")
|
||||
else:
|
||||
error_details.append(str(error))
|
||||
error_details.append(error_str)
|
||||
|
||||
# Only raise if we have actual validation errors (not path/server issues)
|
||||
if error_details:
|
||||
logger.error(f"API Call - Validation failed: {'; '.join(error_details)}")
|
||||
raise BadRequest(f"Validation failed: {'; '.join(error_details)}")
|
||||
raise BadRequest(f"Validation failed: {'; '.join(error_details)}")
|
||||
except BadRequest:
|
||||
# Re-raise BadRequest exceptions (validation failures)
|
||||
raise
|
||||
|
||||
@@ -70,17 +70,6 @@ def construct_blueprint(datastore: ChangeDetectionStore):
|
||||
if datastore.data['settings']['application']['tags'].get(uuid):
|
||||
del datastore.data['settings']['application']['tags'][uuid]
|
||||
|
||||
# Delete tag.json file if it exists
|
||||
import os
|
||||
tag_dir = os.path.join(datastore.datastore_path, uuid)
|
||||
tag_json = os.path.join(tag_dir, "tag.json")
|
||||
if os.path.exists(tag_json):
|
||||
try:
|
||||
os.unlink(tag_json)
|
||||
logger.info(f"Deleted tag.json for tag {uuid}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete tag.json for tag {uuid}: {e}")
|
||||
|
||||
# Remove tag from all watches in background thread to avoid blocking
|
||||
def remove_tag_background(tag_uuid):
|
||||
"""Background thread to remove tag from watches - discarded after completion."""
|
||||
@@ -127,19 +116,11 @@ def construct_blueprint(datastore: ChangeDetectionStore):
|
||||
@tags_blueprint.route("/delete_all", methods=['GET'])
|
||||
@login_optionally_required
|
||||
def delete_all():
|
||||
# Delete all tag.json files
|
||||
import os
|
||||
for tag_uuid in list(datastore.data['settings']['application']['tags'].keys()):
|
||||
tag_dir = os.path.join(datastore.datastore_path, tag_uuid)
|
||||
tag_json = os.path.join(tag_dir, "tag.json")
|
||||
if os.path.exists(tag_json):
|
||||
try:
|
||||
os.unlink(tag_json)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete tag.json for tag {tag_uuid}: {e}")
|
||||
|
||||
# Clear all tags from settings immediately
|
||||
datastore.data['settings']['application']['tags'] = {}
|
||||
for tag_uuid in list(datastore.data['settings']['application']['tags'].keys()):
|
||||
# TagsDict 'del' handler will remove the dir
|
||||
del datastore.data['settings']['application']['tags'][tag_uuid]
|
||||
|
||||
|
||||
# Clear tags from all watches in background thread to avoid blocking
|
||||
def clear_all_tags_background():
|
||||
@@ -255,7 +236,4 @@ def construct_blueprint(datastore: ChangeDetectionStore):
|
||||
return redirect(url_for('tags.tags_overview_page'))
|
||||
|
||||
|
||||
@tags_blueprint.route("/delete/<string:uuid>", methods=['GET'])
|
||||
def form_tag_delete(uuid):
|
||||
return redirect(url_for('tags.tags_overview_page'))
|
||||
return tags_blueprint
|
||||
|
||||
@@ -194,9 +194,9 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
|
||||
tag_limit = request.args.get('tag')
|
||||
now = int(time.time())
|
||||
|
||||
# Mark watches as viewed in background thread to avoid blocking
|
||||
def mark_viewed_background():
|
||||
"""Background thread to mark watches as viewed - discarded after completion."""
|
||||
# Mark watches as viewed - use background thread only for large watch counts
|
||||
def mark_viewed_impl():
|
||||
"""Mark watches as viewed - can run synchronously or in background thread."""
|
||||
marked_count = 0
|
||||
try:
|
||||
for watch_uuid, watch in datastore.data['watching'].items():
|
||||
@@ -209,15 +209,21 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, worker_pool,
|
||||
datastore.set_last_viewed(watch_uuid, now)
|
||||
marked_count += 1
|
||||
|
||||
logger.info(f"Background marking complete: {marked_count} watches marked as viewed")
|
||||
logger.info(f"Marking complete: {marked_count} watches marked as viewed")
|
||||
except Exception as e:
|
||||
logger.error(f"Error in background mark as viewed: {e}")
|
||||
logger.error(f"Error marking as viewed: {e}")
|
||||
|
||||
# Start background thread and return immediately
|
||||
thread = threading.Thread(target=mark_viewed_background, daemon=True)
|
||||
thread.start()
|
||||
# For small watch counts (< 10), run synchronously to avoid race conditions in tests
|
||||
# For larger counts, use background thread to avoid blocking the UI
|
||||
watch_count = len(datastore.data['watching'])
|
||||
if watch_count < 10:
|
||||
# Run synchronously for small watch counts
|
||||
mark_viewed_impl()
|
||||
else:
|
||||
# Start background thread for large watch counts
|
||||
thread = threading.Thread(target=mark_viewed_impl, daemon=True)
|
||||
thread.start()
|
||||
|
||||
flash(gettext("Marking watches as viewed in background..."))
|
||||
return redirect(url_for('watchlist.index', tag=tag_limit))
|
||||
|
||||
@ui_blueprint.route("/delete", methods=['GET'])
|
||||
|
||||
@@ -86,8 +86,8 @@ async def capture_full_page(page, screenshot_format='JPEG', watch_uuid=None, loc
|
||||
# better than scrollTo incase they override it in the page
|
||||
await page.evaluate(
|
||||
"""(y) => {
|
||||
document.documentElement.scrollTop = y;
|
||||
document.body.scrollTop = y;
|
||||
const el = document.scrollingElement;
|
||||
if (el) el.scrollTop = y;
|
||||
}""",
|
||||
y
|
||||
)
|
||||
@@ -305,6 +305,8 @@ class fetcher(Fetcher):
|
||||
await asyncio.wait_for(self.browser.close(), timeout=3.0)
|
||||
except Exception as cleanup_error:
|
||||
logger.error(f"[{watch_uuid}] Failed to cleanup browser after page creation failure: {cleanup_error}")
|
||||
finally:
|
||||
self.browser = None
|
||||
raise
|
||||
|
||||
# Add console handler to capture console.log from favicon fetcher
|
||||
@@ -532,6 +534,14 @@ class fetcher(Fetcher):
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
raise (BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
|
||||
finally:
|
||||
# Internal cleanup on any exception/timeout - call quit() immediately
|
||||
# This prevents connection leaks during exception bursts
|
||||
# Worker.py's quit() call becomes a redundant safety net (idempotent)
|
||||
try:
|
||||
await self.quit(watch={'uuid': watch_uuid} if watch_uuid else None)
|
||||
except Exception as cleanup_error:
|
||||
logger.error(f"[{watch_uuid}] Error during internal quit() cleanup: {cleanup_error}")
|
||||
|
||||
|
||||
# Plugin registration for built-in fetcher
|
||||
|
||||
@@ -565,6 +565,27 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
|
||||
if is_rss:
|
||||
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
|
||||
html_content = re.sub(r'</title>', r'</h1>', html_content)
|
||||
else:
|
||||
# Strip bloat in one pass, SPA's often dump 10Mb+ into the <head> for styles, which is not needed
|
||||
# Causing inscriptis to silently exit when more than ~10MB is found.
|
||||
# All we are doing here is converting the HTML to text, no CSS layout etc
|
||||
# Use backreference (\1) to ensure opening/closing tags match (prevents <style> matching </svg> in CSS data URIs)
|
||||
html_content = re.sub(r'<(style|script|svg|noscript)[^>]*>.*?</\1>|<(?:link|meta)[^>]*/?>|<!--.*?-->',
|
||||
'', html_content, flags=re.DOTALL | re.IGNORECASE)
|
||||
|
||||
# SPAs often use <body style="display:none"> to hide content until JS loads
|
||||
# inscriptis respects CSS display rules, so we need to remove these hiding styles
|
||||
# to extract the actual page content
|
||||
body_style_pattern = r'(<body[^>]*)\s+style\s*=\s*["\']([^"\']*\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b[^"\']*)["\']'
|
||||
|
||||
# Check if body has hiding styles that need to be fixed
|
||||
body_match = re.search(body_style_pattern, html_content, flags=re.IGNORECASE)
|
||||
if body_match:
|
||||
from loguru import logger
|
||||
logger.debug(f"html_to_text: Removing hiding styles from body tag (found: '{body_match.group(2)}')")
|
||||
|
||||
html_content = re.sub(body_style_pattern, r'\1', html_content, flags=re.IGNORECASE)
|
||||
|
||||
|
||||
text_content = get_text(html_content, config=parser_config)
|
||||
return text_content
|
||||
|
||||
@@ -2,6 +2,7 @@ from os import getenv
|
||||
from copy import deepcopy
|
||||
|
||||
from changedetectionio.blueprint.rss import RSS_FORMAT_TYPES, RSS_CONTENT_FORMAT_DEFAULT
|
||||
from changedetectionio.model.Tags import TagsDict
|
||||
|
||||
from changedetectionio.notification import (
|
||||
default_notification_body,
|
||||
@@ -68,7 +69,7 @@ class model(dict):
|
||||
'schema_version' : 0,
|
||||
'shared_diff_access': False,
|
||||
'strip_ignored_lines': False,
|
||||
'tags': {}, #@todo use Tag.model initialisers
|
||||
'tags': None, # Initialized in __init__ with real datastore_path
|
||||
'webdriver_delay': None , # Extra delay in seconds before extracting text
|
||||
'ui': {
|
||||
'use_page_title_in_list': True,
|
||||
@@ -80,10 +81,16 @@ class model(dict):
|
||||
}
|
||||
}
|
||||
|
||||
def __init__(self, *arg, **kw):
|
||||
def __init__(self, *arg, datastore_path=None, **kw):
|
||||
super(model, self).__init__(*arg, **kw)
|
||||
# Capture any tags data passed in before base_config overwrites the structure
|
||||
existing_tags = self.get('settings', {}).get('application', {}).get('tags') or {}
|
||||
# CRITICAL: deepcopy to avoid sharing mutable objects between instances
|
||||
self.update(deepcopy(self.base_config))
|
||||
# TagsDict requires the real datastore_path at runtime (cannot be set at class-definition time)
|
||||
if datastore_path is None:
|
||||
raise ValueError("App.model() requires 'datastore_path' keyword argument")
|
||||
self['settings']['application']['tags'] = TagsDict(existing_tags, datastore_path=datastore_path)
|
||||
|
||||
|
||||
def parse_headers_from_text_file(filepath):
|
||||
|
||||
39
changedetectionio/model/Tags.py
Normal file
39
changedetectionio/model/Tags.py
Normal file
@@ -0,0 +1,39 @@
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from loguru import logger
|
||||
|
||||
_SENTINEL = object()
|
||||
|
||||
|
||||
class TagsDict(dict):
|
||||
"""Dict subclass that removes the corresponding tag.json file when a tag is deleted."""
|
||||
|
||||
def __init__(self, *args, datastore_path: str | os.PathLike, **kwargs) -> None:
|
||||
self._datastore_path = Path(datastore_path)
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def __delitem__(self, key: str) -> None:
|
||||
super().__delitem__(key)
|
||||
tag_dir = self._datastore_path / key
|
||||
tag_json_file = tag_dir / "tag.json"
|
||||
if not os.path.exists(tag_json_file):
|
||||
logger.critical(f"Aborting deletion of directory '{tag_dir}' because '{tag_json_file}' does not exist.")
|
||||
return
|
||||
try:
|
||||
shutil.rmtree(tag_dir)
|
||||
logger.info(f"Deleted tag directory for tag {key!r}")
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
except OSError as e:
|
||||
logger.error(f"Failed to delete tag directory for tag {key!r}: {e}")
|
||||
|
||||
def pop(self, key: str, default=_SENTINEL):
|
||||
"""Remove and return tag, deleting its tag.json file. Raises KeyError if missing and no default given."""
|
||||
if key in self:
|
||||
value = self[key]
|
||||
del self[key]
|
||||
return value
|
||||
if default is _SENTINEL:
|
||||
raise KeyError(key)
|
||||
return default
|
||||
@@ -129,6 +129,51 @@ class ChangeDetectionSpec:
|
||||
"""
|
||||
pass
|
||||
|
||||
@hookspec
|
||||
def update_handler_alter(update_handler, watch, datastore):
|
||||
"""Modify or wrap the update_handler before it processes a watch.
|
||||
|
||||
This hook is called after the update_handler (perform_site_check instance) is created
|
||||
but before it calls call_browser() and run_changedetection(). Plugins can use this to:
|
||||
- Wrap the handler to add logging/metrics
|
||||
- Modify handler configuration
|
||||
- Add custom preprocessing logic
|
||||
|
||||
Args:
|
||||
update_handler: The perform_site_check instance that will process the watch
|
||||
watch: The watch dict being processed
|
||||
datastore: The application datastore
|
||||
|
||||
Returns:
|
||||
object or None: Return a modified/wrapped handler, or None to keep the original.
|
||||
If multiple plugins return handlers, they are chained in registration order.
|
||||
"""
|
||||
pass
|
||||
|
||||
@hookspec
|
||||
def update_finalize(update_handler, watch, datastore, processing_exception):
|
||||
"""Called after watch processing completes (success or failure).
|
||||
|
||||
This hook is called in the finally block after all processing is complete,
|
||||
allowing plugins to perform cleanup, update metrics, or log final status.
|
||||
|
||||
The plugin can access update_handler.last_logging_insert_id if it was stored
|
||||
during update_handler_alter, and use processing_exception to determine if
|
||||
the processing succeeded or failed.
|
||||
|
||||
Args:
|
||||
update_handler: The perform_site_check instance (may be None if creation failed)
|
||||
watch: The watch dict that was processed (may be None if not loaded)
|
||||
datastore: The application datastore
|
||||
processing_exception: The exception from the main processing block, or None if successful.
|
||||
This does NOT include cleanup exceptions - only exceptions from
|
||||
the actual watch processing (fetch, diff, etc).
|
||||
|
||||
Returns:
|
||||
None: This hook doesn't return a value
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
# Set up Plugin Manager
|
||||
plugin_manager = pluggy.PluginManager(PLUGIN_NAMESPACE)
|
||||
@@ -499,4 +544,66 @@ def get_plugin_template_paths():
|
||||
template_paths.append(templates_dir)
|
||||
logger.debug(f"Added plugin template path: {templates_dir}")
|
||||
|
||||
return template_paths
|
||||
return template_paths
|
||||
|
||||
|
||||
def apply_update_handler_alter(update_handler, watch, datastore):
|
||||
"""Apply update_handler_alter hooks from all plugins.
|
||||
|
||||
Allows plugins to wrap or modify the update_handler before it processes a watch.
|
||||
Multiple plugins can chain modifications - each plugin receives the result from
|
||||
the previous plugin.
|
||||
|
||||
Args:
|
||||
update_handler: The perform_site_check instance to potentially modify
|
||||
watch: The watch dict being processed
|
||||
datastore: The application datastore
|
||||
|
||||
Returns:
|
||||
object: The (potentially modified/wrapped) update_handler
|
||||
"""
|
||||
# Get all plugins that implement the update_handler_alter hook
|
||||
results = plugin_manager.hook.update_handler_alter(
|
||||
update_handler=update_handler,
|
||||
watch=watch,
|
||||
datastore=datastore
|
||||
)
|
||||
|
||||
# Chain results - each plugin gets the result from the previous one
|
||||
current_handler = update_handler
|
||||
if results:
|
||||
for result in results:
|
||||
if result is not None:
|
||||
logger.debug(f"Plugin modified update_handler for watch {watch.get('uuid')}")
|
||||
current_handler = result
|
||||
|
||||
return current_handler
|
||||
|
||||
|
||||
def apply_update_finalize(update_handler, watch, datastore, processing_exception):
|
||||
"""Apply update_finalize hooks from all plugins.
|
||||
|
||||
Called in the finally block after watch processing completes, allowing plugins
|
||||
to perform cleanup, update metrics, or log final status.
|
||||
|
||||
Args:
|
||||
update_handler: The perform_site_check instance (may be None)
|
||||
watch: The watch dict that was processed (may be None)
|
||||
datastore: The application datastore
|
||||
processing_exception: The exception from processing, or None if successful
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
try:
|
||||
# Call all plugins that implement the update_finalize hook
|
||||
plugin_manager.hook.update_finalize(
|
||||
update_handler=update_handler,
|
||||
watch=watch,
|
||||
datastore=datastore,
|
||||
processing_exception=processing_exception
|
||||
)
|
||||
except Exception as e:
|
||||
# Don't let plugin errors crash the worker
|
||||
logger.error(f"Error in update_finalize hook: {e}")
|
||||
logger.exception(f"update_finalize hook exception details:")
|
||||
@@ -347,6 +347,7 @@ class ContentProcessor:
|
||||
def extract_text_from_html(self, html_content, stream_content_type):
|
||||
"""Convert HTML to plain text."""
|
||||
do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
|
||||
|
||||
return html_tools.html_to_text(
|
||||
html_content=html_content,
|
||||
render_anchor_tag_content=do_anchor,
|
||||
|
||||
@@ -102,7 +102,9 @@
|
||||
}
|
||||
|
||||
// Navigate to search results (always redirect to watchlist home)
|
||||
window.location.href = '/?' + params.toString();
|
||||
// Use base_path if available (for sub-path deployments like /enlighten-richerx)
|
||||
const basePath = typeof base_path !== 'undefined' ? base_path : '';
|
||||
window.location.href = basePath + '/?' + params.toString();
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
@@ -1 +1 @@
|
||||
#diff-form{background:rgba(0,0,0,.05);padding:1em;border-radius:10px;margin-bottom:1em;color:#fff;font-size:.9rem;text-align:center}#diff-form label.from-to-label{width:4rem;text-decoration:none;padding:.5rem}#diff-form label.from-to-label#change-from{color:#b30000;background:#fadad7}#diff-form label.from-to-label#change-to{background:#eaf2c2;color:#406619}#diff-form #diff-style>span{display:inline-block;padding:.3em}#diff-form #diff-style>span label{font-weight:normal}#diff-form *{vertical-align:middle}body.difference-page section.content{padding-top:40px}#diff-ui{background:var(--color-background);padding:1rem;border-radius:5px}@media(min-width: 767px){#diff-ui{min-width:50%}}#diff-ui #text{font-size:11px}#diff-ui pre{white-space:break-spaces}#diff-ui h1{display:inline;font-size:100%}#diff-ui #result{white-space:pre-wrap;word-break:break-word;overflow-wrap:break-word}#diff-ui .source{position:absolute;right:1%;top:.2em}@-moz-document url-prefix(){#diff-ui body{height:99%}}#diff-ui td#diff-col div{text-align:justify;white-space:pre-wrap}#diff-ui .ignored{background-color:#ccc;opacity:.7}#diff-ui .triggered{background-color:#1b98f8}#diff-ui .ignored.triggered{background-color:red}#diff-ui .tab-pane-inner#screenshot{text-align:center}#diff-ui .tab-pane-inner#screenshot img{max-width:99%}#diff-ui .pure-form button.reset-margin{margin:0px}#diff-ui .diff-fieldset{display:flex;align-items:center;gap:4px;flex-wrap:wrap}#diff-ui ul#highlightSnippetActions{list-style-type:none;display:flex;align-items:center;justify-content:center;gap:1.5rem;flex-wrap:wrap;padding:0;margin:0}#diff-ui ul#highlightSnippetActions li{display:flex;flex-direction:column;align-items:center;text-align:center;padding:.5rem;gap:.3rem}#diff-ui ul#highlightSnippetActions li button,#diff-ui ul#highlightSnippetActions li a{white-space:nowrap}#diff-ui ul#highlightSnippetActions span{font-size:.8rem;color:var(--color-text-input-description)}#diff-ui #cell-diff-jump-visualiser{display:flex;flex-direction:row;gap:1px;background:var(--color-background);border-radius:3px;overflow-x:hidden;position:sticky;top:0;z-index:10;padding-top:1rem;padding-bottom:1rem;justify-content:center}#diff-ui #cell-diff-jump-visualiser>div{flex:1;min-width:1px;max-width:10px;height:10px;background:var(--color-background-button-cancel);opacity:.3;border-radius:1px;transition:opacity .2s;position:relative}#diff-ui #cell-diff-jump-visualiser>div.deletion{background:#b30000;opacity:1}#diff-ui #cell-diff-jump-visualiser>div.insertion{background:#406619;opacity:1}#diff-ui #cell-diff-jump-visualiser>div.note{background:#406619;opacity:1}#diff-ui #cell-diff-jump-visualiser>div.mixed{background:linear-gradient(to right, #b30000 50%, #406619 50%);opacity:1}#diff-ui #cell-diff-jump-visualiser>div.current-position::after{content:"";position:absolute;bottom:-6px;left:50%;transform:translateX(-50%);width:0;height:0;border-left:4px solid rgba(0,0,0,0);border-right:4px solid rgba(0,0,0,0);border-bottom:4px solid var(--color-text)}#diff-ui #cell-diff-jump-visualiser>div:hover{opacity:.8;cursor:pointer}#text-diff-heading-area .snapshot-age{padding:4px;margin:.5rem 0;background-color:var(--color-background-snapshot-age);border-radius:3px;font-weight:bold;margin-bottom:4px}#text-diff-heading-area .snapshot-age.error{background-color:var(--color-error-background-snapshot-age);color:var(--color-error-text-snapshot-age)}#text-diff-heading-area .snapshot-age>*{padding-right:1rem}
|
||||
#diff-form{background:rgba(0,0,0,.05);padding:1em;border-radius:10px;margin-bottom:1em;color:#fff;font-size:.9rem;text-align:center}#diff-form label.from-to-label{width:4rem;text-decoration:none;padding:.5rem}#diff-form label.from-to-label#change-from{color:#b30000;background:#fadad7}#diff-form label.from-to-label#change-to{background:#eaf2c2;color:#406619}#diff-form #diff-style>span{display:inline-block;padding:.3em}#diff-form #diff-style>span label{font-weight:normal}#diff-form *{vertical-align:middle}body.difference-page section.content{padding-top:40px}#diff-ui{background:var(--color-background);padding:1rem;border-radius:5px}@media(min-width: 767px){#diff-ui{min-width:50%}}#diff-ui #text{font-size:11px}#diff-ui pre{white-space:break-spaces;overflow-wrap:anywhere}#diff-ui h1{display:inline;font-size:100%}#diff-ui #result{white-space:pre-wrap;word-break:break-word;overflow-wrap:break-word}#diff-ui .source{position:absolute;right:1%;top:.2em}@-moz-document url-prefix(){#diff-ui body{height:99%}}#diff-ui td#diff-col div{text-align:justify;white-space:pre-wrap}#diff-ui .ignored{background-color:#ccc;opacity:.7}#diff-ui .triggered{background-color:#1b98f8}#diff-ui .ignored.triggered{background-color:red}#diff-ui .tab-pane-inner#screenshot{text-align:center}#diff-ui .tab-pane-inner#screenshot img{max-width:99%}#diff-ui .pure-form button.reset-margin{margin:0px}#diff-ui .diff-fieldset{display:flex;align-items:center;gap:4px;flex-wrap:wrap}#diff-ui ul#highlightSnippetActions{list-style-type:none;display:flex;align-items:center;justify-content:center;gap:1.5rem;flex-wrap:wrap;padding:0;margin:0}#diff-ui ul#highlightSnippetActions li{display:flex;flex-direction:column;align-items:center;text-align:center;padding:.5rem;gap:.3rem}#diff-ui ul#highlightSnippetActions li button,#diff-ui ul#highlightSnippetActions li a{white-space:nowrap}#diff-ui ul#highlightSnippetActions span{font-size:.8rem;color:var(--color-text-input-description)}#diff-ui #cell-diff-jump-visualiser{display:flex;flex-direction:row;gap:1px;background:var(--color-background);border-radius:3px;overflow-x:hidden;position:sticky;top:0;z-index:10;padding-top:1rem;padding-bottom:1rem;justify-content:center}#diff-ui #cell-diff-jump-visualiser>div{flex:1;min-width:1px;max-width:10px;height:10px;background:var(--color-background-button-cancel);opacity:.3;border-radius:1px;transition:opacity .2s;position:relative}#diff-ui #cell-diff-jump-visualiser>div.deletion{background:#b30000;opacity:1}#diff-ui #cell-diff-jump-visualiser>div.insertion{background:#406619;opacity:1}#diff-ui #cell-diff-jump-visualiser>div.note{background:#406619;opacity:1}#diff-ui #cell-diff-jump-visualiser>div.mixed{background:linear-gradient(to right, #b30000 50%, #406619 50%);opacity:1}#diff-ui #cell-diff-jump-visualiser>div.current-position::after{content:"";position:absolute;bottom:-6px;left:50%;transform:translateX(-50%);width:0;height:0;border-left:4px solid rgba(0,0,0,0);border-right:4px solid rgba(0,0,0,0);border-bottom:4px solid var(--color-text)}#diff-ui #cell-diff-jump-visualiser>div:hover{opacity:.8;cursor:pointer}#text-diff-heading-area .snapshot-age{padding:4px;margin:.5rem 0;background-color:var(--color-background-snapshot-age);border-radius:3px;font-weight:bold;margin-bottom:4px}#text-diff-heading-area .snapshot-age.error{background-color:var(--color-error-background-snapshot-age);color:var(--color-error-text-snapshot-age)}#text-diff-heading-area .snapshot-age>*{padding-right:1rem}
|
||||
|
||||
@@ -62,6 +62,7 @@ body.difference-page {
|
||||
|
||||
pre {
|
||||
white-space: break-spaces;
|
||||
overflow-wrap: anywhere;
|
||||
}
|
||||
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -22,6 +22,8 @@ import uuid as uuid_builder
|
||||
from loguru import logger
|
||||
from blinker import signal
|
||||
|
||||
from ..model.Tags import TagsDict
|
||||
|
||||
# Try to import orjson for faster JSON serialization
|
||||
try:
|
||||
import orjson
|
||||
@@ -121,6 +123,11 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
|
||||
if 'application' in settings_data['settings']:
|
||||
self.__data['settings']['application'].update(settings_data['settings']['application'])
|
||||
|
||||
# Use our Tags dict with cleanup helpers etc
|
||||
# @todo Same for Watches
|
||||
existing_tags = settings_data.get('settings', {}).get('application', {}).get('tags') or {}
|
||||
self.__data['settings']['application']['tags'] = TagsDict(existing_tags, datastore_path=self.datastore_path)
|
||||
|
||||
# More or less for the old format which had this data in the one url-watches.json
|
||||
# cant hurt to leave it here,
|
||||
if 'watching' in settings_data:
|
||||
@@ -196,7 +203,7 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
|
||||
self.datastore_path = datastore_path
|
||||
|
||||
# Initialize data structure
|
||||
self.__data = App.model()
|
||||
self.__data = App.model(datastore_path=datastore_path)
|
||||
self.json_store_path = os.path.join(self.datastore_path, "changedetection.json")
|
||||
|
||||
# Base definition for all watchers (deepcopy part of #569)
|
||||
@@ -355,6 +362,9 @@ class ChangeDetectionStore(DatastoreUpdatesMixin, FileSavingDataStore):
|
||||
# Deep copy settings to avoid modifying the original
|
||||
settings_copy = copy.deepcopy(self.__data['settings'])
|
||||
|
||||
# Is saved as {uuid}/tag.json
|
||||
settings_copy['application']['tags'] = {}
|
||||
|
||||
return {
|
||||
'note': 'Settings file - watches are in {uuid}/watch.json, tags are in {uuid}/tag.json',
|
||||
'app_guid': self.__data.get('app_guid'),
|
||||
|
||||
@@ -669,7 +669,9 @@ class DatastoreUpdatesMixin:
|
||||
def update_26(self):
|
||||
self.migrate_legacy_db_format()
|
||||
|
||||
def update_28(self):
|
||||
# Re-run tag to JSON migration
|
||||
def update_29(self):
|
||||
|
||||
"""
|
||||
Migrate tags to individual tag.json files.
|
||||
|
||||
@@ -682,8 +684,6 @@ class DatastoreUpdatesMixin:
|
||||
- Enables independent tag versioning/backup
|
||||
- Maintains backwards compatibility (tags stay in settings too)
|
||||
"""
|
||||
# Force save as tag.json (not watch.json) even if object is corrupted
|
||||
|
||||
logger.critical("=" * 80)
|
||||
logger.critical("Running migration: Individual tag persistence (update_28)")
|
||||
logger.critical("Creating individual tag.json files")
|
||||
@@ -702,6 +702,9 @@ class DatastoreUpdatesMixin:
|
||||
failed_count = 0
|
||||
|
||||
for uuid, tag_data in tags.items():
|
||||
if os.path.isfile(os.path.join(self.datastore_path, uuid, "tag.json")):
|
||||
logger.debug(f"Tag {uuid} tag.json exists, skipping")
|
||||
continue
|
||||
try:
|
||||
tag_data.commit()
|
||||
saved_count += 1
|
||||
@@ -723,3 +726,7 @@ class DatastoreUpdatesMixin:
|
||||
logger.info("Future tag edits will update both locations (dual storage)")
|
||||
logger.critical("=" * 80)
|
||||
|
||||
# write it to disk, it will be saved without ['tags'] in the JSON db because we find it from disk glob
|
||||
# (left this out by accident in previous update, added tags={} in the changedetection.json save_to_disk)
|
||||
self._save_settings()
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
<li>{{ _('Trigger text is processed from the result-text that comes out of any CSS/JSON Filters for this monitor') }}</li>
|
||||
<li>{{ _('Each line is processed separately (think of each line as "OR")') }}</li>
|
||||
<li>{{ _('Note: Wrap in forward slash / to use regex example:') }} <code>/foo\d/</code></li>
|
||||
<li>{{ _('You can also use')}} <a href="#conditions">{{ _('conditions')}}</a> - {{ _('"Page text" - with Contains, Starts With, Not Contains and many more' ) }} <code>/foo\d/</code></li>
|
||||
</ul>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
@@ -199,6 +199,259 @@ class TestHtmlToText(unittest.TestCase):
|
||||
|
||||
print(f"✓ Basic thread-safety test passed: {len(results)} threads, no errors")
|
||||
|
||||
def test_large_html_with_bloated_head(self):
|
||||
"""
|
||||
Test that html_to_text can handle large HTML documents with massive <head> bloat.
|
||||
|
||||
SPAs often dump 10MB+ of styles, scripts, and other bloat into the <head> section.
|
||||
This can cause inscriptis to silently exit when processing very large documents.
|
||||
The fix strips <style>, <script>, <svg>, <noscript>, <link>, <meta>, and HTML comments
|
||||
before processing, allowing extraction of actual body content.
|
||||
"""
|
||||
# Generate massive style block (~5MB)
|
||||
large_style = '<style>' + '.class{color:red;}\n' * 200000 + '</style>\n'
|
||||
|
||||
# Generate massive script block (~5MB)
|
||||
large_script = '<script>' + 'console.log("bloat");\n' * 200000 + '</script>\n'
|
||||
|
||||
# Generate lots of SVG bloat (~3MB)
|
||||
svg_bloat = '<svg><path d="M0,0 L100,100"/></svg>\n' * 50000
|
||||
|
||||
# Generate meta/link tags (~2MB)
|
||||
meta_bloat = '<meta name="description" content="bloat"/>\n' * 50000
|
||||
link_bloat = '<link rel="stylesheet" href="bloat.css"/>\n' * 50000
|
||||
|
||||
# Generate HTML comments (~1MB)
|
||||
comment_bloat = '<!-- This is bloat -->\n' * 50000
|
||||
|
||||
# Generate noscript bloat
|
||||
noscript_bloat = '<noscript>Enable JavaScript</noscript>\n' * 10000
|
||||
|
||||
# Build the large HTML document
|
||||
html = f'''<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Test Page</title>
|
||||
{large_style}
|
||||
{large_script}
|
||||
{svg_bloat}
|
||||
{meta_bloat}
|
||||
{link_bloat}
|
||||
{comment_bloat}
|
||||
{noscript_bloat}
|
||||
</head>
|
||||
<body>
|
||||
<h1>Important Heading</h1>
|
||||
<p>This is the actual content that should be extracted.</p>
|
||||
<div>
|
||||
<p>First paragraph with meaningful text.</p>
|
||||
<p>Second paragraph with more content.</p>
|
||||
</div>
|
||||
<footer>Footer text</footer>
|
||||
</body>
|
||||
</html>
|
||||
'''
|
||||
|
||||
# Verify the HTML is actually large (should be ~20MB+)
|
||||
html_size_mb = len(html) / (1024 * 1024)
|
||||
assert html_size_mb > 15, f"HTML should be >15MB, got {html_size_mb:.2f}MB"
|
||||
|
||||
print(f" Testing {html_size_mb:.2f}MB HTML document with bloated head...")
|
||||
|
||||
# This should not crash or silently exit
|
||||
text = html_to_text(html)
|
||||
|
||||
# Verify we got actual text output (not empty/None)
|
||||
assert text is not None, "html_to_text returned None"
|
||||
assert len(text) > 0, "html_to_text returned empty string"
|
||||
|
||||
# Verify the actual body content was extracted
|
||||
assert 'Important Heading' in text, "Failed to extract heading"
|
||||
assert 'actual content that should be extracted' in text, "Failed to extract paragraph"
|
||||
assert 'First paragraph with meaningful text' in text, "Failed to extract first paragraph"
|
||||
assert 'Second paragraph with more content' in text, "Failed to extract second paragraph"
|
||||
assert 'Footer text' in text, "Failed to extract footer"
|
||||
|
||||
# Verify bloat was stripped (output should be tiny compared to input)
|
||||
text_size_kb = len(text) / 1024
|
||||
assert text_size_kb < 1, f"Output too large ({text_size_kb:.2f}KB), bloat not stripped"
|
||||
|
||||
# Verify no CSS, script content, or SVG leaked through
|
||||
assert 'color:red' not in text, "Style content leaked into text output"
|
||||
assert 'console.log' not in text, "Script content leaked into text output"
|
||||
assert '<path' not in text, "SVG content leaked into text output"
|
||||
assert 'bloat.css' not in text, "Link href leaked into text output"
|
||||
|
||||
print(f" ✓ Successfully processed {html_size_mb:.2f}MB HTML -> {text_size_kb:.2f}KB text")
|
||||
|
||||
def test_body_display_none_spa_pattern(self):
|
||||
"""
|
||||
Test that html_to_text can extract content from pages with display:none body.
|
||||
|
||||
SPAs (Single Page Applications) often use <body style="display:none"> to hide content
|
||||
until JavaScript loads and renders the page. inscriptis respects CSS display rules,
|
||||
so without preprocessing, it would skip all content and return only newlines.
|
||||
|
||||
The fix strips display:none and visibility:hidden styles from the body tag before
|
||||
processing, allowing text extraction from client-side rendered applications.
|
||||
"""
|
||||
# Test case 1: Basic display:none
|
||||
html1 = '''<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head><title>What's New – Fluxguard</title></head>
|
||||
<body style="display:none">
|
||||
<h1>Important Heading</h1>
|
||||
<p>This is actual content that should be extracted.</p>
|
||||
<div>
|
||||
<p>First paragraph with meaningful text.</p>
|
||||
<p>Second paragraph with more content.</p>
|
||||
</div>
|
||||
</body>
|
||||
</html>'''
|
||||
|
||||
text1 = html_to_text(html1)
|
||||
|
||||
# Before fix: would return ~33 newlines, len(text) ~= 33
|
||||
# After fix: should extract actual content, len(text) > 100
|
||||
assert len(text1) > 100, f"Expected substantial text output, got {len(text1)} chars"
|
||||
assert 'Important Heading' in text1, "Failed to extract heading from display:none body"
|
||||
assert 'actual content' in text1, "Failed to extract paragraph from display:none body"
|
||||
assert 'First paragraph' in text1, "Failed to extract nested content"
|
||||
|
||||
# Should not be mostly newlines
|
||||
newline_ratio = text1.count('\n') / len(text1)
|
||||
assert newline_ratio < 0.5, f"Output is mostly newlines ({newline_ratio:.2%}), content not extracted"
|
||||
|
||||
# Test case 2: visibility:hidden (another hiding pattern)
|
||||
html2 = '<html><body style="visibility:hidden"><h1>Hidden Content</h1><p>Test paragraph.</p></body></html>'
|
||||
text2 = html_to_text(html2)
|
||||
|
||||
assert 'Hidden Content' in text2, "Failed to extract content from visibility:hidden body"
|
||||
assert 'Test paragraph' in text2, "Failed to extract paragraph from visibility:hidden body"
|
||||
|
||||
# Test case 3: Mixed styles (display:none with other CSS)
|
||||
html3 = '<html><body style="color: red; display:none; font-size: 12px"><p>Mixed style content</p></body></html>'
|
||||
text3 = html_to_text(html3)
|
||||
|
||||
assert 'Mixed style content' in text3, "Failed to extract content from body with mixed styles"
|
||||
|
||||
# Test case 4: Case insensitivity (DISPLAY:NONE uppercase)
|
||||
html4 = '<html><body style="DISPLAY:NONE"><p>Uppercase style</p></body></html>'
|
||||
text4 = html_to_text(html4)
|
||||
|
||||
assert 'Uppercase style' in text4, "Failed to handle uppercase DISPLAY:NONE"
|
||||
|
||||
# Test case 5: Space variations (display: none vs display:none)
|
||||
html5 = '<html><body style="display: none"><p>With spaces</p></body></html>'
|
||||
text5 = html_to_text(html5)
|
||||
|
||||
assert 'With spaces' in text5, "Failed to handle 'display: none' with space"
|
||||
|
||||
# Test case 6: Body with other attributes (class, id)
|
||||
html6 = '<html><body class="foo" style="display:none" id="bar"><p>With attributes</p></body></html>'
|
||||
text6 = html_to_text(html6)
|
||||
|
||||
assert 'With attributes' in text6, "Failed to extract from body with multiple attributes"
|
||||
|
||||
# Test case 7: Should NOT affect opacity:0 (which doesn't hide from inscriptis)
|
||||
html7 = '<html><body style="opacity:0"><p>Transparent content</p></body></html>'
|
||||
text7 = html_to_text(html7)
|
||||
|
||||
# Opacity doesn't affect inscriptis text extraction, content should be there
|
||||
assert 'Transparent content' in text7, "Incorrectly stripped opacity:0 style"
|
||||
|
||||
print(" ✓ All display:none body tag tests passed")
|
||||
|
||||
def test_style_tag_with_svg_data_uri(self):
|
||||
"""
|
||||
Test that style tags containing SVG data URIs are properly stripped.
|
||||
|
||||
Some WordPress and modern sites embed SVG as data URIs in CSS, which contains
|
||||
<svg> and </svg> tags within the style content. The regex must use backreferences
|
||||
to ensure <style> matches </style> (not </svg> inside the CSS).
|
||||
|
||||
This was causing errors where the regex would match <style> and stop at the first
|
||||
</svg> it encountered inside a CSS data URI, breaking the HTML structure.
|
||||
"""
|
||||
# Real-world example from WordPress wp-block-image styles
|
||||
html = '''<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<style id='wp-block-image-inline-css'>
|
||||
.wp-block-image>a,.wp-block-image>figure>a{display:inline-block}.wp-block-image img{box-sizing:border-box;height:auto;max-width:100%;vertical-align:bottom}@supports ((-webkit-mask-image:none) or (mask-image:none)) or (-webkit-mask-image:none){.wp-block-image.is-style-circle-mask img{border-radius:0;-webkit-mask-image:url('data:image/svg+xml;utf8,<svg viewBox="0 0 100 100" xmlns="http://www.w3.org/2000/svg"><circle cx="50" cy="50" r="50"/></svg>');mask-image:url('data:image/svg+xml;utf8,<svg viewBox="0 0 100 100" xmlns="http://www.w3.org/2000/svg"><circle cx="50" cy="50" r="50"/></svg>');mask-mode:alpha}}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Test Heading</h1>
|
||||
<p>This is the actual content that should be extracted.</p>
|
||||
<div class="wp-block-image">
|
||||
<img src="test.jpg" alt="Test image">
|
||||
</div>
|
||||
</body>
|
||||
</html>'''
|
||||
|
||||
# This should not crash and should extract the body content
|
||||
text = html_to_text(html)
|
||||
|
||||
# Verify the actual body content was extracted
|
||||
assert text is not None, "html_to_text returned None"
|
||||
assert len(text) > 0, "html_to_text returned empty string"
|
||||
assert 'Test Heading' in text, "Failed to extract heading"
|
||||
assert 'actual content that should be extracted' in text, "Failed to extract paragraph"
|
||||
|
||||
# Verify CSS content was stripped (including the SVG data URI)
|
||||
assert '.wp-block-image' not in text, "CSS class selector leaked into text"
|
||||
assert 'mask-image' not in text, "CSS property leaked into text"
|
||||
assert 'data:image/svg+xml' not in text, "SVG data URI leaked into text"
|
||||
assert 'viewBox' not in text, "SVG attributes leaked into text"
|
||||
|
||||
# Verify no broken HTML structure
|
||||
assert '<style' not in text, "Unclosed style tag in output"
|
||||
assert '</svg>' not in text, "SVG closing tag leaked into text"
|
||||
|
||||
print(" ✓ Style tag with SVG data URI test passed")
|
||||
|
||||
def test_style_tag_closes_correctly(self):
|
||||
"""
|
||||
Test that each tag type (style, script, svg) closes with the correct closing tag.
|
||||
|
||||
Before the fix, the regex used (?:style|script|svg|noscript) for both opening and
|
||||
closing tags, which meant <style> could incorrectly match </svg> as its closing tag.
|
||||
With backreferences, <style> must close with </style>, <svg> with </svg>, etc.
|
||||
"""
|
||||
# Test nested tags where incorrect matching would break
|
||||
html = '''<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
body { background: url('data:image/svg+xml,<svg><rect/></svg>'); }
|
||||
</style>
|
||||
<script>
|
||||
const svg = '<svg><path d="M0,0"/></svg>';
|
||||
</script>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Content</h1>
|
||||
<svg><circle cx="50" cy="50" r="40"/></svg>
|
||||
<p>After SVG</p>
|
||||
</body>
|
||||
</html>'''
|
||||
|
||||
text = html_to_text(html)
|
||||
|
||||
# Should extract body content
|
||||
assert 'Content' in text, "Failed to extract heading"
|
||||
assert 'After SVG' in text, "Failed to extract content after SVG"
|
||||
|
||||
# Should strip all style/script/svg content
|
||||
assert 'background:' not in text, "Style content leaked"
|
||||
assert 'const svg' not in text, "Script content leaked"
|
||||
assert '<circle' not in text, "SVG element leaked"
|
||||
assert 'data:image/svg+xml' not in text, "Data URI leaked"
|
||||
|
||||
print(" ✓ Tag closing validation test passed")
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Can run this file directly for quick testing
|
||||
|
||||
@@ -8,6 +8,7 @@ python3 -m pytest changedetectionio/tests/unit/test_time_handler.py -v
|
||||
"""
|
||||
|
||||
import unittest
|
||||
import unittest.mock
|
||||
import arrow
|
||||
from changedetectionio import time_handler
|
||||
|
||||
@@ -240,6 +241,211 @@ class TestAmIInsideTime(unittest.TestCase):
|
||||
# Result depends on current time
|
||||
self.assertIsInstance(result, bool)
|
||||
|
||||
def test_24_hour_schedule_from_midnight(self):
|
||||
"""Test 24-hour schedule starting at midnight covers entire day."""
|
||||
timezone_str = 'UTC'
|
||||
# Test at a specific time: Monday 00:00
|
||||
test_time = arrow.get('2024-01-01 00:00:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
|
||||
day_of_week = test_time.format('dddd') # Monday
|
||||
|
||||
# Mock current time for testing
|
||||
with unittest.mock.patch('arrow.now', return_value=test_time):
|
||||
result = time_handler.am_i_inside_time(
|
||||
day_of_week=day_of_week,
|
||||
time_str="00:00",
|
||||
timezone_str=timezone_str,
|
||||
duration=1440 # 24 hours
|
||||
)
|
||||
self.assertTrue(result, "Should be active at start of 24-hour schedule")
|
||||
|
||||
def test_24_hour_schedule_at_end_of_day(self):
|
||||
"""Test 24-hour schedule is active at 23:59:59."""
|
||||
timezone_str = 'UTC'
|
||||
# Test at Monday 23:59:59
|
||||
test_time = arrow.get('2024-01-01 23:59:59', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
|
||||
day_of_week = test_time.format('dddd') # Monday
|
||||
|
||||
with unittest.mock.patch('arrow.now', return_value=test_time):
|
||||
result = time_handler.am_i_inside_time(
|
||||
day_of_week=day_of_week,
|
||||
time_str="00:00",
|
||||
timezone_str=timezone_str,
|
||||
duration=1440 # 24 hours
|
||||
)
|
||||
self.assertTrue(result, "Should be active at end of 24-hour schedule")
|
||||
|
||||
def test_24_hour_schedule_at_midnight_transition(self):
|
||||
"""Test 24-hour schedule at exactly midnight transition."""
|
||||
timezone_str = 'UTC'
|
||||
# Test at Tuesday 00:00:00 (end of Monday's 24-hour schedule)
|
||||
test_time = arrow.get('2024-01-02 00:00:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
|
||||
monday = test_time.shift(days=-1).format('dddd') # Monday
|
||||
|
||||
with unittest.mock.patch('arrow.now', return_value=test_time):
|
||||
result = time_handler.am_i_inside_time(
|
||||
day_of_week=monday,
|
||||
time_str="00:00",
|
||||
timezone_str=timezone_str,
|
||||
duration=1440 # 24 hours
|
||||
)
|
||||
self.assertTrue(result, "Should include exactly midnight at end of 24-hour schedule")
|
||||
|
||||
def test_schedule_crosses_midnight_before_midnight(self):
|
||||
"""Test schedule crossing midnight - before midnight."""
|
||||
timezone_str = 'UTC'
|
||||
# Monday 23:30
|
||||
test_time = arrow.get('2024-01-01 23:30:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
|
||||
day_of_week = test_time.format('dddd') # Monday
|
||||
|
||||
with unittest.mock.patch('arrow.now', return_value=test_time):
|
||||
result = time_handler.am_i_inside_time(
|
||||
day_of_week=day_of_week,
|
||||
time_str="23:00",
|
||||
timezone_str=timezone_str,
|
||||
duration=120 # 2 hours (until 01:00 next day)
|
||||
)
|
||||
self.assertTrue(result, "Should be active before midnight in cross-midnight schedule")
|
||||
|
||||
def test_schedule_crosses_midnight_after_midnight(self):
|
||||
"""Test schedule crossing midnight - after midnight."""
|
||||
timezone_str = 'UTC'
|
||||
# Tuesday 00:30
|
||||
test_time = arrow.get('2024-01-02 00:30:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
|
||||
monday = test_time.shift(days=-1).format('dddd') # Monday
|
||||
|
||||
with unittest.mock.patch('arrow.now', return_value=test_time):
|
||||
result = time_handler.am_i_inside_time(
|
||||
day_of_week=monday,
|
||||
time_str="23:00",
|
||||
timezone_str=timezone_str,
|
||||
duration=120 # 2 hours (until 01:00 Tuesday)
|
||||
)
|
||||
self.assertTrue(result, "Should be active after midnight in cross-midnight schedule")
|
||||
|
||||
def test_schedule_crosses_midnight_at_exact_end(self):
|
||||
"""Test schedule crossing midnight at exact end time."""
|
||||
timezone_str = 'UTC'
|
||||
# Tuesday 01:00 (exact end of Monday 23:00 + 120 minutes)
|
||||
test_time = arrow.get('2024-01-02 01:00:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
|
||||
monday = test_time.shift(days=-1).format('dddd') # Monday
|
||||
|
||||
with unittest.mock.patch('arrow.now', return_value=test_time):
|
||||
result = time_handler.am_i_inside_time(
|
||||
day_of_week=monday,
|
||||
time_str="23:00",
|
||||
timezone_str=timezone_str,
|
||||
duration=120 # 2 hours
|
||||
)
|
||||
self.assertTrue(result, "Should include exact end time of schedule")
|
||||
|
||||
def test_duration_60_minutes(self):
|
||||
"""Test that duration of 60 minutes works correctly."""
|
||||
timezone_str = 'UTC'
|
||||
test_time = arrow.get('2024-01-01 12:30:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
|
||||
day_of_week = test_time.format('dddd')
|
||||
|
||||
with unittest.mock.patch('arrow.now', return_value=test_time):
|
||||
result = time_handler.am_i_inside_time(
|
||||
day_of_week=day_of_week,
|
||||
time_str="12:00",
|
||||
timezone_str=timezone_str,
|
||||
duration=60 # Exactly 60 minutes
|
||||
)
|
||||
self.assertTrue(result, "60-minute duration should work")
|
||||
|
||||
def test_duration_at_exact_end_minute(self):
|
||||
"""Test at exact end of 60-minute window."""
|
||||
timezone_str = 'UTC'
|
||||
# Exactly 13:00 (end of 12:00 + 60 minutes)
|
||||
test_time = arrow.get('2024-01-01 13:00:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
|
||||
day_of_week = test_time.format('dddd')
|
||||
|
||||
with unittest.mock.patch('arrow.now', return_value=test_time):
|
||||
result = time_handler.am_i_inside_time(
|
||||
day_of_week=day_of_week,
|
||||
time_str="12:00",
|
||||
timezone_str=timezone_str,
|
||||
duration=60
|
||||
)
|
||||
self.assertTrue(result, "Should include exact end minute")
|
||||
|
||||
def test_one_second_after_schedule_ends(self):
|
||||
"""Test one second after schedule should end."""
|
||||
timezone_str = 'UTC'
|
||||
# 13:00:01 (one second after 12:00 + 60 minutes)
|
||||
test_time = arrow.get('2024-01-01 13:00:01', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
|
||||
day_of_week = test_time.format('dddd')
|
||||
|
||||
with unittest.mock.patch('arrow.now', return_value=test_time):
|
||||
result = time_handler.am_i_inside_time(
|
||||
day_of_week=day_of_week,
|
||||
time_str="12:00",
|
||||
timezone_str=timezone_str,
|
||||
duration=60
|
||||
)
|
||||
self.assertFalse(result, "Should be False one second after schedule ends")
|
||||
|
||||
def test_multi_day_schedule(self):
|
||||
"""Test schedule longer than 24 hours (48 hours)."""
|
||||
timezone_str = 'UTC'
|
||||
# Tuesday 12:00 (36 hours after Monday 00:00)
|
||||
test_time = arrow.get('2024-01-02 12:00:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
|
||||
monday = test_time.shift(days=-1).format('dddd')
|
||||
|
||||
with unittest.mock.patch('arrow.now', return_value=test_time):
|
||||
result = time_handler.am_i_inside_time(
|
||||
day_of_week=monday,
|
||||
time_str="00:00",
|
||||
timezone_str=timezone_str,
|
||||
duration=2880 # 48 hours
|
||||
)
|
||||
self.assertTrue(result, "Should support multi-day schedules")
|
||||
|
||||
def test_schedule_one_minute_duration(self):
|
||||
"""Test very short 1-minute schedule."""
|
||||
timezone_str = 'UTC'
|
||||
test_time = arrow.get('2024-01-01 12:00:30', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
|
||||
day_of_week = test_time.format('dddd')
|
||||
|
||||
with unittest.mock.patch('arrow.now', return_value=test_time):
|
||||
result = time_handler.am_i_inside_time(
|
||||
day_of_week=day_of_week,
|
||||
time_str="12:00",
|
||||
timezone_str=timezone_str,
|
||||
duration=1 # Just 1 minute
|
||||
)
|
||||
self.assertTrue(result, "1-minute schedule should work")
|
||||
|
||||
def test_schedule_at_exact_start_time(self):
|
||||
"""Test at exact start time (00:00:00.000000)."""
|
||||
timezone_str = 'UTC'
|
||||
test_time = arrow.get('2024-01-01 12:00:00.000000', 'YYYY-MM-DD HH:mm:ss.SSSSSS').replace(tzinfo=timezone_str)
|
||||
day_of_week = test_time.format('dddd')
|
||||
|
||||
with unittest.mock.patch('arrow.now', return_value=test_time):
|
||||
result = time_handler.am_i_inside_time(
|
||||
day_of_week=day_of_week,
|
||||
time_str="12:00",
|
||||
timezone_str=timezone_str,
|
||||
duration=30
|
||||
)
|
||||
self.assertTrue(result, "Should include exact start time")
|
||||
|
||||
def test_schedule_one_microsecond_before_start(self):
|
||||
"""Test one microsecond before schedule starts."""
|
||||
timezone_str = 'UTC'
|
||||
test_time = arrow.get('2024-01-01 11:59:59.999999', 'YYYY-MM-DD HH:mm:ss.SSSSSS').replace(tzinfo=timezone_str)
|
||||
day_of_week = test_time.format('dddd')
|
||||
|
||||
with unittest.mock.patch('arrow.now', return_value=test_time):
|
||||
result = time_handler.am_i_inside_time(
|
||||
day_of_week=day_of_week,
|
||||
time_str="12:00",
|
||||
timezone_str=timezone_str,
|
||||
duration=30
|
||||
)
|
||||
self.assertFalse(result, "Should not include time before start")
|
||||
|
||||
|
||||
class TestIsWithinSchedule(unittest.TestCase):
|
||||
"""Tests for the is_within_schedule function."""
|
||||
@@ -405,6 +611,175 @@ class TestIsWithinSchedule(unittest.TestCase):
|
||||
result = time_handler.is_within_schedule(time_schedule_limit)
|
||||
self.assertTrue(result, "Should handle timezone with whitespace")
|
||||
|
||||
def test_schedule_with_60_minutes(self):
|
||||
"""Test schedule with duration of 0 hours and 60 minutes."""
|
||||
timezone_str = 'UTC'
|
||||
now = arrow.now(timezone_str)
|
||||
current_day = now.format('dddd').lower()
|
||||
current_hour = now.format('HH:00')
|
||||
|
||||
time_schedule_limit = {
|
||||
'enabled': True,
|
||||
'timezone': timezone_str,
|
||||
current_day: {
|
||||
'enabled': True,
|
||||
'start_time': current_hour,
|
||||
'duration': {'hours': 0, 'minutes': 60} # 60 minutes
|
||||
}
|
||||
}
|
||||
|
||||
result = time_handler.is_within_schedule(time_schedule_limit)
|
||||
self.assertTrue(result, "Should accept 60 minutes as valid duration")
|
||||
|
||||
def test_schedule_with_24_hours(self):
|
||||
"""Test schedule with duration of 24 hours and 0 minutes."""
|
||||
timezone_str = 'UTC'
|
||||
now = arrow.now(timezone_str)
|
||||
current_day = now.format('dddd').lower()
|
||||
start_hour = now.format('HH:00')
|
||||
|
||||
time_schedule_limit = {
|
||||
'enabled': True,
|
||||
'timezone': timezone_str,
|
||||
current_day: {
|
||||
'enabled': True,
|
||||
'start_time': start_hour,
|
||||
'duration': {'hours': 24, 'minutes': 0} # Full 24 hours
|
||||
}
|
||||
}
|
||||
|
||||
result = time_handler.is_within_schedule(time_schedule_limit)
|
||||
self.assertTrue(result, "Should accept 24 hours as valid duration")
|
||||
|
||||
def test_schedule_with_90_minutes(self):
|
||||
"""Test schedule with duration of 0 hours and 90 minutes."""
|
||||
timezone_str = 'UTC'
|
||||
now = arrow.now(timezone_str)
|
||||
current_day = now.format('dddd').lower()
|
||||
current_hour = now.format('HH:00')
|
||||
|
||||
time_schedule_limit = {
|
||||
'enabled': True,
|
||||
'timezone': timezone_str,
|
||||
current_day: {
|
||||
'enabled': True,
|
||||
'start_time': current_hour,
|
||||
'duration': {'hours': 0, 'minutes': 90} # 90 minutes = 1.5 hours
|
||||
}
|
||||
}
|
||||
|
||||
result = time_handler.is_within_schedule(time_schedule_limit)
|
||||
self.assertTrue(result, "Should accept 90 minutes as valid duration")
|
||||
|
||||
def test_schedule_24_hours_from_midnight(self):
|
||||
"""Test 24-hour schedule from midnight using is_within_schedule."""
|
||||
timezone_str = 'UTC'
|
||||
test_time = arrow.get('2024-01-01 12:00:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
|
||||
current_day = test_time.format('dddd').lower() # monday
|
||||
|
||||
time_schedule_limit = {
|
||||
'enabled': True,
|
||||
'timezone': timezone_str,
|
||||
current_day: {
|
||||
'enabled': True,
|
||||
'start_time': '00:00',
|
||||
'duration': {'hours': 24, 'minutes': 0}
|
||||
}
|
||||
}
|
||||
|
||||
with unittest.mock.patch('arrow.now', return_value=test_time):
|
||||
result = time_handler.is_within_schedule(time_schedule_limit)
|
||||
self.assertTrue(result, "24-hour schedule from midnight should cover entire day")
|
||||
|
||||
def test_schedule_24_hours_at_end_of_day(self):
|
||||
"""Test 24-hour schedule at 23:59 using is_within_schedule."""
|
||||
timezone_str = 'UTC'
|
||||
test_time = arrow.get('2024-01-01 23:59:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
|
||||
current_day = test_time.format('dddd').lower()
|
||||
|
||||
time_schedule_limit = {
|
||||
'enabled': True,
|
||||
'timezone': timezone_str,
|
||||
current_day: {
|
||||
'enabled': True,
|
||||
'start_time': '00:00',
|
||||
'duration': {'hours': 24, 'minutes': 0}
|
||||
}
|
||||
}
|
||||
|
||||
with unittest.mock.patch('arrow.now', return_value=test_time):
|
||||
result = time_handler.is_within_schedule(time_schedule_limit)
|
||||
self.assertTrue(result, "Should be active at 23:59 in 24-hour schedule")
|
||||
|
||||
def test_schedule_crosses_midnight_with_is_within_schedule(self):
|
||||
"""Test schedule crossing midnight using is_within_schedule."""
|
||||
timezone_str = 'UTC'
|
||||
# Tuesday 00:30
|
||||
test_time = arrow.get('2024-01-02 00:30:00', 'YYYY-MM-DD HH:mm:ss').replace(tzinfo=timezone_str)
|
||||
# Get Monday as that's when the schedule started
|
||||
monday = test_time.shift(days=-1).format('dddd').lower()
|
||||
|
||||
time_schedule_limit = {
|
||||
'enabled': True,
|
||||
'timezone': timezone_str,
|
||||
'monday': {
|
||||
'enabled': True,
|
||||
'start_time': '23:00',
|
||||
'duration': {'hours': 2, 'minutes': 0} # Until 01:00 Tuesday
|
||||
},
|
||||
'tuesday': {
|
||||
'enabled': False,
|
||||
'start_time': '09:00',
|
||||
'duration': {'hours': 8, 'minutes': 0}
|
||||
}
|
||||
}
|
||||
|
||||
with unittest.mock.patch('arrow.now', return_value=test_time):
|
||||
result = time_handler.is_within_schedule(time_schedule_limit)
|
||||
# Note: This checks Tuesday's schedule, not Monday's overlap
|
||||
# So it should be False because Tuesday is disabled
|
||||
self.assertFalse(result, "Should check current day (Tuesday), which is disabled")
|
||||
|
||||
def test_schedule_with_mixed_hours_minutes(self):
|
||||
"""Test schedule with both hours and minutes (23 hours 60 minutes = 24 hours)."""
|
||||
timezone_str = 'UTC'
|
||||
now = arrow.now(timezone_str)
|
||||
current_day = now.format('dddd').lower()
|
||||
current_hour = now.format('HH:00')
|
||||
|
||||
time_schedule_limit = {
|
||||
'enabled': True,
|
||||
'timezone': timezone_str,
|
||||
current_day: {
|
||||
'enabled': True,
|
||||
'start_time': current_hour,
|
||||
'duration': {'hours': 23, 'minutes': 60} # = 1440 minutes = 24 hours
|
||||
}
|
||||
}
|
||||
|
||||
result = time_handler.is_within_schedule(time_schedule_limit)
|
||||
self.assertTrue(result, "Should handle 23 hours + 60 minutes = 24 hours")
|
||||
|
||||
def test_schedule_48_hours(self):
|
||||
"""Test schedule with 48-hour duration."""
|
||||
timezone_str = 'UTC'
|
||||
now = arrow.now(timezone_str)
|
||||
current_day = now.format('dddd').lower()
|
||||
start_hour = now.format('HH:00')
|
||||
|
||||
time_schedule_limit = {
|
||||
'enabled': True,
|
||||
'timezone': timezone_str,
|
||||
current_day: {
|
||||
'enabled': True,
|
||||
'start_time': start_hour,
|
||||
'duration': {'hours': 48, 'minutes': 0} # 2 full days
|
||||
}
|
||||
}
|
||||
|
||||
result = time_handler.is_within_schedule(time_schedule_limit)
|
||||
self.assertTrue(result, "Should support 48-hour (multi-day) schedules")
|
||||
|
||||
|
||||
class TestWeekdayEnum(unittest.TestCase):
|
||||
"""Tests for the Weekday enum."""
|
||||
|
||||
@@ -62,19 +62,19 @@ def am_i_inside_time(
|
||||
# Calculate start and end times for the overlap from the previous day
|
||||
start_datetime_tz = start_datetime_tz.shift(days=-1)
|
||||
end_datetime_tz = start_datetime_tz.shift(minutes=duration)
|
||||
if start_datetime_tz <= now_tz < end_datetime_tz:
|
||||
if start_datetime_tz <= now_tz <= end_datetime_tz:
|
||||
return True
|
||||
|
||||
# Handle current day's range
|
||||
if target_weekday == current_weekday:
|
||||
end_datetime_tz = start_datetime_tz.shift(minutes=duration)
|
||||
if start_datetime_tz <= now_tz < end_datetime_tz:
|
||||
if start_datetime_tz <= now_tz <= end_datetime_tz:
|
||||
return True
|
||||
|
||||
# Handle next day's overlap
|
||||
if target_weekday == (current_weekday + 1) % 7:
|
||||
end_datetime_tz = start_datetime_tz.shift(minutes=duration)
|
||||
if now_tz < start_datetime_tz and now_tz.shift(days=1) < end_datetime_tz:
|
||||
if now_tz < start_datetime_tz and now_tz.shift(days=1) <= end_datetime_tz:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
@@ -4,11 +4,10 @@ import changedetectionio.content_fetchers.exceptions as content_fetchers_excepti
|
||||
from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
|
||||
from changedetectionio import html_tools
|
||||
from changedetectionio import worker_pool
|
||||
from changedetectionio.flask_app import watch_check_update
|
||||
from changedetectionio.queuedWatchMetaData import PrioritizedItem
|
||||
from changedetectionio.pluggy_interface import apply_update_handler_alter, apply_update_finalize
|
||||
|
||||
import asyncio
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
@@ -56,6 +55,7 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
|
||||
while not app.config.exit.is_set():
|
||||
update_handler = None
|
||||
watch = None
|
||||
processing_exception = None # Reset at start of each iteration to prevent state bleeding
|
||||
|
||||
try:
|
||||
# Efficient blocking via run_in_executor (no polling overhead!)
|
||||
@@ -119,7 +119,7 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
|
||||
# to prevent race condition with wait_for_all_checks()
|
||||
|
||||
fetch_start_time = round(time.time())
|
||||
|
||||
|
||||
try:
|
||||
if uuid in list(datastore.data['watching'].keys()) and datastore.data['watching'][uuid].get('url'):
|
||||
changed_detected = False
|
||||
@@ -136,6 +136,8 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
|
||||
logger.info(f"Worker {worker_id} processing watch UUID {uuid} Priority {queued_item_data.priority} URL {watch['url']}")
|
||||
|
||||
try:
|
||||
# Retrieve signal by name to ensure thread-safe access across worker threads
|
||||
watch_check_update = signal('watch_check_update')
|
||||
watch_check_update.send(watch_uuid=uuid)
|
||||
|
||||
# Processor is what we are using for detecting the "Change"
|
||||
@@ -154,6 +156,9 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
|
||||
update_handler = processor_module.perform_site_check(datastore=datastore,
|
||||
watch_uuid=uuid)
|
||||
|
||||
# Allow plugins to modify/wrap the update_handler
|
||||
update_handler = apply_update_handler_alter(update_handler, watch, datastore)
|
||||
|
||||
update_signal = signal('watch_small_status_comment')
|
||||
update_signal.send(watch_uuid=uuid, status="Fetching page..")
|
||||
|
||||
@@ -473,8 +478,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
|
||||
logger.exception(f"Worker {worker_id} full exception details:")
|
||||
logger.warning(f"UUID: {uuid} Exception when extracting <title> - {str(e)}")
|
||||
|
||||
|
||||
|
||||
# Store favicon if necessary
|
||||
if update_handler.fetcher.favicon_blob and update_handler.fetcher.favicon_blob.get('base64'):
|
||||
watch.bump_favicon(url=update_handler.fetcher.favicon_blob.get('url'),
|
||||
@@ -498,6 +501,8 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
|
||||
gc.collect()
|
||||
|
||||
except Exception as e:
|
||||
# Store the processing exception for plugin finalization hook
|
||||
processing_exception = e
|
||||
|
||||
logger.error(f"Worker {worker_id} unexpected error processing {uuid}: {e}")
|
||||
logger.exception(f"Worker {worker_id} full exception details:")
|
||||
@@ -509,6 +514,11 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
|
||||
finally:
|
||||
# Always cleanup - this runs whether there was an exception or not
|
||||
if uuid:
|
||||
# Capture references for plugin finalize hook BEFORE cleanup
|
||||
# (cleanup may delete these variables, but plugins need the original references)
|
||||
finalize_handler = update_handler # Capture now, before cleanup deletes it
|
||||
finalize_watch = watch # Capture now, before any modifications
|
||||
|
||||
# Call quit() as backup (Puppeteer/Playwright have internal cleanup, but this acts as safety net)
|
||||
try:
|
||||
if update_handler and hasattr(update_handler, 'fetcher') and update_handler.fetcher:
|
||||
@@ -518,12 +528,6 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
|
||||
logger.exception(f"Worker {worker_id} full exception details:")
|
||||
|
||||
try:
|
||||
# Release UUID from processing (thread-safe)
|
||||
worker_pool.release_uuid_from_processing(uuid, worker_id=worker_id)
|
||||
|
||||
# Send completion signal
|
||||
if watch:
|
||||
watch_check_update.send(watch_uuid=watch['uuid'])
|
||||
|
||||
# Clean up all memory references BEFORE garbage collection
|
||||
if update_handler:
|
||||
@@ -547,7 +551,37 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore, exec
|
||||
logger.error(f"Worker {worker_id} error during cleanup: {cleanup_error}")
|
||||
logger.exception(f"Worker {worker_id} full exception details:")
|
||||
|
||||
del(uuid)
|
||||
# Call plugin finalization hook after all cleanup is done
|
||||
# Use captured references from before cleanup
|
||||
try:
|
||||
apply_update_finalize(
|
||||
update_handler=finalize_handler,
|
||||
watch=finalize_watch,
|
||||
datastore=datastore,
|
||||
processing_exception=processing_exception
|
||||
)
|
||||
except Exception as finalize_error:
|
||||
logger.error(f"Worker {worker_id} error in finalize hook: {finalize_error}")
|
||||
logger.exception(f"Worker {worker_id} full exception details:")
|
||||
finally:
|
||||
# Clean up captured references to allow immediate garbage collection
|
||||
del finalize_handler
|
||||
del finalize_watch
|
||||
|
||||
# Release UUID from processing AFTER all cleanup and hooks complete (thread-safe)
|
||||
# This ensures wait_for_all_checks() waits for finalize hooks to complete
|
||||
try:
|
||||
worker_pool.release_uuid_from_processing(uuid, worker_id=worker_id)
|
||||
except Exception as release_error:
|
||||
logger.error(f"Worker {worker_id} error releasing UUID: {release_error}")
|
||||
logger.exception(f"Worker {worker_id} full exception details:")
|
||||
finally:
|
||||
# Send completion signal - retrieve by name to ensure thread-safe access
|
||||
if watch:
|
||||
watch_check_update = signal('watch_check_update')
|
||||
watch_check_update.send(watch_uuid=watch['uuid'])
|
||||
|
||||
del (uuid)
|
||||
|
||||
# Brief pause before continuing to avoid tight error loops (only on error)
|
||||
if 'e' in locals():
|
||||
|
||||
Reference in New Issue
Block a user