Compare commits

..

1 Commits

Author SHA1 Message Date
dgtlmoon
3a0b51053f Dont use .lower() because the html could be very big and this uses a lot of ram 2025-04-11 09:13:34 +02:00
16 changed files with 188 additions and 158 deletions

View File

@@ -1,9 +1,9 @@
recursive-include changedetectionio/api *
recursive-include changedetectionio/apprise_plugin *
recursive-include changedetectionio/blueprint *
recursive-include changedetectionio/content_fetchers *
recursive-include changedetectionio/conditions *
recursive-include changedetectionio/model *
recursive-include changedetectionio/notification *
recursive-include changedetectionio/processors *
recursive-include changedetectionio/static *
recursive-include changedetectionio/templates *

View File

@@ -1,7 +1,5 @@
# Responsible for building the storage dict into a set of rules ("JSON Schema") acceptable via the API
# Probably other ways to solve this when the backend switches to some ORM
from changedetectionio.notification import valid_notification_formats
def build_time_between_check_json_schema():
# Setup time between check schema
@@ -100,6 +98,8 @@ def build_watch_json_schema(d):
}
}
from changedetectionio.notification import valid_notification_formats
schema['properties']['notification_format'] = {'type': 'string',
'enum': list(valid_notification_formats.keys())
}

View File

@@ -4,6 +4,7 @@ from loguru import logger
from changedetectionio.store import ChangeDetectionStore
from changedetectionio.auth_decorator import login_optionally_required
from changedetectionio.notification import process_notification
def construct_blueprint(datastore: ChangeDetectionStore):
notification_blueprint = Blueprint('ui_notification', __name__, template_folder="../ui/templates")
@@ -17,11 +18,8 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Watch_uuid could be unset in the case it's used in tag editor, global settings
import apprise
from changedetectionio.notification.handler import process_notification
from changedetectionio.notification.apprise_plugin.assets import apprise_asset
from changedetectionio.notification.apprise_plugin.custom_handlers import apprise_http_custom_handler
from ...apprise_plugin.assets import apprise_asset
from ...apprise_plugin.custom_handlers import apprise_http_custom_handler # noqa: F401
apobj = apprise.Apprise(asset=apprise_asset)
is_global_settings_form = request.args.get('mode', '') == 'global-settings'

View File

@@ -7,13 +7,13 @@ import os
# Visual Selector scraper - 'Button' is there because some sites have <button>OUT OF STOCK</button>.
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,button'
SCREENSHOT_MAX_HEIGHT_DEFAULT = 20000
SCREENSHOT_MAX_HEIGHT_DEFAULT = 16000
SCREENSHOT_DEFAULT_QUALITY = 40
# Maximum total height for the final image (When in stitch mode).
# We limit this to 16000px due to the huge amount of RAM that was being used
# Example: 16000 × 1400 × 3 = 67,200,000 bytes ≈ 64.1 MB (not including buffers in PIL etc)
SCREENSHOT_MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
# The size at which we will switch to stitching method, when below this (and
# MAX_TOTAL_HEIGHT which can be set by a user) we will use the default

View File

@@ -5,10 +5,13 @@ from urllib.parse import urlparse
from loguru import logger
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_MAX_TOTAL_HEIGHT, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
def capture_full_page(page):
import os
import time
@@ -17,56 +20,84 @@ def capture_full_page(page):
start = time.time()
page_height = page.evaluate("document.documentElement.scrollHeight")
page_width = page.evaluate("document.documentElement.scrollWidth")
original_viewport = page.viewport_size
logger.debug(f"Playwright viewport size {page.viewport_size} page height {page_height} page width {page_width}")
logger.debug(f"Playwright viewport size {page.viewport_size}")
# Use an approach similar to puppeteer: set a larger viewport and take screenshots in chunks
step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Size that won't cause GPU to overflow
screenshot_chunks = []
y = 0
# If page height is larger than current viewport, use a larger viewport for better capturing
if page_height > page.viewport_size['height']:
# Set viewport to a larger size to capture more content at once
page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size})
############################################################
#### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) #####
############################################################
# Capture screenshots in chunks up to the max total height
while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
# Optimization to avoid unnecessary stitching if we can avoid it
# Use the default screenshot method for smaller pages to take advantage
# of GPU and native playwright screenshot optimizations
# - No PIL needed here, no danger of memory leaks, no sub process required
if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ):
logger.debug("Using default screenshot method")
page.request_gc()
page.evaluate(f"window.scrollTo(0, {y})")
page.request_gc()
screenshot_chunks.append(page.screenshot(
screenshot = page.screenshot(
type="jpeg",
full_page=False,
quality=int(os.getenv("SCREENSHOT_QUALITY", 72))
))
y += step_size
quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
full_page=True,
)
page.request_gc()
# Restore original viewport size
page.set_viewport_size({'width': original_viewport['width'], 'height': original_viewport['height']})
# If we have multiple chunks, stitch them together
if len(screenshot_chunks) > 1:
from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
parent_conn, child_conn = Pipe()
p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))
p.start()
screenshot = parent_conn.recv_bytes()
p.join()
logger.debug(
f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
screenshot_chunks = None
logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")
return screenshot
logger.debug(
f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
return screenshot_chunks[0]
###################################################################################
#### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES #####
###################################################################################
# - PIL can easily allocate memory and not release it cleanly
# - Fetching screenshot from playwright seems OK
# Image.new is leaky even with .close()
# So let's prepare all the data chunks and farm them out to a subprocess for clean memory handling
logger.debug(
"Using stitching method for large screenshot because page height exceeds threshold"
)
# Limit the total capture height
capture_height = min(page_height, MAX_TOTAL_HEIGHT)
# Calculate number of chunks needed using ORIGINAL viewport height
num_chunks = (capture_height + page.viewport_size['height'] - 1) // page.viewport_size['height']
screenshot_chunks = []
# Track cumulative paste position
y_offset = 0
for _ in range(num_chunks):
page.request_gc()
page.evaluate(f"window.scrollTo(0, {y_offset})")
page.request_gc()
h = min(page.viewport_size['height'], capture_height - y_offset)
screenshot_chunks.append(page.screenshot(
type="jpeg",
clip={
"x": 0,
"y": 0,
"width": page.viewport_size['width'],
"height": h,
},
quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
))
y_offset += h # maybe better to inspect the image here?
page.request_gc()
# PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling
parent_conn, child_conn = Pipe()
p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height))
p.start()
result = parent_conn.recv_bytes()
p.join()
screenshot_chunks = None
logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s")
return result
class fetcher(Fetcher):
@@ -261,7 +292,6 @@ class fetcher(Fetcher):
self.page.request_gc()
self.content = self.page.content()
self.page.request_gc()
logger.debug(f"Scrape xPath element data in browser done in {time.time() - now:.2f}s")
# Bug 3 in Playwright screenshot handling
@@ -287,11 +317,4 @@ class fetcher(Fetcher):
# Clean up resources properly
context.close()
context = None
self.page.close()
self.page = None
browser.close()
borwser = None

View File

@@ -7,11 +7,10 @@ from urllib.parse import urlparse
from loguru import logger
from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS, \
SCREENSHOT_MAX_TOTAL_HEIGHT
SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, \
BrowserConnectError
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
# Bug 3 in Playwright screenshot handling
@@ -28,53 +27,71 @@ async def capture_full_page(page):
start = time.time()
page_height = await page.evaluate("document.documentElement.scrollHeight")
page_width = await page.evaluate("document.documentElement.scrollWidth")
original_viewport = page.viewport
logger.debug(f"Puppeteer viewport size {page.viewport} page height {page_height} page width {page_width}")
logger.debug(f"Puppeteer viewport size {page.viewport}")
# Bug 3 in Playwright screenshot handling
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
# JPEG is better here because the screenshots can be very very large
############################################################
#### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) #####
############################################################
# Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
# which will significantly increase the IO size between the server and client, it's recommended to use the lowest
# acceptable screenshot quality here
step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Something that will not cause the GPU to overflow when taking the screenshot
screenshot_chunks = []
y = 0
if page_height > page.viewport['height']:
await page.setViewport({'width': page.viewport['width'], 'height': step_size})
while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
await page.evaluate(f"window.scrollTo(0, {y})")
screenshot_chunks.append(await page.screenshot(type_='jpeg',
fullPage=False,
quality=int(os.getenv("SCREENSHOT_QUALITY", 72))))
y += step_size
await page.setViewport({'width': original_viewport['width'], 'height': original_viewport['height']})
if len(screenshot_chunks) > 1:
from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
parent_conn, child_conn = Pipe()
p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))
p.start()
screenshot = parent_conn.recv_bytes()
p.join()
logger.debug(
f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
screenshot_chunks = None
# Optimization to avoid unnecessary stitching if we can avoid it
# Use the default screenshot method for smaller pages to take advantage
# of GPU and native playwright screenshot optimizations
# - No PIL needed here, no danger of memory leaks, no sub process required
if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ):
logger.debug("Using default screenshot method")
await page.evaluate(f"window.scrollTo(0, 0)")
screenshot = await page.screenshot(
type_="jpeg",
quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
fullPage=True,
)
logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")
return screenshot
###################################################################################
#### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES #####
###################################################################################
# - PIL can easily allocate memory and not release it cleanly
# - Fetching screenshot from playwright seems OK
# Image.new is leaky even with .close()
# So let's prepare all the data chunks and farm them out to a subprocess for clean memory handling
logger.debug(
f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
return screenshot_chunks[0]
"Using stitching method for large screenshot because page height exceeds threshold"
)
# Limit the total capture height
capture_height = min(page_height, MAX_TOTAL_HEIGHT)
# Calculate number of chunks needed using ORIGINAL viewport height
num_chunks = (capture_height + page.viewport['height'] - 1) // page.viewport['height']
screenshot_chunks = []
# Track cumulative paste position
y_offset = 0
for _ in range(num_chunks):
await page.evaluate(f"window.scrollTo(0, {y_offset})")
h = min(page.viewport['height'], capture_height - y_offset)
screenshot_chunks.append(await page.screenshot(
type_="jpeg",
quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
))
y_offset += h # maybe better to inspect the image here?
# PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling
parent_conn, child_conn = Pipe()
p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height))
p.start()
result = parent_conn.recv_bytes()
p.join()
screenshot_chunks = None
logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s")
return result
class fetcher(Fetcher):

View File

@@ -514,8 +514,7 @@ def notification_runner():
sent_obj = None
try:
from changedetectionio.notification.handler import process_notification
from changedetectionio import notification
# Fallback to system config if not set
if not n_object.get('notification_body') and datastore.data['settings']['application'].get('notification_body'):
n_object['notification_body'] = datastore.data['settings']['application'].get('notification_body')
@@ -525,8 +524,8 @@ def notification_runner():
if not n_object.get('notification_format') and datastore.data['settings']['application'].get('notification_format'):
n_object['notification_format'] = datastore.data['settings']['application'].get('notification_format')
if n_object.get('notification_urls', {}):
sent_obj = process_notification(n_object, datastore)
sent_obj = notification.process_notification(n_object, datastore)
except Exception as e:
logger.error(f"Watch URL: {n_object['watch_url']} Error {str(e)}")

View File

@@ -306,8 +306,8 @@ class ValidateAppRiseServers(object):
def __call__(self, form, field):
import apprise
from .notification.apprise_plugin.assets import apprise_asset
from .notification.apprise_plugin.custom_handlers import apprise_http_custom_handler # noqa: F401
from .apprise_plugin.assets import apprise_asset
from .apprise_plugin.custom_handlers import apprise_http_custom_handler # noqa: F401
apobj = apprise.Apprise(asset=apprise_asset)

View File

@@ -2,7 +2,7 @@ import os
import uuid
from changedetectionio import strtobool
default_notification_format_for_watch = 'System default'
from changedetectionio.notification import default_notification_format_for_watch
class watch_base(dict):

View File

@@ -1,17 +1,47 @@
import time
from apprise import NotifyFormat
import apprise
from loguru import logger
from .apprise_plugin.assets import apprise_asset, APPRISE_AVATAR_URL
from .apprise_plugin.assets import APPRISE_AVATAR_URL
from .apprise_plugin.custom_handlers import apprise_http_custom_handler # noqa: F401
from .safe_jinja import render as jinja_render
valid_tokens = {
'base_url': '',
'current_snapshot': '',
'diff': '',
'diff_added': '',
'diff_full': '',
'diff_patch': '',
'diff_removed': '',
'diff_url': '',
'preview_url': '',
'triggered_text': '',
'watch_tag': '',
'watch_title': '',
'watch_url': '',
'watch_uuid': '',
}
default_notification_format_for_watch = 'System default'
default_notification_format = 'HTML Color'
default_notification_body = '{{watch_url}} had a change.\n---\n{{diff}}\n---\n'
default_notification_title = 'ChangeDetection.io Notification - {{watch_url}}'
valid_notification_formats = {
'Text': NotifyFormat.TEXT,
'Markdown': NotifyFormat.MARKDOWN,
'HTML': NotifyFormat.HTML,
'HTML Color': 'htmlcolor',
# Used only for editing a watch (not for global)
default_notification_format_for_watch: default_notification_format_for_watch
}
def process_notification(n_object, datastore):
from changedetectionio.safe_jinja import render as jinja_render
from . import default_notification_format_for_watch, default_notification_format, valid_notification_formats
# be sure it's registered
from .apprise_plugin.custom_handlers import apprise_http_custom_handler
now = time.time()
if n_object.get('notification_timestamp'):
logger.trace(f"Time since queued {now-n_object['notification_timestamp']:.3f}s")
@@ -28,13 +58,14 @@ def process_notification(n_object, datastore):
# Initially text or whatever
n_format = datastore.data['settings']['application'].get('notification_format', valid_notification_formats[default_notification_format])
logger.trace(f"Complete notification body including Jinja and placeholders calculated in {time.time() - now:.2f}s")
logger.trace(f"Complete notification body including Jinja and placeholders calculated in {time.time() - now:.3f}s")
# https://github.com/caronc/apprise/wiki/Development_LogCapture
# Anything higher than or equal to WARNING (which covers things like Connection errors)
# raise it as an exception
sent_objs = []
from .apprise_plugin.assets import apprise_asset
if 'as_async' in n_object:
apprise_asset.async_mode = n_object.get('as_async')
@@ -145,7 +176,6 @@ def process_notification(n_object, datastore):
# ( Where we prepare the tokens in the notification to be replaced with actual values )
def create_notification_parameters(n_object, datastore):
from copy import deepcopy
from . import valid_tokens
# in the case we send a test notification from the main settings, there is no UUID.
uuid = n_object['uuid'] if 'uuid' in n_object else ''

View File

@@ -1,35 +0,0 @@
from changedetectionio.model import default_notification_format_for_watch
ult_notification_format_for_watch = 'System default'
default_notification_format = 'HTML Color'
default_notification_body = '{{watch_url}} had a change.\n---\n{{diff}}\n---\n'
default_notification_title = 'ChangeDetection.io Notification - {{watch_url}}'
# The values (markdown etc) are from apprise NotifyFormat,
# But to avoid importing the whole heavy module just use the same strings here.
valid_notification_formats = {
'Text': 'text',
'Markdown': 'markdown',
'HTML': 'html',
'HTML Color': 'htmlcolor',
# Used only for editing a watch (not for global)
default_notification_format_for_watch: default_notification_format_for_watch
}
valid_tokens = {
'base_url': '',
'current_snapshot': '',
'diff': '',
'diff_added': '',
'diff_full': '',
'diff_patch': '',
'diff_removed': '',
'diff_url': '',
'preview_url': '',
'triggered_text': '',
'watch_tag': '',
'watch_title': '',
'watch_url': '',
'watch_uuid': '',
}

View File

@@ -167,10 +167,7 @@ def test_check_notification(client, live_server, measure_memory_usage):
assert ':-)' in notification_submission
# Check the attachment was added, and that it is a JPEG from the original PNG
notification_submission_object = json.loads(notification_submission)
assert notification_submission_object
# We keep PNG screenshots for now
# IF THIS FAILS YOU SHOULD BE TESTING WITH ENV VAR REMOVE_REQUESTS_OLD_SCREENSHOTS=False
assert notification_submission_object['attachments'][0]['filename'] == 'last-screenshot.png'
assert len(notification_submission_object['attachments'][0]['base64'])
assert notification_submission_object['attachments'][0]['mimetype'] == 'image/png'

View File

@@ -109,6 +109,7 @@ class update_worker(threading.Thread):
default_notification_title
)
# Would be better if this was some kind of Object where Watch can reference the parent datastore etc
v = watch.get(var_name)
if v and not watch.get('notification_muted'):