Compare commits

...

7 Commits

Author SHA1 Message Date
dgtlmoon
3d2bc5049b Small safety catch 2025-04-12 18:40:15 +02:00
dgtlmoon
186016e605 Playwright + Puppeteer fix for when page is taller than viewport but less than step_size 2025-04-12 17:56:16 +02:00
dgtlmoon
3a583a4e5d Memory management - Run HTML to text in sub process, a few more cleanups to playwright (#3110) 2025-04-11 18:18:29 +02:00
dgtlmoon
cfb4decf67 UI Edit/Stats - Add levenshtein distance info, explains how "different" the last two snapshot are (#3109) 2025-04-11 17:36:29 +02:00
dgtlmoon
8067d5170b 0.49.13 2025-04-11 13:46:58 +02:00
Rob Mulder 
5551acf67d API - Added notifications API endpoints (#3103) 2025-04-11 13:43:59 +02:00
dgtlmoon
45a030bac6 Fetcher - Use bigger screenshot chunks to speed up page screenshot (#3107) 2025-04-11 13:42:50 +02:00
15 changed files with 449 additions and 146 deletions

View File

@@ -2,7 +2,7 @@
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
-__version__ = '0.49.12'
+__version__ = '0.49.13'
from changedetectionio.strtobool import strtobool
from json.decoder import JSONDecodeError

View File

@@ -0,0 +1,145 @@
from flask_expects_json import expects_json
from flask_restful import abort, Resource
from flask import request
from . import auth
from . import schema_create_notification_urls, schema_delete_notification_urls

class Notifications(Resource):
    def __init__(self, **kwargs):
        # datastore is a black box dependency
        self.datastore = kwargs['datastore']

    @auth.check_token
    def get(self):
        """
        @api {get} /api/v1/notifications Return Notification URL List
        @apiDescription Return the Notification URL List from the configuration
        @apiExample {curl} Example usage:
            curl http://localhost:5000/api/v1/notifications -H"x-api-key:813031b16330fe25e3780cf0325daa45"
            HTTP/1.0 200
            {
                'notification_urls': ["notification-urls-list"]
            }
        @apiName Get
        @apiGroup Notifications
        """
        notification_urls = self.datastore.data.get('settings', {}).get('application', {}).get('notification_urls', [])
        return {
            'notification_urls': notification_urls,
        }, 200

    @auth.check_token
    @expects_json(schema_create_notification_urls)
    def post(self):
        """
        @api {post} /api/v1/notifications Create Notification URLs
        @apiDescription Add one or more notification URLs to the configuration
        @apiExample {curl} Example usage:
            curl http://localhost:5000/api/v1/notifications -H"x-api-key:813031b16330fe25e3780cf0325daa45" -H "Content-Type: application/json" -d '{"notification_urls": ["url1", "url2"]}'
        @apiName CreateBatch
        @apiGroup Notifications
        @apiSuccess (201) {Object[]} notification_urls List of added notification URLs
        @apiError (400) {String} Invalid input
        """
        json_data = request.get_json()
        notification_urls = json_data.get("notification_urls", [])

        from wtforms import ValidationError
        try:
            validate_notification_urls(notification_urls)
        except ValidationError as e:
            return str(e), 400

        added_urls = []
        for url in notification_urls:
            clean_url = url.strip()
            added_url = self.datastore.add_notification_url(clean_url)
            if added_url:
                added_urls.append(added_url)

        if not added_urls:
            return "No valid notification URLs were added", 400

        return {'notification_urls': added_urls}, 201

    @auth.check_token
    @expects_json(schema_create_notification_urls)
    def put(self):
        """
        @api {put} /api/v1/notifications Replace Notification URLs
        @apiDescription Replace all notification URLs with the provided list (can be empty)
        @apiExample {curl} Example usage:
            curl -X PUT http://localhost:5000/api/v1/notifications -H"x-api-key:813031b16330fe25e3780cf0325daa45" -H "Content-Type: application/json" -d '{"notification_urls": ["url1", "url2"]}'
        @apiName Replace
        @apiGroup Notifications
        @apiSuccess (200) {Object[]} notification_urls List of current notification URLs
        @apiError (400) {String} Invalid input
        """
        json_data = request.get_json()
        notification_urls = json_data.get("notification_urls", [])

        if not isinstance(notification_urls, list):
            return "Invalid input format", 400

        from wtforms import ValidationError
        try:
            validate_notification_urls(notification_urls)
        except ValidationError as e:
            return str(e), 400

        clean_urls = [url.strip() for url in notification_urls if isinstance(url, str)]
        self.datastore.data['settings']['application']['notification_urls'] = clean_urls
        self.datastore.needs_write = True

        return {'notification_urls': clean_urls}, 200

    @auth.check_token
    @expects_json(schema_delete_notification_urls)
    def delete(self):
        """
        @api {delete} /api/v1/notifications Delete Notification URLs
        @apiDescription Deletes one or more notification URLs from the configuration
        @apiExample {curl} Example usage:
            curl http://localhost:5000/api/v1/notifications -X DELETE -H"x-api-key:813031b16330fe25e3780cf0325daa45" -H "Content-Type: application/json" -d '{"notification_urls": ["url1", "url2"]}'
        @apiParam {String[]} notification_urls The notification URLs to delete.
        @apiName Delete
        @apiGroup Notifications
        @apiSuccess (204) {String} OK Deleted
        @apiError (400) {String} No matching notification URLs found.
        """
        json_data = request.get_json()
        urls_to_delete = json_data.get("notification_urls", [])
        if not isinstance(urls_to_delete, list):
            abort(400, message="Expected a list of notification URLs.")

        notification_urls = self.datastore.data['settings']['application'].get('notification_urls', [])
        deleted = []

        for url in urls_to_delete:
            clean_url = url.strip()
            if clean_url in notification_urls:
                notification_urls.remove(clean_url)
                deleted.append(clean_url)

        if not deleted:
            abort(400, message="No matching notification URLs found.")

        self.datastore.data['settings']['application']['notification_urls'] = notification_urls
        self.datastore.needs_write = True

        return 'OK', 204

def validate_notification_urls(notification_urls):
    from changedetectionio.forms import ValidateAppRiseServers
    validator = ValidateAppRiseServers()

    class DummyForm:
        pass

    dummy_form = DummyForm()
    field = type("Field", (object,), {"data": notification_urls, "gettext": lambda self, x: x})()
    validator(dummy_form, field)

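A quick end-to-end sketch of the new endpoints above, assuming a local instance and a placeholder API key (the path, the x-api-key header, and the payload shape are taken from the apidoc comments in this file):

import requests

BASE = "http://localhost:5000/api/v1/notifications"
HEADERS = {"x-api-key": "YOUR-API-KEY", "Content-Type": "application/json"}

# Add two AppRise-style notification URLs
requests.post(BASE, json={"notification_urls": ["posts://example.com/notify1", "posts://example.com/notify2"]}, headers=HEADERS)

# List what is configured
print(requests.get(BASE, headers=HEADERS).json())  # {'notification_urls': [...]}

# Remove one of them again (a 400 comes back if nothing matched)
requests.delete(BASE, json={"notification_urls": ["posts://example.com/notify1"]}, headers=HEADERS)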
View File

@@ -19,8 +19,15 @@ schema_create_tag['required'] = ['title']
 schema_update_tag = copy.deepcopy(schema_tag)
 schema_update_tag['additionalProperties'] = False

+schema_notification_urls = copy.deepcopy(schema)
+schema_create_notification_urls = copy.deepcopy(schema_notification_urls)
+schema_create_notification_urls['required'] = ['notification_urls']
+schema_delete_notification_urls = copy.deepcopy(schema_notification_urls)
+schema_delete_notification_urls['required'] = ['notification_urls']

 # Import all API resources
 from .Watch import Watch, WatchHistory, WatchSingleHistory, CreateWatch
 from .Tags import Tags, Tag
 from .Import import Import
 from .SystemInfo import SystemInfo
+from .Notifications import Notifications

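The base `schema` object copied here is defined earlier in this file and is not part of the diff. For orientation only, a hypothetical minimal shape that would make the copies above behave as the endpoints expect (the real base schema may differ):

import copy

# Hypothetical stand-in for the module's base schema
schema = {
    "type": "object",
    "properties": {
        "notification_urls": {"type": "array", "items": {"type": "string"}},
    },
}

schema_create_notification_urls = copy.deepcopy(schema)
schema_create_notification_urls['required'] = ['notification_urls']  # POST/PUT bodies must include the key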
View File

@@ -19,6 +19,20 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
         if tag_uuid in watch.get('tags', []) and (tag.get('include_filters') or tag.get('subtractive_selectors')):
             return True

+    def levenshtein_ratio_recent_history(watch):
+        try:
+            from Levenshtein import distance
+            k = list(watch.history.keys())
+            if len(k) >= 2:
+                a = watch.get_history_snapshot(timestamp=k[0])
+                b = watch.get_history_snapshot(timestamp=k[1])
+                return distance(a, b)
+        except Exception as e:
+            logger.warning(f"Unable to calc similarity {str(e)}")
+            return "Unable to calc similarity"
+
+        return ''

 @edit_blueprint.route("/edit/<string:uuid>", methods=['GET', 'POST'])
 @login_optionally_required
 # https://stackoverflow.com/questions/42984453/wtforms-populate-form-with-data-if-data-exists
@edit_blueprint.route("/edit/<string:uuid>", methods=['GET', 'POST'])
@login_optionally_required
# https://stackoverflow.com/questions/42984453/wtforms-populate-form-with-data-if-data-exists
@@ -247,14 +261,15 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
         'has_default_notification_urls': True if len(datastore.data['settings']['application']['notification_urls']) else False,
         'has_extra_headers_file': len(datastore.get_all_headers_in_textfile_for_watch(uuid=uuid)) > 0,
         'has_special_tag_options': _watch_has_tag_options_set(watch=watch),
-        'watch_uses_webdriver': watch_uses_webdriver,
         'jq_support': jq_support,
+        'lev_info': levenshtein_ratio_recent_history(watch),
         'playwright_enabled': os.getenv('PLAYWRIGHT_DRIVER_URL', False),
         'settings_application': datastore.data['settings']['application'],
         'timezone_default_config': datastore.data['settings']['application'].get('timezone'),
         'using_global_webdriver_wait': not default['webdriver_delay'],
         'uuid': uuid,
-        'watch': watch
+        'watch': watch,
+        'watch_uses_webdriver': watch_uses_webdriver,
     }

     included_content = None

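For intuition about the number this surfaces on the edit page: the distance counts single-character edits between the two snapshot texts. A quick check with the same library the commit adds as a dependency:

from Levenshtein import distance

# 'kitten' -> 'sitting': substitute k->s, substitute e->i, append g
print(distance("kitten", "sitting"))  # 3

# On snapshots the inputs are the full extracted page text, so a small
# content change yields a correspondingly small distance
print(distance("price: 100", "price: 120"))  # 1 (one substitution)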
View File

@@ -96,7 +96,7 @@ def execute_ruleset_against_all_plugins(current_watch_uuid: str, application_dat
     ruleset_settings = application_datastruct['watching'].get(current_watch_uuid)

-    if ruleset_settings.get("conditions"):
+    if ruleset_settings and ruleset_settings.get("conditions"):
         logic_operator = "and" if ruleset_settings.get("conditions_match_logic", "ALL") == "ALL" else "or"
         complete_rules = filter_complete_rules(ruleset_settings['conditions'])
         if complete_rules:

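Why the added guard matters, in miniature (standalone sketch, not repo code): dict.get returns None for an unknown uuid, and the old code would then raise instead of skipping the watch.

watching = {}
ruleset_settings = watching.get("deleted-uuid")  # -> None for an unknown uuid
# Old: ruleset_settings.get("conditions") raises AttributeError on None
# New: the `and` short-circuits, so the block is skipped safely
if ruleset_settings and ruleset_settings.get("conditions"):
    pass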
View File

@@ -7,13 +7,13 @@ import os
 # Visual Selector scraper - 'Button' is there because some sites have <button>OUT OF STOCK</button>.
 visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,button'

-SCREENSHOT_MAX_HEIGHT_DEFAULT = 16000
+SCREENSHOT_MAX_HEIGHT_DEFAULT = 20000
 SCREENSHOT_DEFAULT_QUALITY = 40

 # Maximum total height for the final image (When in stitch mode).
 # We limit this to 16000px due to the huge amount of RAM that was being used
 # Example: 16000 × 1400 × 3 = 67,200,000 bytes ≈ 64.1 MB (not including buffers in PIL etc)
-MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
+SCREENSHOT_MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))

 # The size at which we will switch to stitching method, when below this (and
 # MAX_TOTAL_HEIGHT which can be set by a user) we will use the default

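Updating the comment's arithmetic for the new 20000px default, under the same assumptions (raw RGB, a roughly 1400px-wide viewport, no PIL or JPEG buffers counted):

# 20000 × 1400 × 3 = 84,000,000 bytes ≈ 80.1 MB for the stitched RGB canvas alone
print(20000 * 1400 * 3)  # 84000000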
View File

@@ -5,13 +5,10 @@ from urllib.parse import urlparse
 from loguru import logger

 from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
-    SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
-from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
+    SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_MAX_TOTAL_HEIGHT, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
 from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
 from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable

 def capture_full_page(page):
     import os
     import time
@@ -20,84 +17,61 @@ def capture_full_page(page):
     start = time.time()

     page_height = page.evaluate("document.documentElement.scrollHeight")
     page_width = page.evaluate("document.documentElement.scrollWidth")
+    original_viewport = page.viewport_size

-    logger.debug(f"Playwright viewport size {page.viewport_size}")
+    logger.debug(f"Playwright viewport size {page.viewport_size} page height {page_height} page width {page_width}")

-    ############################################################
-    #### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) #####
-    ############################################################
-    # Optimization to avoid unnecessary stitching if we can avoid it
-    # Use the default screenshot method for smaller pages to take advantage
-    # of GPU and native playwright screenshot optimizations
-    # - No PIL needed here, no danger of memory leaks, no sub process required
-    if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ):
-        logger.debug("Using default screenshot method")
-        page.request_gc()
-        screenshot = page.screenshot(
-            type="jpeg",
-            quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
-            full_page=True,
-        )
-        page.request_gc()
-        logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")
-        return screenshot

+    # Use an approach similar to puppeteer: set a larger viewport and take screenshots in chunks
+    step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD  # Size that won't cause GPU to overflow
+    screenshot_chunks = []
+    y = 0
+    if page_height > page.viewport_size['height']:
+        if page_height < step_size:
+            step_size = page_height  # Incase page is bigger than default viewport but smaller than proposed step size
+        logger.debug(f"Setting bigger viewport to step through large page width W{page.viewport_size['width']}xH{step_size} because page_height > viewport_size")
+        # Set viewport to a larger size to capture more content at once
+        page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size})

+    # Capture screenshots in chunks up to the max total height
+    while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
+        page.request_gc()
+        page.evaluate(f"window.scrollTo(0, {y})")
+        page.request_gc()
+        screenshot_chunks.append(page.screenshot(
+            type="jpeg",
+            full_page=False,
+            quality=int(os.getenv("SCREENSHOT_QUALITY", 72))
+        ))
+        y += step_size
+        page.request_gc()

+    # Restore original viewport size
+    page.set_viewport_size({'width': original_viewport['width'], 'height': original_viewport['height']})

+    # If we have multiple chunks, stitch them together
+    if len(screenshot_chunks) > 1:
+        from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
+        logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
+        parent_conn, child_conn = Pipe()
+        p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))
+        p.start()
+        screenshot = parent_conn.recv_bytes()
+        p.join()
+        logger.debug(
+            f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
+        # Explicit cleanup
+        del screenshot_chunks
+        del p
+        del parent_conn, child_conn
+        screenshot_chunks = None
+        return screenshot

-    ###################################################################################
-    #### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES #####
-    ###################################################################################
-    # - PIL can easily allocate memory and not release it cleanly
-    # - Fetching screenshot from playwright seems OK
-    # Image.new is leaky even with .close()
-    # So lets prepare all the data chunks and farm it out to a subprocess for clean memory handling
-    logger.debug(
-        "Using stitching method for large screenshot because page height exceeds threshold"
-    )
-    # Limit the total capture height
-    capture_height = min(page_height, MAX_TOTAL_HEIGHT)
-    # Calculate number of chunks needed using ORIGINAL viewport height
-    num_chunks = (capture_height + page.viewport_size['height'] - 1) // page.viewport_size['height']
-    screenshot_chunks = []
-    # Track cumulative paste position
-    y_offset = 0
-    for _ in range(num_chunks):
-        page.request_gc()
-        page.evaluate(f"window.scrollTo(0, {y_offset})")
-        page.request_gc()
-        h = min(page.viewport_size['height'], capture_height - y_offset)
-        screenshot_chunks.append(page.screenshot(
-            type="jpeg",
-            clip={
-                "x": 0,
-                "y": 0,
-                "width": page.viewport_size['width'],
-                "height": h,
-            },
-            quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
-        ))
-        y_offset += h  # maybe better to inspect the image here?
-        page.request_gc()
-    # PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling
-    parent_conn, child_conn = Pipe()
-    p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height))
-    p.start()
-    result = parent_conn.recv_bytes()
-    p.join()
-    screenshot_chunks = None
-    logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s")
-    return result

+    logger.debug(
+        f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
+    return screenshot_chunks[0]
class fetcher(Fetcher):
@@ -292,6 +266,7 @@ class fetcher(Fetcher):
             self.page.request_gc()

             self.content = self.page.content()
             self.page.request_gc()
+            logger.debug(f"Scrape xPath element data in browser done in {time.time() - now:.2f}s")

             # Bug 3 in Playwright screenshot handling
@@ -316,5 +291,28 @@ class fetcher(Fetcher):
                     pass

         # Clean up resources properly
-        context.close()
-        browser.close()
+        try:
+            self.page.request_gc()
+        except:
+            pass
+
+        try:
+            self.page.close()
+        except:
+            pass
+        self.page = None
+
+        try:
+            context.close()
+        except:
+            pass
+        context = None
+
+        try:
+            browser.close()
+        except:
+            pass
+        browser = None

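The chunked path above hands the paste-and-encode work to a short-lived child process. A minimal sketch of what a worker like stitch_images_worker could look like; the real implementation lives in screenshot_handler.py and is not shown in this compare, so the PIL usage, output quality, and clipping behaviour here are assumptions:

from io import BytesIO
from multiprocessing import Pipe, Process
from PIL import Image

def stitch_worker(conn, jpeg_chunks, page_height, max_height):
    # Decode the JPEG chunks and paste them top-to-bottom onto one canvas,
    # clamped to max_height, then send the re-encoded JPEG back over the pipe
    images = [Image.open(BytesIO(c)) for c in jpeg_chunks]
    canvas = Image.new("RGB", (images[0].width, min(page_height, max_height)))
    y = 0
    for im in images:
        canvas.paste(im, (0, y))  # pastes past the bottom edge are clipped by PIL
        y += im.height
    out = BytesIO()
    canvas.save(out, format="JPEG", quality=40)
    conn.send_bytes(out.getvalue())
    conn.close()

Because the child exits after one job, whatever memory PIL held is returned to the OS when it dies, which is the point of the subprocess hop.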
View File

@@ -7,10 +7,11 @@ from urllib.parse import urlparse
 from loguru import logger

 from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
-    SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
+    SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS, \
+    SCREENSHOT_MAX_TOTAL_HEIGHT
 from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
-from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
-from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
+from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, \
+    BrowserConnectError

 # Bug 3 in Playwright screenshot handling
@@ -27,71 +28,54 @@ async def capture_full_page(page):
     start = time.time()

     page_height = await page.evaluate("document.documentElement.scrollHeight")
     page_width = await page.evaluate("document.documentElement.scrollWidth")
+    original_viewport = page.viewport

-    logger.debug(f"Puppeteer viewport size {page.viewport}")
+    logger.debug(f"Puppeteer viewport size {page.viewport} page height {page_height} page width {page_width}")

-    ############################################################
-    #### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) #####
-    ############################################################
-    # Bug 3 in Playwright screenshot handling
-    # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
-    # JPEG is better here because the screenshots can be very very large
-    # Optimization to avoid unnecessary stitching if we can avoid it
-    # Use the default screenshot method for smaller pages to take advantage
-    # of GPU and native playwright screenshot optimizations
-    # - No PIL needed here, no danger of memory leaks, no sub process required
-    if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ):
-        logger.debug("Using default screenshot method")
-        await page.evaluate(f"window.scrollTo(0, 0)")
-        screenshot = await page.screenshot(
-            type_="jpeg",
-            quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
-            fullPage=True,
-        )
-        logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")
-        return screenshot

+    # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
+    # which will significantly increase the IO size between the server and client, it's recommended to use the lowest
+    # acceptable screenshot quality here
+    step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD  # Something that will not cause the GPU to overflow when taking the screenshot
+    screenshot_chunks = []
+    y = 0
+    if page_height > page.viewport['height']:
+        if page_height < step_size:
+            step_size = page_height  # Incase page is bigger than default viewport but smaller than proposed step size
+        await page.setViewport({'width': page.viewport['width'], 'height': step_size})

+    while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
+        await page.evaluate(f"window.scrollTo(0, {y})")
+        screenshot_chunks.append(await page.screenshot(type_='jpeg',
+                                                       fullPage=False,
+                                                       quality=int(os.getenv("SCREENSHOT_QUALITY", 72))))
+        y += step_size

+    await page.setViewport({'width': original_viewport['width'], 'height': original_viewport['height']})

+    if len(screenshot_chunks) > 1:
+        from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
+        logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
+        parent_conn, child_conn = Pipe()
+        p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))
+        p.start()
+        screenshot = parent_conn.recv_bytes()
+        p.join()
+        logger.debug(
+            f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
+        screenshot_chunks = None
+        return screenshot

-    ###################################################################################
-    #### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES #####
-    ###################################################################################
-    # - PIL can easily allocate memory and not release it cleanly
-    # - Fetching screenshot from playwright seems OK
-    # Image.new is leaky even with .close()
-    # So lets prepare all the data chunks and farm it out to a subprocess for clean memory handling
-    logger.debug(
-        "Using stitching method for large screenshot because page height exceeds threshold"
-    )
-    # Limit the total capture height
-    capture_height = min(page_height, MAX_TOTAL_HEIGHT)
-    # Calculate number of chunks needed using ORIGINAL viewport height
-    num_chunks = (capture_height + page.viewport['height'] - 1) // page.viewport['height']
-    screenshot_chunks = []
-    # Track cumulative paste position
-    y_offset = 0
-    for _ in range(num_chunks):
-        await page.evaluate(f"window.scrollTo(0, {y_offset})")
-        h = min(page.viewport['height'], capture_height - y_offset)
-        screenshot_chunks.append(await page.screenshot(
-            type_="jpeg",
-            quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
-        ))
-        y_offset += h  # maybe better to inspect the image here?
-    # PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling
-    parent_conn, child_conn = Pipe()
-    p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height))
-    p.start()
-    result = parent_conn.recv_bytes()
-    p.join()
-    screenshot_chunks = None
-    logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s")
-    return result

+    logger.debug(
+        f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
+    return screenshot_chunks[0]
class fetcher(Fetcher):

View File

@@ -33,7 +33,7 @@ from loguru import logger
from changedetectionio import __version__
from changedetectionio import queuedWatchMetaData
-from changedetectionio.api import Watch, WatchHistory, WatchSingleHistory, CreateWatch, Import, SystemInfo, Tag, Tags
+from changedetectionio.api import Watch, WatchHistory, WatchSingleHistory, CreateWatch, Import, SystemInfo, Tag, Tags, Notifications
from changedetectionio.api.Search import Search
from .time_handler import is_within_schedule
@@ -285,7 +285,8 @@ def changedetection_app(config=None, datastore_o=None):
     watch_api.add_resource(Search, '/api/v1/search',
                            resource_class_kwargs={'datastore': datastore})

+    watch_api.add_resource(Notifications, '/api/v1/notifications',
+                           resource_class_kwargs={'datastore': datastore})
@login_manager.user_loader
def user_loader(email):

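For context, resource_class_kwargs is stock Flask-RESTful: the mapping is passed into the resource's __init__, which is how the Notifications resource above receives its datastore. A standalone sketch of the same wiring (names here are illustrative, not from the repo):

from flask import Flask
from flask_restful import Api, Resource

class Hello(Resource):
    def __init__(self, **kwargs):
        self.datastore = kwargs['datastore']  # injected via resource_class_kwargs

    def get(self):
        return {'keys': list(self.datastore)}

app = Flask(__name__)
api = Api(app)
api.add_resource(Hello, '/hello', resource_class_kwargs={'datastore': {'a': 1}})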
View File

@@ -435,7 +435,9 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
     return re.sub(pattern, repl, html_content)

-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str:
+def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
     from inscriptis import get_text
     from inscriptis.model.config import ParserConfig

@@ -470,9 +472,19 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
         html_content = re.sub(r'</title>', r'</h1>', html_content)

     text_content = get_text(html_content, config=parser_config)
+    conn.send(text_content)
+    conn.close()

-    return text_content
+# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
+    from multiprocessing import Process, Pipe
+    parent_conn, child_conn = Pipe()
+    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
+    p.start()
+    text = parent_conn.recv()
+    p.join()
+    return text
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content):

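The pattern above generalizes: run the leak-prone call in a throwaway child and collect the result over a Pipe. A hedged, generic sketch (run_isolated is illustrative, not a repo helper; f must be a top-level, picklable function under the spawn start method):

from multiprocessing import Pipe, Process

def _isolated_worker(conn, f, args):
    conn.send(f(*args))  # the result travels back over the pipe
    conn.close()

def run_isolated(f, *args):
    # Run f(*args) in a child process; any native-library leak dies with it
    parent_conn, child_conn = Pipe()
    p = Process(target=_isolated_worker, args=(child_conn, f, args))
    p.start()
    result = parent_conn.recv()
    p.join()
    return result

print(run_isolated(len, "hello"))  # 5

The cost is a process start plus pickling the payload on every call, which the commit accepts in exchange for a flat memory profile in the long-lived parent.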
View File

@@ -964,3 +964,25 @@ class ChangeDetectionStore:
                    f_d.write(zlib.compress(f_j.read()))
                os.unlink(json_path)

    def add_notification_url(self, notification_url):
        logger.debug(f">>> Adding new notification_url - '{notification_url}'")

        notification_urls = self.data['settings']['application'].get('notification_urls', [])
        if notification_url in notification_urls:
            return notification_url

        with self.lock:
            notification_urls = self.__data['settings']['application'].get('notification_urls', [])
            if notification_url in notification_urls:
                return notification_url

            # Append and update the datastore
            notification_urls.append(notification_url)
            self.__data['settings']['application']['notification_urls'] = notification_urls
            self.needs_write = True

        return notification_url

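add_notification_url uses a check, lock, re-check sequence; the second lookup inside the lock is what keeps two racing API requests from appending the same URL twice. The same shape in a minimal, self-contained form (illustrative class, not repo code):

import threading

class UrlStore:
    def __init__(self):
        self.lock = threading.Lock()
        self.urls = []

    def add(self, url):
        if url in self.urls:          # cheap unlocked fast path
            return url
        with self.lock:
            if url in self.urls:      # re-check: another thread may have won the race
                return url
            self.urls.append(url)     # mutate only while holding the lock
        return url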
View File

@@ -443,6 +443,10 @@ Math: {{ 1 + 1 }}") }}
         </tr>
     </tbody>
 </table>
+<h4>Text similarity</h4>
+<p><strong>Levenshtein Distance</strong> - Last 2 snapshots: {{ lev_info }}</p>
+<p style="max-width: 80%; font-size: 80%"><strong>Levenshtein Distance</strong>: calculates the minimum number of insertions, deletions, and substitutions required to change one text into the other.</p>
 {% if watch.history_n %}
 <p>
     <a href="{{url_for('ui.ui_edit.watch_get_latest_html', uuid=uuid)}}" class="pure-button button-small">Download latest HTML snapshot</a>

View File

@@ -0,0 +1,108 @@
#!/usr/bin/env python3

import json
from flask import url_for
from .util import live_server_setup


def test_api_notifications_crud(client, live_server):
    live_server_setup(live_server)
    api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')

    # Confirm notifications are initially empty
    res = client.get(
        url_for("notifications"),
        headers={'x-api-key': api_key}
    )
    assert res.status_code == 200
    assert res.json == {"notification_urls": []}

    # Add notification URLs
    test_urls = ["posts://example.com/notify1", "posts://example.com/notify2"]
    res = client.post(
        url_for("notifications"),
        data=json.dumps({"notification_urls": test_urls}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 201
    for url in test_urls:
        assert url in res.json["notification_urls"]

    # Confirm the notification URLs were added
    res = client.get(
        url_for("notifications"),
        headers={'x-api-key': api_key}
    )
    assert res.status_code == 200
    for url in test_urls:
        assert url in res.json["notification_urls"]

    # Delete one notification URL
    res = client.delete(
        url_for("notifications"),
        data=json.dumps({"notification_urls": [test_urls[0]]}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 204

    # Confirm it was removed and the other remains
    res = client.get(
        url_for("notifications"),
        headers={'x-api-key': api_key}
    )
    assert res.status_code == 200
    assert test_urls[0] not in res.json["notification_urls"]
    assert test_urls[1] in res.json["notification_urls"]

    # Try deleting a non-existent URL
    res = client.delete(
        url_for("notifications"),
        data=json.dumps({"notification_urls": ["posts://nonexistent.com"]}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 400

    res = client.post(
        url_for("notifications"),
        data=json.dumps({"notification_urls": test_urls}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 201

    # Replace with a new list
    replacement_urls = ["posts://new.example.com"]
    res = client.put(
        url_for("notifications"),
        data=json.dumps({"notification_urls": replacement_urls}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 200
    assert res.json["notification_urls"] == replacement_urls

    # Replace with an empty list
    res = client.put(
        url_for("notifications"),
        data=json.dumps({"notification_urls": []}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 200
    assert res.json["notification_urls"] == []

    # Provide an invalid AppRise URL to trigger validation error
    invalid_urls = ["ftp://not-app-rise"]
    res = client.post(
        url_for("notifications"),
        data=json.dumps({"notification_urls": invalid_urls}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 400
    assert "is not a valid AppRise URL." in res.data.decode()

    res = client.put(
        url_for("notifications"),
        data=json.dumps({"notification_urls": invalid_urls}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 400
    assert "is not a valid AppRise URL." in res.data.decode()

View File

@@ -74,6 +74,11 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
     res = client.get(url_for("ui.ui_edit.watch_get_latest_html", uuid=uuid))
     assert b'which has this one new line' in res.data

+    # Check the 'levenshtein' distance calc showed something useful
+    res = client.get(url_for("ui.ui_edit.edit_page", uuid=uuid))
+    assert b'Last 2 snapshots: 17' in res.data

     # Now something should be ready, indicated by having a 'unviewed' class
     res = client.get(url_for("watchlist.index"))
     assert b'unviewed' in res.data

View File

@@ -68,6 +68,8 @@ openpyxl
 jq~=1.3; python_version >= "3.8" and sys_platform == "darwin"
 jq~=1.3; python_version >= "3.8" and sys_platform == "linux"

+levenshtein

 # playwright is installed at Dockerfile build time because it's not available on all platforms
 pyppeteer-ng==2.0.0rc9