Mirror of https://github.com/dgtlmoon/changedetection.io.git (synced 2025-10-31 06:37:41 +00:00)

Compare commits: puppeteer-...playwright (7 commits)
Commits:

- 3d2bc5049b
- 186016e605
- 3a583a4e5d
- cfb4decf67
- 8067d5170b
- 5551acf67d
- 45a030bac6
changedetectionio/__init__.py

@@ -2,7 +2,7 @@

 # Read more https://github.com/dgtlmoon/changedetection.io/wiki

-__version__ = '0.49.12'
+__version__ = '0.49.13'

 from changedetectionio.strtobool import strtobool
 from json.decoder import JSONDecodeError

changedetectionio/api/Notifications.py (new file, 145 lines added)

@@ -0,0 +1,145 @@
from flask import request
from flask_expects_json import expects_json
from flask_restful import abort, Resource
from . import auth
from . import schema_create_notification_urls, schema_delete_notification_urls


class Notifications(Resource):
    def __init__(self, **kwargs):
        # datastore is a black box dependency
        self.datastore = kwargs['datastore']

    @auth.check_token
    def get(self):
        """
        @api {get} /api/v1/notifications Return Notification URL List
        @apiDescription Return the Notification URL List from the configuration
        @apiExample {curl} Example usage:
            curl http://localhost:5000/api/v1/notifications -H"x-api-key:813031b16330fe25e3780cf0325daa45"
            HTTP/1.0 200
            {
                'notification_urls': ["notification-urls-list"]
            }
        @apiName Get
        @apiGroup Notifications
        """

        notification_urls = self.datastore.data.get('settings', {}).get('application', {}).get('notification_urls', [])

        return {
            'notification_urls': notification_urls,
        }, 200

    @auth.check_token
    @expects_json(schema_create_notification_urls)
    def post(self):
        """
        @api {post} /api/v1/notifications Create Notification URLs
        @apiDescription Add one or more notification URLs to the configuration
        @apiExample {curl} Example usage:
            curl http://localhost:5000/api/v1/notifications -H"x-api-key:813031b16330fe25e3780cf0325daa45" -H "Content-Type: application/json" -d '{"notification_urls": ["url1", "url2"]}'
        @apiName CreateBatch
        @apiGroup Notifications
        @apiSuccess (201) {Object[]} notification_urls List of added notification URLs
        @apiError (400) {String} Invalid input
        """

        json_data = request.get_json()
        notification_urls = json_data.get("notification_urls", [])

        from wtforms import ValidationError
        try:
            validate_notification_urls(notification_urls)
        except ValidationError as e:
            return str(e), 400

        added_urls = []

        for url in notification_urls:
            clean_url = url.strip()
            added_url = self.datastore.add_notification_url(clean_url)
            if added_url:
                added_urls.append(added_url)

        if not added_urls:
            return "No valid notification URLs were added", 400

        return {'notification_urls': added_urls}, 201

    @auth.check_token
    @expects_json(schema_create_notification_urls)
    def put(self):
        """
        @api {put} /api/v1/notifications Replace Notification URLs
        @apiDescription Replace all notification URLs with the provided list (can be empty)
        @apiExample {curl} Example usage:
            curl -X PUT http://localhost:5000/api/v1/notifications -H"x-api-key:813031b16330fe25e3780cf0325daa45" -H "Content-Type: application/json" -d '{"notification_urls": ["url1", "url2"]}'
        @apiName Replace
        @apiGroup Notifications
        @apiSuccess (200) {Object[]} notification_urls List of current notification URLs
        @apiError (400) {String} Invalid input
        """
        json_data = request.get_json()
        notification_urls = json_data.get("notification_urls", [])

        # Check the shape first, then validate each entry
        if not isinstance(notification_urls, list):
            return "Invalid input format", 400

        from wtforms import ValidationError
        try:
            validate_notification_urls(notification_urls)
        except ValidationError as e:
            return str(e), 400

        clean_urls = [url.strip() for url in notification_urls if isinstance(url, str)]
        self.datastore.data['settings']['application']['notification_urls'] = clean_urls
        self.datastore.needs_write = True

        return {'notification_urls': clean_urls}, 200

    @auth.check_token
    @expects_json(schema_delete_notification_urls)
    def delete(self):
        """
        @api {delete} /api/v1/notifications Delete Notification URLs
        @apiDescription Deletes one or more notification URLs from the configuration
        @apiExample {curl} Example usage:
            curl http://localhost:5000/api/v1/notifications -X DELETE -H"x-api-key:813031b16330fe25e3780cf0325daa45" -H "Content-Type: application/json" -d '{"notification_urls": ["url1", "url2"]}'
        @apiParam {String[]} notification_urls The notification URLs to delete.
        @apiName Delete
        @apiGroup Notifications
        @apiSuccess (204) {String} OK Deleted
        @apiError (400) {String} No matching notification URLs found.
        """

        json_data = request.get_json()
        urls_to_delete = json_data.get("notification_urls", [])
        if not isinstance(urls_to_delete, list):
            abort(400, message="Expected a list of notification URLs.")

        notification_urls = self.datastore.data['settings']['application'].get('notification_urls', [])
        deleted = []

        for url in urls_to_delete:
            clean_url = url.strip()
            if clean_url in notification_urls:
                notification_urls.remove(clean_url)
                deleted.append(clean_url)

        if not deleted:
            abort(400, message="No matching notification URLs found.")

        self.datastore.data['settings']['application']['notification_urls'] = notification_urls
        self.datastore.needs_write = True

        return 'OK', 204


def validate_notification_urls(notification_urls):
    from changedetectionio.forms import ValidateAppRiseServers
    validator = ValidateAppRiseServers()

    class DummyForm:
        pass

    dummy_form = DummyForm()
    # Minimal stand-in for a WTForms field object
    field = type("Field", (object,), {"data": notification_urls, "gettext": lambda self, x: x})()
    validator(dummy_form, field)
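Taken together, the methods above give a small CRUD surface over the global notification URL list. A minimal client-side sketch for reference (assumptions: a local instance on port 5000 and the placeholder API key from the docstrings; `requests` is used purely for illustration and is not implied to be a project dependency):

import requests

BASE = "http://localhost:5000/api/v1/notifications"
HEADERS = {"x-api-key": "813031b16330fe25e3780cf0325daa45"}

# POST adds to the list, PUT replaces it wholesale, DELETE removes matching entries
requests.post(BASE, json={"notification_urls": ["json://example.com/hook"]}, headers=HEADERS)
print(requests.get(BASE, headers=HEADERS).json())   # -> {'notification_urls': [...]}
requests.put(BASE, json={"notification_urls": ["posts://example.com/notify"]}, headers=HEADERS)
requests.delete(BASE, json={"notification_urls": ["posts://example.com/notify"]}, headers=HEADERS)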
changedetectionio/api/__init__.py

@@ -19,8 +19,15 @@ schema_create_tag['required'] = ['title']

 schema_update_tag = copy.deepcopy(schema_tag)
 schema_update_tag['additionalProperties'] = False

+schema_notification_urls = copy.deepcopy(schema)
+schema_create_notification_urls = copy.deepcopy(schema_notification_urls)
+schema_create_notification_urls['required'] = ['notification_urls']
+schema_delete_notification_urls = copy.deepcopy(schema_notification_urls)
+schema_delete_notification_urls['required'] = ['notification_urls']
+
 # Import all API resources
 from .Watch import Watch, WatchHistory, WatchSingleHistory, CreateWatch
 from .Tags import Tags, Tag
 from .Import import Import
 from .SystemInfo import SystemInfo
+from .Notifications import Notifications

changedetectionio/blueprint/ui/edit.py

@@ -19,6 +19,20 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
             if tag_uuid in watch.get('tags', []) and (tag.get('include_filters') or tag.get('subtractive_selectors')):
                 return True

+    def levenshtein_ratio_recent_history(watch):
+        try:
+            from Levenshtein import distance
+            k = list(watch.history.keys())
+            if len(k) >= 2:
+                a = watch.get_history_snapshot(timestamp=k[0])
+                b = watch.get_history_snapshot(timestamp=k[1])
+                # Don't shadow the imported name; compute and return the edit distance
+                return distance(a, b)
+        except Exception as e:
+            logger.warning(f"Unable to calc similarity: {e}")
+            return "Unable to calc similarity"
+        return ''
+
     @edit_blueprint.route("/edit/<string:uuid>", methods=['GET', 'POST'])
     @login_optionally_required
     # https://stackoverflow.com/questions/42984453/wtforms-populate-form-with-data-if-data-exists
@@ -247,14 +261,15 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
                 'has_default_notification_urls': True if len(datastore.data['settings']['application']['notification_urls']) else False,
                 'has_extra_headers_file': len(datastore.get_all_headers_in_textfile_for_watch(uuid=uuid)) > 0,
                 'has_special_tag_options': _watch_has_tag_options_set(watch=watch),
-                'watch_uses_webdriver': watch_uses_webdriver,
                 'jq_support': jq_support,
+                'lev_info': levenshtein_ratio_recent_history(watch),
                 'playwright_enabled': os.getenv('PLAYWRIGHT_DRIVER_URL', False),
                 'settings_application': datastore.data['settings']['application'],
                 'timezone_default_config': datastore.data['settings']['application'].get('timezone'),
                 'using_global_webdriver_wait': not default['webdriver_delay'],
                 'uuid': uuid,
-                'watch': watch
+                'watch': watch,
+                'watch_uses_webdriver': watch_uses_webdriver,
             }

             included_content = None

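The new `lev_info` template variable surfaces the Levenshtein distance between two stored snapshots. As a quick illustration of what the metric means, using the same `Levenshtein` package the hunk above imports:

from Levenshtein import distance

# Minimum number of single-character insertions, deletions and substitutions
assert distance("kitten", "sitting") == 3   # k->s, e->i, append g
assert distance("same", "same") == 0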
changedetectionio/conditions/__init__.py

@@ -96,7 +96,7 @@ def execute_ruleset_against_all_plugins(current_watch_uuid: str, application_dat

     ruleset_settings = application_datastruct['watching'].get(current_watch_uuid)

-    if ruleset_settings.get("conditions"):
+    if ruleset_settings and ruleset_settings.get("conditions"):
         logic_operator = "and" if ruleset_settings.get("conditions_match_logic", "ALL") == "ALL" else "or"
         complete_rules = filter_complete_rules(ruleset_settings['conditions'])
         if complete_rules:

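A one-line change, but worth spelling out: `dict.get()` returns `None` for an unknown watch UUID, and calling `.get()` on that `None` raises `AttributeError`. A minimal reproduction of the crash the extra guard prevents:

watching = {}                                   # no watches configured
ruleset_settings = watching.get("some-uuid")    # -> None

# Old code: ruleset_settings.get("conditions")  # AttributeError: 'NoneType' object has no attribute 'get'
if ruleset_settings and ruleset_settings.get("conditions"):
    pass                                        # only reached for a real watch with conditions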
changedetectionio/content_fetchers/__init__.py

@@ -7,13 +7,13 @@ import os

 # Visual Selector scraper - 'Button' is there because some sites have <button>OUT OF STOCK</button>.
 visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,button'

-SCREENSHOT_MAX_HEIGHT_DEFAULT = 16000
+SCREENSHOT_MAX_HEIGHT_DEFAULT = 20000
 SCREENSHOT_DEFAULT_QUALITY = 40

 # Maximum total height for the final image (when in stitch mode).
 # We limit this due to the huge amount of RAM that was being used.
 # Example: 20000 × 1400 × 3 = 84,000,000 bytes ≈ 80.1 MB (not including buffers in PIL etc)
-MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))
+SCREENSHOT_MAX_TOTAL_HEIGHT = int(os.getenv("SCREENSHOT_MAX_HEIGHT", SCREENSHOT_MAX_HEIGHT_DEFAULT))

 # The size at which we will switch to stitching method, when below this (and
 # SCREENSHOT_MAX_TOTAL_HEIGHT which can be set by a user) we will use the default

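The default ceiling rises from 16000px to 20000px, and the renamed `SCREENSHOT_MAX_TOTAL_HEIGHT` still honours the `SCREENSHOT_MAX_HEIGHT` environment variable. A sketch of the memory arithmetic behind the comment above (the 1400px width is the example value from the comment, not a fixed constant):

# Approximate RAM for one stitched RGB canvas at a given capture height
def stitched_canvas_mb(height_px, width_px=1400, bytes_per_px=3):
    return height_px * width_px * bytes_per_px / 1024 ** 2

print(f"{stitched_canvas_mb(20000):.1f} MB")  # ~80.1 MB at the new default
print(f"{stitched_canvas_mb(16000):.1f} MB")  # ~64.1 MB at the old default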
changedetectionio/content_fetchers/playwright.py

@@ -5,13 +5,10 @@ from urllib.parse import urlparse
 from loguru import logger

 from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
-    SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
-from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
+    SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_MAX_TOTAL_HEIGHT, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
 from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
 from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable

-
-
 def capture_full_page(page):
     import os
     import time
@@ -20,84 +17,61 @@ def capture_full_page(page):
     start = time.time()

     page_height = page.evaluate("document.documentElement.scrollHeight")
+    page_width = page.evaluate("document.documentElement.scrollWidth")
+    original_viewport = page.viewport_size

-    logger.debug(f"Playwright viewport size {page.viewport_size}")
+    logger.debug(f"Playwright viewport size {page.viewport_size} page height {page_height} page width {page_width}")

-    ############################################################
-    #### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) #####
-    ############################################################
+    # Use an approach similar to puppeteer: set a larger viewport and take screenshots in chunks
+    step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Size that won't cause GPU to overflow
+    screenshot_chunks = []
+    y = 0

-    # Optimization to avoid unnecessary stitching if we can avoid it
-    # Use the default screenshot method for smaller pages to take advantage
-    # of GPU and native playwright screenshot optimizations
-    # - No PIL needed here, no danger of memory leaks, no sub process required
-    if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ):
-        logger.debug("Using default screenshot method")
+    if page_height > page.viewport_size['height']:
+        if page_height < step_size:
+            step_size = page_height # Incase page is bigger than default viewport but smaller than proposed step size
+        logger.debug(f"Setting bigger viewport to step through large page width W{page.viewport_size['width']}xH{step_size} because page_height > viewport_size")
+        # Set viewport to a larger size to capture more content at once
+        page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size})

+    # Capture screenshots in chunks up to the max total height
+    while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
         page.request_gc()
-        screenshot = page.screenshot(
+        page.evaluate(f"window.scrollTo(0, {y})")
+        page.request_gc()
+        screenshot_chunks.append(page.screenshot(
             type="jpeg",
-            quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
-            full_page=True,
-        )
+            full_page=False,
+            quality=int(os.getenv("SCREENSHOT_QUALITY", 72))
+        ))
+        y += step_size
         page.request_gc()
-        logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")

+    # Restore original viewport size
+    page.set_viewport_size({'width': original_viewport['width'], 'height': original_viewport['height']})

+    # If we have multiple chunks, stitch them together
+    if len(screenshot_chunks) > 1:
+        from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
+        logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
+        parent_conn, child_conn = Pipe()
+        p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))
+        p.start()
+        screenshot = parent_conn.recv_bytes()
+        p.join()
+        logger.debug(
+            f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
+        # Explicit cleanup
+        del screenshot_chunks
+        del p
+        del parent_conn, child_conn
+        screenshot_chunks = None
+        return screenshot

-    ###################################################################################
-    #### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES  #####
-    ###################################################################################
-    # - PIL can easily allocate memory and not release it cleanly
-    # - Fetching screenshot from playwright seems OK
-    # Image.new is leaky even with .close()
-    # So lets prepare all the data chunks and farm it out to a subprocess for clean memory handling

     logger.debug(
-        "Using stitching method for large screenshot because page height exceeds threshold"
-    )
+        f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")

-    # Limit the total capture height
-    capture_height = min(page_height, MAX_TOTAL_HEIGHT)
-
-    # Calculate number of chunks needed using ORIGINAL viewport height
-    num_chunks = (capture_height + page.viewport_size['height'] - 1) // page.viewport_size['height']
-    screenshot_chunks = []
-
-    # Track cumulative paste position
-    y_offset = 0
-    for _ in range(num_chunks):
-
-        page.request_gc()
-        page.evaluate(f"window.scrollTo(0, {y_offset})")
-        page.request_gc()
-        h = min(page.viewport_size['height'], capture_height - y_offset)
-        screenshot_chunks.append(page.screenshot(
-                type="jpeg",
-                clip={
-                    "x": 0,
-                    "y": 0,
-                    "width": page.viewport_size['width'],
-                    "height": h,
-                },
-                quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
-            ))
-
-        y_offset += h # maybe better to inspect the image here?
-        page.request_gc()
-
-    # PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling
-
-    parent_conn, child_conn = Pipe()
-    p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height))
-    p.start()
-    result = parent_conn.recv_bytes()
-    p.join()
-
-    screenshot_chunks = None
-    logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s")
-
-    return result
+    return screenshot_chunks[0]


 class fetcher(Fetcher):
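`stitch_images_worker` itself is not part of this diff. For orientation, a rough sketch of what such a worker might do, based only on how it is called here (a list of JPEG byte strings in, one stitched JPEG byte string back over the pipe); the real implementation in `screenshot_handler.py` may differ:

import io
from PIL import Image

def stitch_images_worker_sketch(conn, chunks, page_height, capture_height):
    # Runs in a child process, so any PIL allocations die with the process
    images = [Image.open(io.BytesIO(c)) for c in chunks]
    canvas = Image.new("RGB", (images[0].width, min(page_height, capture_height)), "white")
    y = 0
    for im in images:
        canvas.paste(im, (0, y))
        y += im.height
    out = io.BytesIO()
    canvas.save(out, format="JPEG")
    conn.send_bytes(out.getvalue())   # matches parent_conn.recv_bytes() in the caller
    conn.close()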
@@ -292,6 +266,7 @@ class fetcher(Fetcher):
             self.page.request_gc()

             self.content = self.page.content()
+            self.page.request_gc()
             logger.debug(f"Scrape xPath element data in browser done in {time.time() - now:.2f}s")

             # Bug 3 in Playwright screenshot handling

@@ -316,5 +291,28 @@ class fetcher(Fetcher):
                     pass

                 # Clean up resources properly
-                context.close()
-                browser.close()
+                try:
+                    self.page.request_gc()
+                except:
+                    pass
+
+                try:
+                    self.page.close()
+                except:
+                    pass
+                self.page = None
+
+                try:
+                    context.close()
+                except:
+                    pass
+                context = None
+
+                try:
+                    browser.close()
+                except:
+                    pass
+                browser = None

changedetectionio/content_fetchers/puppeteer.py

@@ -7,10 +7,11 @@ from urllib.parse import urlparse
 from loguru import logger

 from changedetectionio.content_fetchers import SCREENSHOT_MAX_HEIGHT_DEFAULT, visualselector_xpath_selectors, \
-    SCREENSHOT_SIZE_STITCH_THRESHOLD, MAX_TOTAL_HEIGHT, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS
+    SCREENSHOT_SIZE_STITCH_THRESHOLD, SCREENSHOT_DEFAULT_QUALITY, XPATH_ELEMENT_JS, INSTOCK_DATA_JS, \
+    SCREENSHOT_MAX_TOTAL_HEIGHT
 from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
-from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
-from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
+from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, \
+    BrowserConnectError


 # Bug 3 in Playwright screenshot handling
@@ -27,71 +28,54 @@ async def capture_full_page(page):
     start = time.time()

     page_height = await page.evaluate("document.documentElement.scrollHeight")
+    page_width = await page.evaluate("document.documentElement.scrollWidth")
+    original_viewport = page.viewport

-    logger.debug(f"Puppeteer viewport size {page.viewport}")
+    logger.debug(f"Puppeteer viewport size {page.viewport} page height {page_height} page width {page_width}")

-    ############################################################
-    #### SCREENSHOT FITS INTO ONE SNAPSHOT (SMALLER PAGES) #####
-    ############################################################
     # Bug 3 in Playwright screenshot handling
     # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
     # JPEG is better here because the screenshots can be very very large

-    # Optimization to avoid unnecessary stitching if we can avoid it
-    # Use the default screenshot method for smaller pages to take advantage
-    # of GPU and native playwright screenshot optimizations
-    # - No PIL needed here, no danger of memory leaks, no sub process required
-    if (page_height < SCREENSHOT_SIZE_STITCH_THRESHOLD and page_height < MAX_TOTAL_HEIGHT ):
-        logger.debug("Using default screenshot method")
-        await page.evaluate(f"window.scrollTo(0, 0)")
-        screenshot = await page.screenshot(
-            type_="jpeg",
-            quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
-            fullPage=True,
-        )
-        logger.debug(f"Screenshot captured in {time.time() - start:.2f}s")
+    # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
+    # which will significantly increase the IO size between the server and client, it's recommended to use the lowest
+    # acceptable screenshot quality here

+    step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Something that will not cause the GPU to overflow when taking the screenshot
+    screenshot_chunks = []
+    y = 0
+    if page_height > page.viewport['height']:
+        if page_height < step_size:
+            step_size = page_height # Incase page is bigger than default viewport but smaller than proposed step size
+        await page.setViewport({'width': page.viewport['width'], 'height': step_size})

+    while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
+        await page.evaluate(f"window.scrollTo(0, {y})")
+        screenshot_chunks.append(await page.screenshot(type_='jpeg',
+                                                       fullPage=False,
+                                                       quality=int(os.getenv("SCREENSHOT_QUALITY", 72))))
+        y += step_size

+    await page.setViewport({'width': original_viewport['width'], 'height': original_viewport['height']})

+    if len(screenshot_chunks) > 1:
+        from changedetectionio.content_fetchers.screenshot_handler import stitch_images_worker
+        logger.debug(f"Screenshot stitching {len(screenshot_chunks)} chunks together")
+        parent_conn, child_conn = Pipe()
+        p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, SCREENSHOT_MAX_TOTAL_HEIGHT))
+        p.start()
+        screenshot = parent_conn.recv_bytes()
+        p.join()
+        logger.debug(
+            f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
+
+        screenshot_chunks = None
+        return screenshot

-    ###################################################################################
-    #### CASE FOR LARGE SCREENSHOTS THAT NEED TO BE TRIMMED DUE TO MEMORY ISSUES  #####
-    ###################################################################################
-    # - PIL can easily allocate memory and not release it cleanly
-    # - Fetching screenshot from playwright seems OK
-    # Image.new is leaky even with .close()
-    # So lets prepare all the data chunks and farm it out to a subprocess for clean memory handling

     logger.debug(
-        "Using stitching method for large screenshot because page height exceeds threshold"
-    )
-
-    # Limit the total capture height
-    capture_height = min(page_height, MAX_TOTAL_HEIGHT)
-
-    # Calculate number of chunks needed using ORIGINAL viewport height
-    num_chunks = (capture_height + page.viewport['height'] - 1) // page.viewport['height']
-    screenshot_chunks = []
-
-    # Track cumulative paste position
-    y_offset = 0
-    for _ in range(num_chunks):
-        await page.evaluate(f"window.scrollTo(0, {y_offset})")
-        h = min(page.viewport['height'], capture_height - y_offset)
-        screenshot_chunks.append(await page.screenshot(
-                type_="jpeg",
-                quality=int(os.getenv("SCREENSHOT_QUALITY", SCREENSHOT_DEFAULT_QUALITY)),
-            ))
-
-        y_offset += h # maybe better to inspect the image here?
-
-    # PIL can leak memory in various situations, assign the work to a subprocess for totally clean handling
-
-    parent_conn, child_conn = Pipe()
-    p = Process(target=stitch_images_worker, args=(child_conn, screenshot_chunks, page_height, capture_height))
-    p.start()
-    result = parent_conn.recv_bytes()
-    p.join()
-
-    screenshot_chunks = None
-    logger.debug(f"Screenshot - Page height: {page_height} Capture height: {capture_height} - Stitched together in {time.time() - start:.2f}s")
-
-    return result
+        f"Screenshot Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
+    return screenshot_chunks[0]


 class fetcher(Fetcher):

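One detail unique to the puppeteer fetcher, noted in the comments above: screenshots travel over the ws:// DevTools transport base64-encoded, so every JPEG byte costs roughly a third extra on the wire, which is why a low `SCREENSHOT_QUALITY` pays off twice here. The overhead is easy to verify:

import base64

raw = b"\xff" * 3_000_000        # stand-in for a ~3 MB JPEG
encoded = base64.b64encode(raw)
print(len(encoded) / len(raw))   # 1.333... (~33% larger)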
changedetectionio/flask_app.py

@@ -33,7 +33,7 @@ from loguru import logger

 from changedetectionio import __version__
 from changedetectionio import queuedWatchMetaData
-from changedetectionio.api import Watch, WatchHistory, WatchSingleHistory, CreateWatch, Import, SystemInfo, Tag, Tags
+from changedetectionio.api import Watch, WatchHistory, WatchSingleHistory, CreateWatch, Import, SystemInfo, Tag, Tags, Notifications
 from changedetectionio.api.Search import Search
 from .time_handler import is_within_schedule

@@ -285,7 +285,8 @@ def changedetection_app(config=None, datastore_o=None):
     watch_api.add_resource(Search, '/api/v1/search',
                            resource_class_kwargs={'datastore': datastore})

-
+    watch_api.add_resource(Notifications, '/api/v1/notifications',
+                           resource_class_kwargs={'datastore': datastore})

     @login_manager.user_loader
     def user_loader(email):

changedetectionio/html_tools.py

@@ -435,7 +435,9 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False

     return re.sub(pattern, repl, html_content)

-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str:
+
+def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):

     from inscriptis import get_text
     from inscriptis.model.config import ParserConfig

@@ -470,9 +472,19 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
         html_content = re.sub(r'</title>', r'</h1>', html_content)

     text_content = get_text(html_content, config=parser_config)
+    conn.send(text_content)
+    conn.close()

-    return text_content
+# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
+    from multiprocessing import Process, Pipe
+
+    parent_conn, child_conn = Pipe()
+    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
+    p.start()
+    text = parent_conn.recv()
+    p.join()
+    return text

 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):

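The public `html_to_text()` signature is unchanged; only the execution model differs, with the inscriptis/libxml work pushed into a throwaway child process so native-library leaks cannot accumulate in the long-running server. The trade-off is one process spawn per conversion. Callers behave as before (a sketch; exact whitespace depends on the installed inscriptis version):

from changedetectionio.html_tools import html_to_text

text = html_to_text("<p>Hello <b>world</b></p>")
print(text)   # -> Hello world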
changedetectionio/store.py

@@ -964,3 +964,25 @@ class ChangeDetectionStore:
                         f_d.write(zlib.compress(f_j.read()))
                         os.unlink(json_path)

+    def add_notification_url(self, notification_url):
+
+        logger.debug(f">>> Adding new notification_url - '{notification_url}'")
+
+        notification_urls = self.data['settings']['application'].get('notification_urls', [])
+
+        # Cheap pre-check without taking the lock
+        if notification_url in notification_urls:
+            return notification_url
+
+        with self.lock:
+            # Re-check under the lock in case another thread added it meanwhile
+            notification_urls = self.__data['settings']['application'].get('notification_urls', [])
+
+            if notification_url in notification_urls:
+                return notification_url
+
+            # Append and update the datastore
+            notification_urls.append(notification_url)
+            self.__data['settings']['application']['notification_urls'] = notification_urls
+            self.needs_write = True
+
+        return notification_url

@@ -443,6 +443,10 @@ Math: {{ 1 + 1 }}") }}
                         </tr>
                         </tbody>
                     </table>
+
+                    <h4>Text similarity</h4>
+                    <p><strong>Levenshtein Distance</strong> - Last 2 snapshots: {{ lev_info }}</p>
+                    <p style="max-width: 80%; font-size: 80%"><strong>Levenshtein Distance</strong>: the minimum number of insertions, deletions, and substitutions required to change one text into the other.</p>
                     {% if watch.history_n %}
                         <p>
                              <a href="{{url_for('ui.ui_edit.watch_get_latest_html', uuid=uuid)}}" class="pure-button button-small">Download latest HTML snapshot</a>

changedetectionio/tests/test_api_notifications.py (new file, 108 lines added)

@@ -0,0 +1,108 @@
#!/usr/bin/env python3

from flask import url_for
from .util import live_server_setup
import json

def test_api_notifications_crud(client, live_server):
    live_server_setup(live_server)
    api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')

    # Confirm notifications are initially empty
    res = client.get(
        url_for("notifications"),
        headers={'x-api-key': api_key}
    )
    assert res.status_code == 200
    assert res.json == {"notification_urls": []}

    # Add notification URLs
    test_urls = ["posts://example.com/notify1", "posts://example.com/notify2"]
    res = client.post(
        url_for("notifications"),
        data=json.dumps({"notification_urls": test_urls}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 201
    for url in test_urls:
        assert url in res.json["notification_urls"]

    # Confirm the notification URLs were added
    res = client.get(
        url_for("notifications"),
        headers={'x-api-key': api_key}
    )
    assert res.status_code == 200
    for url in test_urls:
        assert url in res.json["notification_urls"]

    # Delete one notification URL
    res = client.delete(
        url_for("notifications"),
        data=json.dumps({"notification_urls": [test_urls[0]]}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 204

    # Confirm it was removed and the other remains
    res = client.get(
        url_for("notifications"),
        headers={'x-api-key': api_key}
    )
    assert res.status_code == 200
    assert test_urls[0] not in res.json["notification_urls"]
    assert test_urls[1] in res.json["notification_urls"]

    # Try deleting a non-existent URL
    res = client.delete(
        url_for("notifications"),
        data=json.dumps({"notification_urls": ["posts://nonexistent.com"]}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 400

    res = client.post(
        url_for("notifications"),
        data=json.dumps({"notification_urls": test_urls}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 201

    # Replace with a new list
    replacement_urls = ["posts://new.example.com"]
    res = client.put(
        url_for("notifications"),
        data=json.dumps({"notification_urls": replacement_urls}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 200
    assert res.json["notification_urls"] == replacement_urls

    # Replace with an empty list
    res = client.put(
        url_for("notifications"),
        data=json.dumps({"notification_urls": []}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 200
    assert res.json["notification_urls"] == []

    # Provide an invalid AppRise URL to trigger a validation error
    invalid_urls = ["ftp://not-app-rise"]
    res = client.post(
        url_for("notifications"),
        data=json.dumps({"notification_urls": invalid_urls}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 400
    assert "is not a valid AppRise URL." in res.data.decode()

    res = client.put(
        url_for("notifications"),
        data=json.dumps({"notification_urls": invalid_urls}),
        headers={'content-type': 'application/json', 'x-api-key': api_key}
    )
    assert res.status_code == 400
    assert "is not a valid AppRise URL." in res.data.decode()

changedetectionio/tests/test_backend.py

@@ -74,6 +74,11 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
     res = client.get(url_for("ui.ui_edit.watch_get_latest_html", uuid=uuid))
     assert b'which has this one new line' in res.data

+    # Check the 'levenshtein' distance calc showed something useful
+    res = client.get(url_for("ui.ui_edit.edit_page", uuid=uuid))
+    assert b'Last 2 snapshots: 17' in res.data
+
+
     # Now something should be ready, indicated by having a 'unviewed' class
     res = client.get(url_for("watchlist.index"))
     assert b'unviewed' in res.data

requirements.txt

@@ -68,6 +68,8 @@ openpyxl
 jq~=1.3; python_version >= "3.8" and sys_platform == "darwin"
 jq~=1.3; python_version >= "3.8" and sys_platform == "linux"

+levenshtein
+
 # playwright is installed at Dockerfile build time because it's not available on all platforms

 pyppeteer-ng==2.0.0rc9