mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-10-30 22:27:52 +00:00 
			
		
		
		
	Compare commits
	
		
			7 Commits
		
	
	
		
			gh-paralle
			...
			browserste
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | e06275a4ad | ||
|   | acb642a937 | ||
|   | 3e7f2f2bad | ||
|   | 1f7a855529 | ||
|   | fa549b6e39 | ||
|   | 7ea66929e1 | ||
|   | f682a80c43 | 
							
								
								
									
										6
									
								
								.github/workflows/pypi-release.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										6
									
								
								.github/workflows/pypi-release.yml
									
									
									
									
										vendored
									
									
								
							| @@ -45,8 +45,12 @@ jobs: | ||||
|     - name: Test that the basic pip built package runs without error | ||||
|       run: | | ||||
|         set -ex | ||||
|         pip3 install dist/changedetection.io*.whl | ||||
|         ls -alR  | ||||
|          | ||||
|         # Find and install the first .whl file | ||||
|         find dist -type f -name "*.whl" -exec pip3 install {} \; -quit | ||||
|         changedetection.io -d /tmp -p 10000 & | ||||
|          | ||||
|         sleep 3 | ||||
|         curl --retry-connrefused --retry 6 http://127.0.0.1:10000/static/styles/pure-min.css >/dev/null | ||||
|         curl --retry-connrefused --retry 6 http://127.0.0.1:10000/ >/dev/null | ||||
|   | ||||
| @@ -22,7 +22,10 @@ from loguru import logger | ||||
|  | ||||
| browsersteps_sessions = {} | ||||
| io_interface_context = None | ||||
|  | ||||
| import json | ||||
| import base64 | ||||
| import hashlib | ||||
| from flask import Response | ||||
|  | ||||
| def construct_blueprint(datastore: ChangeDetectionStore): | ||||
|     browser_steps_blueprint = Blueprint('browser_steps', __name__, template_folder="templates") | ||||
| @@ -160,14 +163,13 @@ def construct_blueprint(datastore: ChangeDetectionStore): | ||||
|         if not browsersteps_sessions.get(browsersteps_session_id): | ||||
|             return make_response('No session exists under that ID', 500) | ||||
|  | ||||
|  | ||||
|         is_last_step = False | ||||
|         # Actions - step/apply/etc, do the thing and return state | ||||
|         if request.method == 'POST': | ||||
|             # @todo - should always be an existing session | ||||
|             step_operation = request.form.get('operation') | ||||
|             step_selector = request.form.get('selector') | ||||
|             step_optional_value = request.form.get('optional_value') | ||||
|             step_n = int(request.form.get('step_n')) | ||||
|             is_last_step = strtobool(request.form.get('is_last_step')) | ||||
|  | ||||
|             # @todo try.. except.. nice errors not popups.. | ||||
| @@ -182,16 +184,6 @@ def construct_blueprint(datastore: ChangeDetectionStore): | ||||
|                 # Try to find something of value to give back to the user | ||||
|                 return make_response(str(e).splitlines()[0], 401) | ||||
|  | ||||
|             # Get visual selector ready/update its data (also use the current filter info from the page?) | ||||
|             # When the last 'apply' button was pressed | ||||
|             # @todo this adds overhead because the xpath selection is happening twice | ||||
|             u = browsersteps_sessions[browsersteps_session_id]['browserstepper'].page.url | ||||
|             if is_last_step and u: | ||||
|                 (screenshot, xpath_data) = browsersteps_sessions[browsersteps_session_id]['browserstepper'].request_visualselector_data() | ||||
|                 watch = datastore.data['watching'].get(uuid) | ||||
|                 if watch: | ||||
|                     watch.save_screenshot(screenshot=screenshot) | ||||
|                     watch.save_xpath_data(data=xpath_data) | ||||
|  | ||||
| #        if not this_session.page: | ||||
| #            cleanup_playwright_session() | ||||
| @@ -199,31 +191,35 @@ def construct_blueprint(datastore: ChangeDetectionStore): | ||||
|  | ||||
|         # Screenshots and other info only needed on requesting a step (POST) | ||||
|         try: | ||||
|             state = browsersteps_sessions[browsersteps_session_id]['browserstepper'].get_current_state() | ||||
|             (screenshot, xpath_data) = browsersteps_sessions[browsersteps_session_id]['browserstepper'].get_current_state() | ||||
|             if is_last_step: | ||||
|                 watch = datastore.data['watching'].get(uuid) | ||||
|                 u = browsersteps_sessions[browsersteps_session_id]['browserstepper'].page.url | ||||
|                 if watch and u: | ||||
|                     watch.save_screenshot(screenshot=screenshot) | ||||
|                     watch.save_xpath_data(data=xpath_data) | ||||
|  | ||||
|         except playwright._impl._api_types.Error as e: | ||||
|             return make_response("Browser session ran out of time :( Please reload this page."+str(e), 401) | ||||
|         except Exception as e: | ||||
|             return make_response("Error fetching screenshot and element data - " + str(e), 401) | ||||
|  | ||||
|         # Use send_file() which is way faster than read/write loop on bytes | ||||
|         import json | ||||
|         from tempfile import mkstemp | ||||
|         from flask import send_file | ||||
|         tmp_fd, tmp_file = mkstemp(text=True, suffix=".json", prefix="changedetectionio-") | ||||
|         # SEND THIS BACK TO THE BROWSER | ||||
|  | ||||
|         output = json.dumps({'screenshot': "data:image/jpeg;base64,{}".format( | ||||
|             base64.b64encode(state[0]).decode('ascii')), | ||||
|             'xpath_data': state[1], | ||||
|             'session_age_start': browsersteps_sessions[browsersteps_session_id]['browserstepper'].age_start, | ||||
|             'browser_time_remaining': round(remaining) | ||||
|         }) | ||||
|         output = { | ||||
|             "screenshot": f"data:image/jpeg;base64,{base64.b64encode(screenshot).decode('ascii')}", | ||||
|             "xpath_data": xpath_data, | ||||
|             "session_age_start": browsersteps_sessions[browsersteps_session_id]['browserstepper'].age_start, | ||||
|             "browser_time_remaining": round(remaining) | ||||
|         } | ||||
|         json_data = json.dumps(output) | ||||
|  | ||||
|         with os.fdopen(tmp_fd, 'w') as f: | ||||
|             f.write(output) | ||||
|         # Generate an ETag (hash of the response body) | ||||
|         etag_hash = hashlib.md5(json_data.encode('utf-8')).hexdigest() | ||||
|  | ||||
|         response = make_response(send_file(path_or_file=tmp_file, | ||||
|                                            mimetype='application/json; charset=UTF-8', | ||||
|                                            etag=True)) | ||||
|         # No longer needed | ||||
|         os.unlink(tmp_file) | ||||
|         # Create the response with ETag | ||||
|         response = Response(json_data, mimetype="application/json; charset=UTF-8") | ||||
|         response.set_etag(etag_hash) | ||||
|  | ||||
|         return response | ||||
|  | ||||
|   | ||||
| @@ -1,14 +1,15 @@ | ||||
| #!/usr/bin/env python3 | ||||
|  | ||||
| import os | ||||
| import time | ||||
| import re | ||||
| from random import randint | ||||
| from loguru import logger | ||||
|  | ||||
| from changedetectionio.content_fetchers.helpers import capture_stitched_together_full_page, SCREENSHOT_SIZE_STITCH_THRESHOLD | ||||
| from changedetectionio.content_fetchers.base import manage_user_agent | ||||
| from changedetectionio.safe_jinja import render as jinja_render | ||||
|  | ||||
|  | ||||
|  | ||||
| # Two flags, tell the JS which of the "Selector" or "Value" field should be enabled in the front end | ||||
| # 0- off, 1- on | ||||
| browser_step_ui_config = {'Choose one': '0 0', | ||||
| @@ -279,6 +280,7 @@ class browsersteps_live_ui(steppable_browser_interface): | ||||
|         logger.debug(f"Time to browser setup {time.time()-now:.2f}s") | ||||
|         self.page.wait_for_timeout(1 * 1000) | ||||
|  | ||||
|  | ||||
|     def mark_as_closed(self): | ||||
|         logger.debug("Page closed, cleaning up..") | ||||
|  | ||||
| @@ -296,39 +298,30 @@ class browsersteps_live_ui(steppable_browser_interface): | ||||
|         now = time.time() | ||||
|         self.page.wait_for_timeout(1 * 1000) | ||||
|  | ||||
|         # The actual screenshot | ||||
|         screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=40) | ||||
|  | ||||
|         full_height = self.page.evaluate("document.documentElement.scrollHeight") | ||||
|  | ||||
|         if full_height >= SCREENSHOT_SIZE_STITCH_THRESHOLD: | ||||
|             logger.warning(f"Page full Height: {full_height}px longer than {SCREENSHOT_SIZE_STITCH_THRESHOLD}px, using 'stitched screenshot method'.") | ||||
|             screenshot = capture_stitched_together_full_page(self.page) | ||||
|         else: | ||||
|             screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=40) | ||||
|  | ||||
|         logger.debug(f"Time to get screenshot from browser {time.time() - now:.2f}s") | ||||
|  | ||||
|         now = time.time() | ||||
|         self.page.evaluate("var include_filters=''") | ||||
|         # Go find the interactive elements | ||||
|         # @todo in the future, something smarter that can scan for elements with .click/focus etc event handlers? | ||||
|         elements = 'a,button,input,select,textarea,i,th,td,p,li,h1,h2,h3,h4,div,span' | ||||
|         xpath_element_js = xpath_element_js.replace('%ELEMENTS%', elements) | ||||
|  | ||||
|         xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}") | ||||
|         # So the JS will find the smallest one first | ||||
|         xpath_data['size_pos'] = sorted(xpath_data['size_pos'], key=lambda k: k['width'] * k['height'], reverse=True) | ||||
|         logger.debug(f"Time to complete get_current_state of browser {time.time()-now:.2f}s") | ||||
|         # except | ||||
|         logger.debug(f"Time to scrape xpath element data in browser {time.time()-now:.2f}s") | ||||
|  | ||||
|         # playwright._impl._api_types.Error: Browser closed. | ||||
|         # @todo show some countdown timer? | ||||
|         return (screenshot, xpath_data) | ||||
|  | ||||
|     def request_visualselector_data(self): | ||||
|         """ | ||||
|         Does the same thing as the playwright operation in content_fetcher | ||||
|         This is used to just bump the VisualSelector data so it's ready to go if they click on the tab | ||||
|         @todo refactor and remove duplicate code, add include_filters | ||||
|         :param xpath_data: | ||||
|         :param screenshot: | ||||
|         :param current_include_filters: | ||||
|         :return: | ||||
|         """ | ||||
|         import importlib.resources | ||||
|         self.page.evaluate("var include_filters=''") | ||||
|         xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text() | ||||
|         from changedetectionio.content_fetchers import visualselector_xpath_selectors | ||||
|         xpath_element_js = xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) | ||||
|         xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}") | ||||
|         screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 72))) | ||||
|  | ||||
|         return (screenshot, xpath_data) | ||||
|   | ||||
							
								
								
									
										104
									
								
								changedetectionio/content_fetchers/helpers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										104
									
								
								changedetectionio/content_fetchers/helpers.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,104 @@ | ||||
|  | ||||
| # Pages with a vertical height longer than this will use the 'stitch together' method. | ||||
|  | ||||
| # - Many GPUs have a max texture size of 16384x16384px (or lower on older devices). | ||||
| # - If a page is taller than ~8000–10000px, it risks exceeding GPU memory limits. | ||||
| # - This is especially important on headless Chromium, where Playwright may fail to allocate a massive full-page buffer. | ||||
|  | ||||
|  | ||||
| # The size at which we will switch to stitching method | ||||
| SCREENSHOT_SIZE_STITCH_THRESHOLD=8000 | ||||
|  | ||||
| from loguru import logger | ||||
|  | ||||
| def capture_stitched_together_full_page(page): | ||||
|     import io | ||||
|     import os | ||||
|     import time | ||||
|     from PIL import Image, ImageDraw, ImageFont | ||||
|  | ||||
|     MAX_TOTAL_HEIGHT = SCREENSHOT_SIZE_STITCH_THRESHOLD*4  # Maximum total height for the final image (When in stitch mode) | ||||
|     MAX_CHUNK_HEIGHT = 4000  # Height per screenshot chunk | ||||
|     WARNING_TEXT_HEIGHT = 20  # Height of the warning text overlay | ||||
|  | ||||
|     # Save the original viewport size | ||||
|     original_viewport = page.viewport_size | ||||
|     now = time.time() | ||||
|  | ||||
|     try: | ||||
|         viewport = page.viewport_size | ||||
|         page_height = page.evaluate("document.documentElement.scrollHeight") | ||||
|  | ||||
|         # Limit the total capture height | ||||
|         capture_height = min(page_height, MAX_TOTAL_HEIGHT) | ||||
|  | ||||
|         images = [] | ||||
|         total_captured_height = 0 | ||||
|  | ||||
|         for offset in range(0, capture_height, MAX_CHUNK_HEIGHT): | ||||
|             # Ensure we do not exceed the total height limit | ||||
|             chunk_height = min(MAX_CHUNK_HEIGHT, MAX_TOTAL_HEIGHT - total_captured_height) | ||||
|  | ||||
|             # Adjust viewport size for this chunk | ||||
|             page.set_viewport_size({"width": viewport["width"], "height": chunk_height}) | ||||
|  | ||||
|             # Scroll to the correct position | ||||
|             page.evaluate(f"window.scrollTo(0, {offset})") | ||||
|  | ||||
|             # Capture screenshot chunk | ||||
|             screenshot_bytes = page.screenshot(type='jpeg', quality=int(os.getenv("SCREENSHOT_QUALITY", 30))) | ||||
|             images.append(Image.open(io.BytesIO(screenshot_bytes))) | ||||
|  | ||||
|             total_captured_height += chunk_height | ||||
|  | ||||
|             # Stop if we reached the maximum total height | ||||
|             if total_captured_height >= MAX_TOTAL_HEIGHT: | ||||
|                 break | ||||
|  | ||||
|         # Create the final stitched image | ||||
|         stitched_image = Image.new('RGB', (viewport["width"], total_captured_height)) | ||||
|         y_offset = 0 | ||||
|  | ||||
|         # Stitch the screenshot chunks together | ||||
|         for img in images: | ||||
|             stitched_image.paste(img, (0, y_offset)) | ||||
|             y_offset += img.height | ||||
|  | ||||
|         logger.debug(f"Screenshot stitched together in {time.time()-now:.2f}s") | ||||
|  | ||||
|         # Overlay warning text if the screenshot was trimmed | ||||
|         if page_height > MAX_TOTAL_HEIGHT: | ||||
|             draw = ImageDraw.Draw(stitched_image) | ||||
|             warning_text = f"WARNING: Screenshot was {page_height}px but trimmed to {MAX_TOTAL_HEIGHT}px because it was too long" | ||||
|  | ||||
|             # Load font (default system font if Arial is unavailable) | ||||
|             try: | ||||
|                 font = ImageFont.truetype("arial.ttf", WARNING_TEXT_HEIGHT)  # Arial (Windows/Mac) | ||||
|             except IOError: | ||||
|                 font = ImageFont.load_default()  # Default font if Arial not found | ||||
|  | ||||
|             # Get text bounding box (correct method for newer Pillow versions) | ||||
|             text_bbox = draw.textbbox((0, 0), warning_text, font=font) | ||||
|             text_width = text_bbox[2] - text_bbox[0]  # Calculate text width | ||||
|             text_height = text_bbox[3] - text_bbox[1]  # Calculate text height | ||||
|  | ||||
|             # Define background rectangle (top of the image) | ||||
|             draw.rectangle([(0, 0), (viewport["width"], WARNING_TEXT_HEIGHT)], fill="white") | ||||
|  | ||||
|             # Center text horizontally within the warning area | ||||
|             text_x = (viewport["width"] - text_width) // 2 | ||||
|             text_y = (WARNING_TEXT_HEIGHT - text_height) // 2 | ||||
|  | ||||
|             # Draw the warning text in red | ||||
|             draw.text((text_x, text_y), warning_text, fill="red", font=font) | ||||
|  | ||||
|         # Save or return the final image | ||||
|         output = io.BytesIO() | ||||
|         stitched_image.save(output, format="JPEG", quality=int(os.getenv("SCREENSHOT_QUALITY", 30))) | ||||
|         screenshot = output.getvalue() | ||||
|  | ||||
|     finally: | ||||
|         # Restore the original viewport size | ||||
|         page.set_viewport_size(original_viewport) | ||||
|  | ||||
|     return screenshot | ||||
| @@ -4,6 +4,7 @@ from urllib.parse import urlparse | ||||
|  | ||||
| from loguru import logger | ||||
|  | ||||
| from changedetectionio.content_fetchers.helpers import capture_stitched_together_full_page, SCREENSHOT_SIZE_STITCH_THRESHOLD | ||||
| from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent | ||||
| from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable | ||||
|  | ||||
| @@ -89,6 +90,7 @@ class fetcher(Fetcher): | ||||
|         from playwright.sync_api import sync_playwright | ||||
|         import playwright._impl._errors | ||||
|         from changedetectionio.content_fetchers import visualselector_xpath_selectors | ||||
|         import time | ||||
|         self.delete_browser_steps_screenshots() | ||||
|         response = None | ||||
|  | ||||
| @@ -179,6 +181,7 @@ class fetcher(Fetcher): | ||||
|  | ||||
|             self.page.wait_for_timeout(extra_wait * 1000) | ||||
|  | ||||
|             now = time.time() | ||||
|             # So we can find an element on the page where its selector was entered manually (maybe not xPath etc) | ||||
|             if current_include_filters is not None: | ||||
|                 self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters))) | ||||
| @@ -190,6 +193,8 @@ class fetcher(Fetcher): | ||||
|             self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}") | ||||
|  | ||||
|             self.content = self.page.content() | ||||
|             logger.debug(f"Time to scrape xpath element data in browser {time.time() - now:.2f}s") | ||||
|  | ||||
|             # Bug 3 in Playwright screenshot handling | ||||
|             # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it | ||||
|             # JPEG is better here because the screenshots can be very very large | ||||
| @@ -199,10 +204,15 @@ class fetcher(Fetcher): | ||||
|             # acceptable screenshot quality here | ||||
|             try: | ||||
|                 # The actual screenshot - this is always base64 and needs decoding! horrible! huge CPU usage | ||||
|                 self.screenshot = self.page.screenshot(type='jpeg', | ||||
|                                                        full_page=True, | ||||
|                                                        quality=int(os.getenv("SCREENSHOT_QUALITY", 72)), | ||||
|                                                        ) | ||||
|                 full_height = self.page.evaluate("document.documentElement.scrollHeight") | ||||
|  | ||||
|                 if full_height >= SCREENSHOT_SIZE_STITCH_THRESHOLD: | ||||
|                     logger.warning( | ||||
|                         f"Page full Height: {full_height}px longer than {SCREENSHOT_SIZE_STITCH_THRESHOLD}px, using 'stitched screenshot method'.") | ||||
|                     self.screenshot = capture_stitched_together_full_page(self.page) | ||||
|                 else: | ||||
|                     self.screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 30))) | ||||
|  | ||||
|             except Exception as e: | ||||
|                 # It's likely the screenshot was too long/big and something crashed | ||||
|                 raise ScreenshotUnavailable(url=url, status_code=self.status_code) | ||||
|   | ||||
| @@ -41,7 +41,7 @@ const findUpTag = (el) => { | ||||
|  | ||||
|     //  Strategy 1: If it's an input, with name, and there's only one, prefer that | ||||
|     if (el.name !== undefined && el.name.length) { | ||||
|         var proposed = el.tagName + "[name=" + el.name + "]"; | ||||
|         var proposed = el.tagName + "[name=\"" + CSS.escape(el.name) + "\"]"; | ||||
|         var proposed_element = window.document.querySelectorAll(proposed); | ||||
|         if (proposed_element.length) { | ||||
|             if (proposed_element.length === 1) { | ||||
| @@ -102,13 +102,15 @@ function collectVisibleElements(parent, visibleElements) { | ||||
|     const children = parent.children; | ||||
|     for (let i = 0; i < children.length; i++) { | ||||
|         const child = children[i]; | ||||
|         const computedStyle = window.getComputedStyle(child); | ||||
|  | ||||
|         if ( | ||||
|             child.nodeType === Node.ELEMENT_NODE && | ||||
|             window.getComputedStyle(child).display !== 'none' && | ||||
|             window.getComputedStyle(child).visibility !== 'hidden' && | ||||
|             computedStyle.display !== 'none' && | ||||
|             computedStyle.visibility !== 'hidden' && | ||||
|             child.offsetWidth >= 0 && | ||||
|             child.offsetHeight >= 0 && | ||||
|             window.getComputedStyle(child).contentVisibility !== 'hidden' | ||||
|             computedStyle.contentVisibility !== 'hidden' | ||||
|         ) { | ||||
|             // If the child is an element and is visible, recursively collect visible elements | ||||
|             collectVisibleElements(child, visibleElements); | ||||
| @@ -173,6 +175,7 @@ visibleElementsArray.forEach(function (element) { | ||||
|  | ||||
|     // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training. | ||||
|     const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) &&  /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ; | ||||
|     const computedStyle = window.getComputedStyle(element); | ||||
|  | ||||
|     size_pos.push({ | ||||
|         xpath: xpath_result, | ||||
| @@ -184,10 +187,10 @@ visibleElementsArray.forEach(function (element) { | ||||
|         tagName: (element.tagName) ? element.tagName.toLowerCase() : '', | ||||
|         // tagtype used by Browser Steps | ||||
|         tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '', | ||||
|         isClickable: window.getComputedStyle(element).cursor === "pointer", | ||||
|         isClickable: computedStyle.cursor === "pointer", | ||||
|         // Used by the keras trainer | ||||
|         fontSize: window.getComputedStyle(element).getPropertyValue('font-size'), | ||||
|         fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'), | ||||
|         fontSize: computedStyle.getPropertyValue('font-size'), | ||||
|         fontWeight: computedStyle.getPropertyValue('font-weight'), | ||||
|         hasDigitCurrency: hasDigitCurrency, | ||||
|         label: label, | ||||
|     }); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user