Mirror of https://github.com/dgtlmoon/changedetection.io.git (synced 2025-10-31 06:37:41 +00:00)

Compare commits: 0.40.0...fetch-reli (11 commits)
Commits in this range:

- 6788796788
- efafc9bef8
- b7a2501d64
- e970fef991
- b76148a0f4
- 93cc30437f
- 6562d6e0d4
- 6c217cc3b6
- f30cdf0674
- 14da0646a7
- b413cdecc7
```
@@ -1,7 +1,7 @@
# pip dependencies install stage
FROM python:3.8-slim as builder

# rustc compiler would be needed on ARM type devices but theres an issue with some deps not building..
# See `cryptography` pin comment in requirements.txt
ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1

RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -31,8 +31,7 @@ RUN pip install --target=/dependencies playwright~=1.27.1 \
# Final image stage
FROM python:3.8-slim

# Actual packages needed at runtime, usually due to the notification (apprise) backend
# rustc compiler would be needed on ARM type devices but theres an issue with some deps not building..
# See `cryptography` pin comment in requirements.txt
ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1

# Re #93, #73, excluding rustc (adds another 430Mb~)
```
```
@@ -1,9 +1,10 @@
recursive-include changedetectionio/api *
recursive-include changedetectionio/templates *
recursive-include changedetectionio/static *
recursive-include changedetectionio/blueprint *
recursive-include changedetectionio/model *
recursive-include changedetectionio/tests *
recursive-include changedetectionio/res *
recursive-include changedetectionio/static *
recursive-include changedetectionio/templates *
recursive-include changedetectionio/tests *
prune changedetectionio/static/package-lock.json
prune changedetectionio/static/styles/node_modules
prune changedetectionio/static/styles/package-lock.json
```

README.md (22 lines changed):
````
@@ -187,11 +187,29 @@ When you enable a `json:` or `jq:` filter, you can even automatically extract an
<html>
...
<script type="application/ld+json">
  {"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula  800g","price": 23.50 }

{
   "@context":"http://schema.org/",
   "@type":"Product",
   "offers":{
      "@type":"Offer",
      "availability":"http://schema.org/InStock",
      "price":"3949.99",
      "priceCurrency":"USD",
      "url":"https://www.newegg.com/p/3D5-000D-001T1"
   },
   "description":"Cobratype King Cobra Hero Desktop Gaming PC",
   "name":"Cobratype King Cobra Hero Desktop Gaming PC",
   "sku":"3D5-000D-001T1",
   "itemCondition":"NewCondition"
}
</script>
```

`json:$.price` or `jq:.price` would give `23.50`, or you can extract the whole structure
`json:$..price` or `jq:..price` would give `3949.99`, or you can extract the whole structure (use a JSONpath test website to validate with)

The application also supports notifying you that it can follow this information automatically


## Proxy Configuration
````
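A quick way to validate a rule like `json:$..price` before saving a watch is to run the same JSONPath over the ld+json payload by hand. A minimal sketch, assuming the `jsonpath-ng` package (the JSONPath implementation this project lists as a dependency; `jq:` rules behave similarly via the `jq` package):

```python
import json
from jsonpath_ng.ext import parse  # pip install jsonpath-ng

ldjson = """{
   "@context": "http://schema.org/",
   "@type": "Product",
   "offers": {
      "@type": "Offer",
      "price": "3949.99",
      "priceCurrency": "USD"
   },
   "name": "Cobratype King Cobra Hero Desktop Gaming PC"
}"""

# `$..price` descends recursively, so it matches `price` anywhere in the structure
matches = [m.value for m in parse('$..price').find(json.loads(ldjson))]
print(matches)  # ['3949.99']
```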
```
@@ -10,6 +10,7 @@ import threading
import time
import timeago

from changedetectionio import queuedWatchMetaData
from copy import deepcopy
from distutils.util import strtobool
from feedgen.feed import FeedGenerator
@@ -35,7 +36,7 @@ from flask_wtf import CSRFProtect
from changedetectionio import html_tools
from changedetectionio.api import api_v1

__version__ = '0.40.0'
__version__ = '0.40.0.2'

datastore = None

@@ -404,7 +405,6 @@ def changedetection_app(config=None, datastore_o=None):
                sorted_watches.append(watch)

        existing_tags = datastore.get_all_tags()

        form = forms.quickWatchForm(request.form)
        output = render_template("watch-overview.html",
                                 form=form,
@@ -416,7 +416,7 @@ def changedetection_app(config=None, datastore_o=None):
                                 # Don't link to hosting when we're on the hosting environment
                                 hosted_sticky=os.getenv("SALTED_PASS", False) == False,
                                 guid=datastore.data['app_guid'],
                                 queued_uuids=[uuid for p,uuid in update_q.queue])
                                 queued_uuids=[q_uuid.item['uuid'] for q_uuid in update_q.queue])


        if session.get('share-link'):
@@ -596,25 +596,16 @@ def changedetection_app(config=None, datastore_o=None):
                    using_default_check_time = False
                    break

            # Use the default if its the same as system wide
            # Use the default if it's the same as system-wide.
            if form.fetch_backend.data == datastore.data['settings']['application']['fetch_backend']:
                extra_update_obj['fetch_backend'] = None



             # Ignore text
            form_ignore_text = form.ignore_text.data
            datastore.data['watching'][uuid]['ignore_text'] = form_ignore_text

            # Reset the previous_md5 so we process a new snapshot including stripping ignore text.
            if form_ignore_text:
                if len(datastore.data['watching'][uuid].history):
                    extra_update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)

            # Reset the previous_md5 so we process a new snapshot including stripping ignore text.
            if form.include_filters.data != datastore.data['watching'][uuid].get('include_filters', []):
                if len(datastore.data['watching'][uuid].history):
                    extra_update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)

            # Be sure proxy value is None
            if datastore.proxy_list is not None and form.data['proxy'] == '':
                extra_update_obj['proxy'] = None
@@ -632,7 +623,7 @@ def changedetection_app(config=None, datastore_o=None):
            datastore.needs_write_urgent = True

            # Queue the watch for immediate recheck, with a higher priority
            update_q.put((1, uuid))
            update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))

            # Diff page [edit] link should go back to diff page
            if request.args.get("next") and request.args.get("next") == 'diff':
@@ -773,7 +764,7 @@ def changedetection_app(config=None, datastore_o=None):
                importer = import_url_list()
                importer.run(data=request.values.get('urls'), flash=flash, datastore=datastore)
                for uuid in importer.new_uuids:
                    update_q.put((1, uuid))
                    update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))

                if len(importer.remaining_data) == 0:
                    return redirect(url_for('index'))
@@ -786,7 +777,7 @@ def changedetection_app(config=None, datastore_o=None):
                d_importer = import_distill_io_json()
                d_importer.run(data=request.values.get('distill-io'), flash=flash, datastore=datastore)
                for uuid in d_importer.new_uuids:
                    update_q.put((1, uuid))
                    update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))



@@ -1151,7 +1142,7 @@ def changedetection_app(config=None, datastore_o=None):

        if not add_paused and new_uuid:
            # Straight into the queue.
            update_q.put((1, new_uuid))
            update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid}))
            flash("Watch added.")

        if add_paused:
@@ -1188,7 +1179,7 @@ def changedetection_app(config=None, datastore_o=None):
            uuid = list(datastore.data['watching'].keys()).pop()

        new_uuid = datastore.clone(uuid)
        update_q.put((5, new_uuid))
        update_q.put(queuedWatchMetaData.PrioritizedItem(priority=5, item={'uuid': new_uuid, 'skip_when_checksum_same': True}))
        flash('Cloned.')

        return redirect(url_for('index'))
@@ -1196,7 +1187,7 @@ def changedetection_app(config=None, datastore_o=None):
    @app.route("/api/checknow", methods=['GET'])
    @login_required
    def form_watch_checknow():

        # Forced recheck will skip the 'skip if content is the same' rule (, 'reprocess_existing_data': True})))
        tag = request.args.get('tag')
        uuid = request.args.get('uuid')
        i = 0
@@ -1205,11 +1196,9 @@ def changedetection_app(config=None, datastore_o=None):
        for t in running_update_threads:
            running_uuids.append(t.current_uuid)

        # @todo check thread is running and skip

        if uuid:
            if uuid not in running_uuids:
                update_q.put((1, uuid))
                update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
            i = 1

        elif tag != None:
@@ -1217,14 +1206,14 @@ def changedetection_app(config=None, datastore_o=None):
            for watch_uuid, watch in datastore.data['watching'].items():
                if (tag != None and tag in watch['tag']):
                    if watch_uuid not in running_uuids and not datastore.data['watching'][watch_uuid]['paused']:
                        update_q.put((1, watch_uuid))
                        update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid, 'skip_when_checksum_same': False}))
                        i += 1

        else:
            # No tag, no uuid, add everything.
            for watch_uuid, watch in datastore.data['watching'].items():
                if watch_uuid not in running_uuids and not datastore.data['watching'][watch_uuid]['paused']:
                    update_q.put((1, watch_uuid))
                    update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid, 'skip_when_checksum_same': False}))
                    i += 1
        flash("{} watches are queued for rechecking.".format(i))
        return redirect(url_for('index', tag=tag))
@@ -1344,7 +1333,7 @@ def changedetection_app(config=None, datastore_o=None):
    app.register_blueprint(browser_steps.construct_blueprint(datastore), url_prefix='/browser-steps')

    import changedetectionio.blueprint.price_data_follower as price_data_follower
    app.register_blueprint(price_data_follower.construct_blueprint(datastore), url_prefix='/price_data_follower')
    app.register_blueprint(price_data_follower.construct_blueprint(datastore, update_q), url_prefix='/price_data_follower')


    # @todo handle ctrl break
@@ -1492,7 +1481,7 @@ def ticker_thread_check_time_launch_checks():
            seconds_since_last_recheck = now - watch['last_checked']

            if seconds_since_last_recheck >= (threshold + watch.jitter_seconds) and seconds_since_last_recheck >= recheck_time_minimum_seconds:
                if not uuid in running_uuids and uuid not in [q_uuid for p,q_uuid in update_q.queue]:
                if not uuid in running_uuids and uuid not in [q_uuid.item['uuid'] for q_uuid in update_q.queue]:

                    # Proxies can be set to have a limit on seconds between which they can be called
                    watch_proxy = datastore.get_preferred_proxy_for_watch(uuid=uuid)
@@ -1523,8 +1512,9 @@ def ticker_thread_check_time_launch_checks():
                            priority,
                            watch.jitter_seconds,
                            now - watch['last_checked']))

                    # Into the queue with you
                    update_q.put((priority, uuid))
                    update_q.put(queuedWatchMetaData.PrioritizedItem(priority=priority, item={'uuid': uuid, 'skip_when_checksum_same': True}))

                    # Reset for next time
                    watch.jitter_seconds = 0
```
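Most of the churn above is one mechanical change: `update_q.put((priority, uuid))` becomes a `PrioritizedItem` carrying a metadata dict. The reason is that `queue.PriorityQueue` orders entries by comparing them whole, so once a dict rides along in a plain tuple, two entries with equal priority fall through to comparing the dicts themselves and raise `TypeError`. A minimal sketch of that failure mode (the `PrioritizedItem` wrapper that avoids it is added as a new file further down):

```python
from queue import PriorityQueue

q = PriorityQueue()
q.put((1, {'uuid': 'aaa', 'skip_when_checksum_same': False}))
try:
    # Same priority, so the tuple comparison falls through to the dict payloads
    q.put((1, {'uuid': 'bbb', 'skip_when_checksum_same': False}))
except TypeError as e:
    print(e)  # "'<' not supported between instances of 'dict' and 'dict'"
```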
```
@@ -1,3 +1,4 @@
from changedetectionio import queuedWatchMetaData
from flask_restful import abort, Resource
from flask import request, make_response
import validators
@@ -24,7 +25,7 @@ class Watch(Resource):
            abort(404, message='No watch exists with the UUID of {}'.format(uuid))

        if request.args.get('recheck'):
            self.update_q.put((1, uuid))
            self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
            return "OK", 200

        # Return without history, get that via another API call
@@ -100,7 +101,7 @@ class CreateWatch(Resource):
        extras = {'title': json_data['title'].strip()} if json_data.get('title') else {}

        new_uuid = self.datastore.add_watch(url=json_data['url'].strip(), tag=tag, extras=extras)
        self.update_q.put((1, new_uuid))
        self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid, 'skip_when_checksum_same': True}))
        return {'uuid': new_uuid}, 201

    # Return concise list of available watches and some very basic info
@@ -118,7 +119,7 @@ class CreateWatch(Resource):

        if request.args.get('recheck_all'):
            for uuid in self.datastore.data['watching'].keys():
                self.update_q.put((1, uuid))
                self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
            return {'status': "OK"}, 200

        return list, 200
```
```
@@ -75,15 +75,13 @@ class steppable_browser_interface():
    def action_goto_url(self, url, optional_value):
        # self.page.set_viewport_size({"width": 1280, "height": 5000})
        now = time.time()
        response = self.page.goto(url, timeout=0, wait_until='domcontentloaded')
        print("Time to goto URL", time.time() - now)
        response = self.page.goto(url, timeout=0, wait_until='commit')

        # Wait_until = commit
        # - `'commit'` - consider operation to be finished when network response is received and the document started loading.
        # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
        # This seemed to solve nearly all 'TimeoutErrors'
        extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))
        self.page.wait_for_timeout(extra_wait * 1000)
        print("Time to goto URL ", time.time() - now)

    def action_click_element_containing_text(self, selector=None, value=''):
        if not len(value.strip()):
```
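The same navigation strategy reappears in the Playwright fetcher below: `wait_until='commit'` returns as soon as the document starts loading, and a fixed, environment-configurable sleep replaces Playwright's load heuristics, which the diff comments say were behind most of the `TimeoutError`s. A standalone sketch of the pattern, assuming Playwright's sync API:

```python
import os
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    # Return once the document has started loading; don't wait for 'load'
    page.goto("https://example.com", timeout=0, wait_until="commit")
    # Then just wait a flat, configurable number of seconds for content to settle
    extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))
    page.wait_for_timeout(extra_wait * 1000)  # milliseconds
    html = page.content()
    browser.close()
```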
```
@@ -3,22 +3,28 @@ from distutils.util import strtobool
from flask import Blueprint, flash, redirect, url_for
from flask_login import login_required
from changedetectionio.store import ChangeDetectionStore
from changedetectionio import queuedWatchMetaData
from queue import PriorityQueue

def construct_blueprint(datastore: ChangeDetectionStore):
PRICE_DATA_TRACK_ACCEPT = 'accepted'
PRICE_DATA_TRACK_REJECT = 'rejected'

def construct_blueprint(datastore: ChangeDetectionStore, update_q: PriorityQueue):

    price_data_follower_blueprint = Blueprint('price_data_follower', __name__)

    @login_required
    @price_data_follower_blueprint.route("/<string:uuid>/accept", methods=['GET'])
    def accept(uuid):
        datastore.data['watching'][uuid]['track_ldjson_price_data'] = 'accepted'
        datastore.data['watching'][uuid]['track_ldjson_price_data'] = PRICE_DATA_TRACK_ACCEPT
        update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
        return redirect(url_for("form_watch_checknow", uuid=uuid))


    @login_required
    @price_data_follower_blueprint.route("/<string:uuid>/reject", methods=['GET'])
    def reject(uuid):
        datastore.data['watching'][uuid]['track_ldjson_price_data'] = 'rejected'
        datastore.data['watching'][uuid]['track_ldjson_price_data'] = PRICE_DATA_TRACK_REJECT
        return redirect(url_for("index"))
```
```
@@ -23,6 +23,9 @@ class Non200ErrorCodeReceived(Exception):
            self.page_text = html_tools.html_to_text(page_html)
        return

class checksumFromPreviousCheckWasTheSame(Exception):
    def __init__(self):
        return

class JSActionExceptions(Exception):
    def __init__(self, status_code, url, screenshot, message=''):
@@ -39,7 +42,7 @@ class BrowserStepsStepTimout(Exception):


class PageUnloadable(Exception):
    def __init__(self, status_code, url, screenshot=False, message=False):
    def __init__(self, status_code, url, message, screenshot=False):
        # Set this so we can use it in other parts of the app
        self.status_code = status_code
        self.url = url
@@ -286,6 +289,8 @@ class base_html_playwright(Fetcher):
                proxy=self.proxy,
                # This is needed to enable JavaScript execution on GitHub and others
                bypass_csp=True,
                # Can't think why we need the service workers for our use case?
                service_workers='block',
                # Should never be needed
                accept_downloads=False
            )
@@ -294,24 +299,34 @@ class base_html_playwright(Fetcher):
            if len(request_headers):
                context.set_extra_http_headers(request_headers)

            try:
                self.page.set_default_navigation_timeout(90000)
                self.page.set_default_timeout(90000)

                # Listen for all console events and handle errors
                self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))

                # Bug - never set viewport size BEFORE page.goto


                # Waits for the next navigation. Using Python context manager
                # prevents a race condition between clicking and waiting for a navigation.
                with self.page.expect_navigation():
                    response = self.page.goto(url, wait_until='load')
            # Goto page
            try:
                # Wait_until = commit
                # - `'commit'` - consider operation to be finished when network response is received and the document started loading.
                # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
                # This seemed to solve nearly all 'TimeoutErrors'
                response = self.page.goto(url, wait_until='commit')
            except playwright._impl._api_types.Error as e:
                # Retry once - https://github.com/browserless/chrome/issues/2485
                # Sometimes errors related to invalid cert's and other can be random
                print ("Content Fetcher > retrying request got error - ", str(e))
                time.sleep(1)
                response = self.page.goto(url, wait_until='commit')

            except Exception as e:
                print ("Content Fetcher > Other exception when page.goto", str(e))
                context.close()
                browser.close()
                raise PageUnloadable(url=url, status_code=None, message=str(e))

            # Execute any browser steps
            try:
                extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
                self.page.wait_for_timeout(extra_wait * 1000)

@@ -324,17 +339,15 @@ class base_html_playwright(Fetcher):
                # This can be ok, we will try to grab what we could retrieve
                pass
            except Exception as e:
                print ("other exception when page.goto")
                print (str(e))
                print ("Content Fetcher > Other exception when executing custom JS code", str(e))
                context.close()
                browser.close()
                raise PageUnloadable(url=url, status_code=None)

                raise PageUnloadable(url=url, status_code=None, message=str(e))

            if response is None:
                context.close()
                browser.close()
                print ("response object was none")
                print ("Content Fetcher > Response object was none")
                raise EmptyReply(url=url, status_code=None)

            # Bug 2(?) Set the viewport size AFTER loading the page
@@ -353,7 +366,7 @@ class base_html_playwright(Fetcher):
            if len(self.page.content().strip()) == 0:
                context.close()
                browser.close()
                print ("Content was empty")
                print ("Content Fetcher > Content was empty")
                raise EmptyReply(url=url, status_code=None)

            # Bug 2(?) Set the viewport size AFTER loading the page
@@ -498,7 +511,7 @@ class base_html_webdriver(Fetcher):
            try:
                self.driver.quit()
            except Exception as e:
                print("Content Fetcher > Exception in chrome shutdown/quit" + str(e))


# "html_requests" is listed as the default fetcher in store.py!
```
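The retry-once behaviour added above is worth isolating. A sketch using Playwright's public `Error` type (the diff catches the internal `playwright._impl._api_types.Error`; `playwright.sync_api.Error` is the public spelling of the same error class, which is the safer import outside this codebase):

```python
import time
from playwright.sync_api import Error as PlaywrightError

def goto_with_one_retry(page, url):
    try:
        return page.goto(url, wait_until='commit')
    except PlaywrightError as e:
        # Transient failures (bad certs, renderer hiccups) can be random;
        # see https://github.com/browserless/chrome/issues/2485
        print("Content Fetcher > retrying request got error -", str(e))
        time.sleep(1)
        return page.goto(url, wait_until='commit')
```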
```
@@ -1,10 +1,13 @@
import hashlib
import json
import logging
import os
import re
import urllib3

from changedetectionio import content_fetcher, html_tools
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
from copy import deepcopy

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

@@ -37,8 +40,7 @@ class perform_site_check():

        return regex

    def run(self, uuid):
        from copy import deepcopy
    def run(self, uuid, skip_when_checksum_same=True):
        changed_detected = False
        screenshot = False  # as bytes
        stripped_text_from_html = ""
@@ -121,6 +123,14 @@ class perform_site_check():
        self.screenshot = fetcher.screenshot
        self.xpath_data = fetcher.xpath_data

        # Watches added automatically in the queue manager will skip if its the same checksum as the previous run
        # Saves a lot of CPU
        update_obj['previous_md5_before_filters'] = hashlib.md5(fetcher.content.encode('utf-8')).hexdigest()
        if skip_when_checksum_same:
            if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
                raise content_fetcher.checksumFromPreviousCheckWasTheSame()


        # Fetching complete, now filters
        # @todo move to class / maybe inside of fetcher abstract base?

@@ -148,7 +158,7 @@ class perform_site_check():
        )

        # Inject a virtual LD+JSON price tracker rule
        if watch.get('track_ldjson_price_data'):
        if watch.get('track_ldjson_price_data', '') == PRICE_DATA_TRACK_ACCEPT:
            include_filters_rule.append(html_tools.LD_JSON_PRODUCT_OFFER_SELECTOR)

        has_filter_rule = include_filters_rule and len("".join(include_filters_rule).strip())
@@ -158,6 +168,14 @@ class perform_site_check():
            include_filters_rule.append("json:$")
            has_filter_rule = True

        if is_json:
            # Sort the JSON so we dont get false alerts when the content is just re-ordered
            try:
                fetcher.content = json.dumps(json.loads(fetcher.content), sort_keys=True)
            except Exception as e:
                # Might have just been a snippet, or otherwise bad JSON, continue
                pass

        if has_filter_rule:
            json_filter_prefixes = ['json:', 'jq:']
            for filter in include_filters_rule:
@@ -165,6 +183,8 @@ class perform_site_check():
                    stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
                    is_html = False



        if is_html or is_source:

            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
```
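Two cheap guards land in `perform_site_check.run()` above. First, an md5 of the raw fetched content is stored as `previous_md5_before_filters`, and when the queue entry sets `skip_when_checksum_same`, an unchanged checksum aborts the run before any filtering. Second, JSON bodies are re-serialised with sorted keys so a payload whose keys merely changed order no longer looks like a change. A compressed sketch of both ideas (in the application the checksum is taken on the raw content and the key sorting happens later, in the filter stage):

```python
import hashlib
import json

def checksum_before_filters(content: str) -> str:
    try:
        # Sort keys so {"a": 1, "b": 2} and {"b": 2, "a": 1} hash identically
        content = json.dumps(json.loads(content), sort_keys=True)
    except ValueError:
        pass  # not JSON (or just a snippet) - hash the content as-is
    return hashlib.md5(content.encode('utf-8')).hexdigest()

previous = checksum_before_filters('{"hello": 123, "world": 123}')
current  = checksum_before_filters('{"world": 123, "hello": 123}')
assert previous == current  # a pure re-ordering would be skipped, saving CPU
```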
```
@@ -14,51 +14,52 @@ from changedetectionio.notification import (

class model(dict):
    __newest_history_key = None
    __history_n=0
    __history_n = 0
    __base_config = {
            #'history': {},  # Dict of timestamp and output stripped filename (removed)
            #'newest_history_key': 0, (removed, taken from history.txt index)
            'body': None,
            'check_unique_lines': False, # On change-detected, compare against all history if its something new
            'check_count': 0,
            'consecutive_filter_failures': 0, # Every time the CSS/xPath filter cannot be located, reset when all is fine.
            'extract_text': [],  # Extract text by regex after filters
            'extract_title_as_title': False,
            'fetch_backend': None,
            'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
            'has_ldjson_price_data': None,
            'track_ldjson_price_data': None,
            'headers': {},  # Extra headers to send
            'ignore_text': [],  # List of text to ignore when calculating the comparison checksum
            'include_filters': [],
            'last_checked': 0,
            'last_error': False,
            'last_viewed': 0,  # history key value of the last viewed via the [diff] link
            'method': 'GET',
             # Custom notification content
            'notification_body': None,
            'notification_format': default_notification_format_for_watch,
            'notification_muted': False,
            'notification_title': None,
            'notification_screenshot': False, # Include the latest screenshot if available and supported by the apprise URL
            'notification_urls': [],  # List of URLs to add to the notification Queue (Usually AppRise)
            'paused': False,
            'previous_md5': False,
            'proxy': None, # Preferred proxy connection
            'subtractive_selectors': [],
            'tag': None,
            'text_should_not_be_present': [], # Text that should not present
            # Re #110, so then if this is set to None, we know to use the default value instead
            # Requires setting to None on submit if it's the same as the default
            # Should be all None by default, so we use the system default in this case.
            'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
            'title': None,
            'trigger_text': [],  # List of text or regex to wait for until a change is detected
            'url': None,
            'uuid': str(uuid.uuid4()),
            'webdriver_delay': None,
            'webdriver_js_execute_code': None, # Run before change-detection
        }
        # 'history': {},  # Dict of timestamp and output stripped filename (removed)
        # 'newest_history_key': 0, (removed, taken from history.txt index)
        'body': None,
        'check_unique_lines': False,  # On change-detected, compare against all history if its something new
        'check_count': 0,
        'consecutive_filter_failures': 0,  # Every time the CSS/xPath filter cannot be located, reset when all is fine.
        'extract_text': [],  # Extract text by regex after filters
        'extract_title_as_title': False,
        'fetch_backend': None,
        'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
        'has_ldjson_price_data': None,
        'track_ldjson_price_data': None,
        'headers': {},  # Extra headers to send
        'ignore_text': [],  # List of text to ignore when calculating the comparison checksum
        'include_filters': [],
        'last_checked': 0,
        'last_error': False,
        'last_viewed': 0,  # history key value of the last viewed via the [diff] link
        'method': 'GET',
        # Custom notification content
        'notification_body': None,
        'notification_format': default_notification_format_for_watch,
        'notification_muted': False,
        'notification_title': None,
        'notification_screenshot': False,  # Include the latest screenshot if available and supported by the apprise URL
        'notification_urls': [],  # List of URLs to add to the notification Queue (Usually AppRise)
        'paused': False,
        'previous_md5': False,
        'previous_md5_before_filters': False,  # Used for skipping changedetection entirely
        'proxy': None,  # Preferred proxy connection
        'subtractive_selectors': [],
        'tag': None,
        'text_should_not_be_present': [],  # Text that should not present
        # Re #110, so then if this is set to None, we know to use the default value instead
        # Requires setting to None on submit if it's the same as the default
        # Should be all None by default, so we use the system default in this case.
        'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
        'title': None,
        'trigger_text': [],  # List of text or regex to wait for until a change is detected
        'url': None,
        'uuid': str(uuid.uuid4()),
        'webdriver_delay': None,
        'webdriver_js_execute_code': None,  # Run before change-detection
    }
    jitter_seconds = 0

    def __init__(self, *arg, **kw):
```

changedetectionio/queuedWatchMetaData.py (new file, 10 lines):
```
@@ -0,0 +1,10 @@
from dataclasses import dataclass, field
from typing import Any

# So that we can queue some metadata in `item`
# https://docs.python.org/3/library/queue.html#queue.PriorityQueue
#
@dataclass(order=True)
class PrioritizedItem:
    priority: int
    item: Any=field(compare=False)
```
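Usage follows directly from the dataclass: ordering considers `priority` alone (lower numbers pop first), and `field(compare=False)` keeps the unorderable dict payload out of the comparison entirely. A small sketch (the `'uuid'` values are just placeholders):

```python
from queue import PriorityQueue

from changedetectionio.queuedWatchMetaData import PrioritizedItem

q = PriorityQueue()
q.put(PrioritizedItem(priority=5, item={'uuid': 'cloned-watch'}))    # e.g. a clone
q.put(PrioritizedItem(priority=1, item={'uuid': 'manual-recheck'}))  # e.g. a user recheck
print(q.get().item['uuid'])  # 'manual-recheck' - the lower priority number wins
```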
```
@@ -174,10 +174,23 @@ if (include_filters.length) {
        }

        if (q) {
            bbox = q.getBoundingClientRect();
            console.log("xpath_element_scraper: Got filter element, scroll from top was "+scroll_y)
        } else {
            console.log("xpath_element_scraper: filter element "+f+" was not found");
            // #1231 - IN the case XPath attribute filter is applied, we will have to traverse up and find the element.
            if (q.hasOwnProperty('getBoundingClientRect')) {
                bbox = q.getBoundingClientRect();
                console.log("xpath_element_scraper: Got filter element, scroll from top was " + scroll_y)
            } else {
                try {
                    // Try and see we can find its ownerElement
                    bbox = q.ownerElement.getBoundingClientRect();
                    console.log("xpath_element_scraper: Got filter by ownerElement element, scroll from top was " + scroll_y)
                } catch (e) {
                    console.log("xpath_element_scraper: error looking up ownerElement")
                }
            }
        }

        if(!q) {
            console.log("xpath_element_scraper: filter element " + f + " was not found");
        }

        if (bbox && bbox['width'] > 0 && bbox['height'] > 0) {
```
```
@@ -394,6 +394,48 @@ def check_json_ext_filter(json_filter, client, live_server):
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data

def test_ignore_json_order(client, live_server):
    # A change in order shouldn't trigger a notification

    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write('{"hello" : 123, "world": 123}')


    # Add our URL to the import page
    test_url = url_for('test_endpoint', content_type="application/json", _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data

    time.sleep(2)

    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write('{"world" : 123, "hello": 123}')

    # Trigger a check
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    time.sleep(2)

    res = client.get(url_for("index"))
    assert b'unviewed' not in res.data

    # Just to be sure it still works
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write('{"world" : 123, "hello": 124}')

    # Trigger a check
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    time.sleep(2)

    res = client.get(url_for("index"))
    assert b'unviewed' in res.data

    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data

def test_check_jsonpath_ext_filter(client, live_server):
    check_json_ext_filter('json:$[?(@.status==Sold)]', client, live_server)
```
```
@@ -4,6 +4,7 @@ import queue
import time

from changedetectionio import content_fetcher
from changedetectionio import queuedWatchMetaData
from changedetectionio.fetch_site_status import FilterNotFoundInResponse

# A single update worker
@@ -157,11 +158,12 @@ class update_worker(threading.Thread):
        while not self.app.config.exit.is_set():

            try:
                priority, uuid = self.q.get(block=False)
                queued_item_data = self.q.get(block=False)
            except queue.Empty:
                pass

            else:
                uuid = queued_item_data.item.get('uuid')
                self.current_uuid = uuid

                if uuid in list(self.datastore.data['watching'].keys()):
@@ -171,11 +173,11 @@ class update_worker(threading.Thread):
                    update_obj= {}
                    xpath_data = False
                    process_changedetection_results = True
                    print("> Processing UUID {} Priority {} URL {}".format(uuid, priority, self.datastore.data['watching'][uuid]['url']))
                    print("> Processing UUID {} Priority {} URL {}".format(uuid, queued_item_data.priority, self.datastore.data['watching'][uuid]['url']))
                    now = time.time()

                    try:
                        changed_detected, update_obj, contents = update_handler.run(uuid)
                        changed_detected, update_obj, contents = update_handler.run(uuid, skip_when_checksum_same=queued_item_data.item.get('skip_when_checksum_same'))
                        # Re #342
                        # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
                        # We then convert/.decode('utf-8') for the notification etc
@@ -241,6 +243,10 @@ class update_worker(threading.Thread):

                        process_changedetection_results = True

                    except content_fetcher.checksumFromPreviousCheckWasTheSame as e:
                        # Yes fine, so nothing todo
                        pass

                    except content_fetcher.BrowserStepsStepTimout as e:

                        if not self.datastore.data['watching'].get(uuid):
```
```
@@ -29,8 +29,9 @@ apprise~=1.2.0
# apprise mqtt https://github.com/dgtlmoon/changedetection.io/issues/315
paho-mqtt

# Pinned version of cryptography otherwise
# ERROR: Could not build wheels for cryptography which use PEP 517 and cannot be installed directly
# This mainly affects some ARM builds, which unlike the other builds ignores "ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1"
# so without this pinning, the newer versions on ARM will forcefully try to build rust, which results in "rust compiler not found"
# (introduced once apprise became a dep)
cryptography~=3.4

# Used for CSS filtering
```