From 7f5bdd47aef97249a5d84ebd1d51f81eb4fc577d Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 10 Jul 2024 15:44:54 +0200 Subject: [PATCH] Make processors more cleanly separated --- .../blueprint/check_proxies/__init__.py | 2 +- changedetectionio/flask_app.py | 24 ++++-- changedetectionio/forms.py | 55 ------------- changedetectionio/model/Watch.py | 32 +++++++- changedetectionio/model/__init__.py | 24 ------ changedetectionio/processors/README.md | 6 +- changedetectionio/processors/__init__.py | 78 ++++++++++++++++++- .../processors/restock_diff/__init__.py | 44 +++++++++++ .../processors/restock_diff/forms.py | 61 +++++++++++++++ .../processor.py} | 10 +-- .../processors/text_json_diff/__init__.py | 0 .../processor.py} | 5 +- changedetectionio/store.py | 47 ++++------- .../tests/test_history_consistency.py | 5 ++ .../tests/test_ignore_regex_text.py | 2 - changedetectionio/tests/test_ignore_text.py | 3 - .../tests/test_restock_itemprop.py | 15 ++++ changedetectionio/update_worker.py | 29 +++---- 18 files changed, 287 insertions(+), 155 deletions(-) create mode 100644 changedetectionio/processors/restock_diff/__init__.py create mode 100644 changedetectionio/processors/restock_diff/forms.py rename changedetectionio/processors/{restock_diff.py => restock_diff/processor.py} (98%) create mode 100644 changedetectionio/processors/text_json_diff/__init__.py rename changedetectionio/processors/{text_json_diff.py => text_json_diff/processor.py} (99%) diff --git a/changedetectionio/blueprint/check_proxies/__init__.py b/changedetectionio/blueprint/check_proxies/__init__.py index 62a7dab3..8d7df73f 100644 --- a/changedetectionio/blueprint/check_proxies/__init__.py +++ b/changedetectionio/blueprint/check_proxies/__init__.py @@ -30,7 +30,7 @@ def construct_blueprint(datastore: ChangeDetectionStore): def long_task(uuid, preferred_proxy): import time from changedetectionio.content_fetchers import exceptions as content_fetcher_exceptions - from changedetectionio.processors import text_json_diff + from changedetectionio.processors.text_json_diff import text_json_diff from changedetectionio.safe_jinja import render as jinja_render status = {'status': '', 'length': 0, 'text': ''} diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index 49a70196..f533d6b8 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -8,6 +8,7 @@ import time from jinja2 import Template +from .processors import find_processors, get_parent_module from .safe_jinja import render as jinja_render from changedetectionio.strtobool import strtobool from copy import deepcopy @@ -623,6 +624,7 @@ def changedetection_app(config=None, datastore_o=None): from . import forms from .blueprint.browser_steps.browser_steps import browser_step_ui_config from . import processors + import importlib # More for testing, possible to return the first/only if not datastore.data['watching'].keys(): @@ -656,13 +658,21 @@ def changedetection_app(config=None, datastore_o=None): default['proxy'] = '' # proxy_override set to the json/text list of the items - processor = datastore.data['watching'][uuid].get('processor', '') - form_class_name = f"processor_{processor}_form" - try: - form_class = getattr(forms, form_class_name) - except AttributeError: - flash(f"Cannot load the edit form for processor/plugin '{processor}', plugin missing?", 'error') - return redirect(url_for('index')) + form_class = forms + + # Does it use some custom form? does one exist? + processor_name = datastore.data['watching'][uuid].get('processor', '') + custom_processor_class = next((tpl for tpl in find_processors() if tpl[1] == processor_name), None) + if custom_processor_class: + try: + # Get the parent of the "processor.py" go up one, get the form (kinda spaghetti but its reusing existing code) + parent_module = get_parent_module(custom_processor_class[0]) + forms_module = importlib.import_module(f"{parent_module.__name__}.forms") + # Access the 'processor_settings_form' class from the 'forms' module + form_class = getattr(forms_module, 'processor_settings_form') + except AttributeError as e: + flash(f"Cannot load the edit form for processor/plugin '{custom_processor_class[1]}', plugin missing?", 'error') + return redirect(url_for('index')) form = form_class(formdata=request.form if request.method == 'POST' else None, data=default diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index b01e1f25..2cefae90 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -1,8 +1,6 @@ import os import re -from wtforms.fields.numeric import FloatField - from changedetectionio.strtobool import strtobool from wtforms import ( @@ -522,59 +520,6 @@ class processor_text_json_diff_form(commonSettingsForm): result = False return result - -class processor_restock_diff_form(processor_text_json_diff_form): - in_stock_only = BooleanField('Only trigger when product goes BACK to in-stock', default=True) - price_change_min = FloatField('Minimum amount to trigger notification', [validators.Optional()], render_kw={"placeholder": "No limit", "size": "10"}) - price_change_max = FloatField('Maximum amount to trigger notification', [validators.Optional()], render_kw={"placeholder": "No limit", "size": "10"}) - price_change_threshold_percent = FloatField('Threshold in % for price changes', validators=[ - validators.Optional(), - validators.NumberRange(min=0, max=100, message="Should be between 0 and 100"), - ], render_kw={"placeholder": "0%", "size": "5"}) - - - follow_price_changes = BooleanField('Follow price changes', default=False) - - def extra_tab_content(self): - return 'Restock & Price Detection' - - def extra_form_content(self): - return """ - {% from '_helpers.html' import render_field, render_checkbox_field, render_button %} - - - -
-
-
- {{ render_checkbox_field(form.in_stock_only) }} - Only trigger notifications when page changes from out of stock to back in stock -
-
- {{ render_checkbox_field(form.follow_price_changes) }} - Changes in price should trigger a notification - When OFF - only care about restock detection -
-
- {{ render_field(form.price_change_min, placeholder=watch['restock']['price']) }} - Minimum amount, only trigger a change when the price is less than this amount. -
-
- {{ render_field(form.price_change_max, placeholder=watch['restock']['price']) }} - Maximum amount, only trigger a change when the price is more than this amount. -
-
- {{ render_field(form.price_change_threshold_percent) }} - Price must change more than this % to trigger a change.
- For example, If the product is $1,000 USD, 2% would mean it has to change more than $20 since the first check.
-
-
-
""" - class SingleExtraProxy(Form): # maybe better to set some + + +
+
+
+ {{ render_checkbox_field(form.in_stock_only) }} + Only trigger notifications when page changes from out of stock to back in stock +
+
+ {{ render_checkbox_field(form.follow_price_changes) }} + Changes in price should trigger a notification + When OFF - only care about restock detection +
+
+ {{ render_field(form.price_change_min, placeholder=watch['restock']['price']) }} + Minimum amount, only trigger a change when the price is less than this amount. +
+
+ {{ render_field(form.price_change_max, placeholder=watch['restock']['price']) }} + Maximum amount, only trigger a change when the price is more than this amount. +
+
+ {{ render_field(form.price_change_threshold_percent) }} + Price must change more than this % to trigger a change.
+ For example, If the product is $1,000 USD, 2% would mean it has to change more than $20 since the first check.
+
+
+
""" \ No newline at end of file diff --git a/changedetectionio/processors/restock_diff.py b/changedetectionio/processors/restock_diff/processor.py similarity index 98% rename from changedetectionio/processors/restock_diff.py rename to changedetectionio/processors/restock_diff/processor.py index 4cb19f49..e543aaca 100644 --- a/changedetectionio/processors/restock_diff.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -1,5 +1,5 @@ -from . import difference_detection_processor -from ..model import Restock +from .. import difference_detection_processor +from . import Restock from loguru import logger import hashlib import re @@ -7,11 +7,9 @@ import urllib3 import time urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - name = 'Re-stock & Price detection for single product pages' description = 'Detects if the product goes back to in-stock' - class UnableToExtractRestockData(Exception): def __init__(self, status_code): # Set this so we can use it in other parts of the app @@ -47,6 +45,7 @@ def get_itemprop_availability(html_content) -> Restock: logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s") # First phase, dead simple scanning of anything that looks useful + value = Restock() if data: logger.debug(f"Using jsonpath to find price/availability/etc") price_parse = parse('$..(price|Price|highPrice)') @@ -84,7 +83,7 @@ def get_itemprop_availability(html_content) -> Restock: logger.trace(f"Processed with Extruct in {time.time()-now:.3f}s") - return Restock(value) + return value def is_between(number, lower=None, upper=None): @@ -154,7 +153,6 @@ class perform_site_check(difference_detection_processor): # Main detection method fetched_md5 = None if self.fetcher.instock_data: - fetched_md5 = hashlib.md5(self.fetcher.instock_data.encode('utf-8')).hexdigest() # 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold. update_obj["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False logger.debug(f"Watch UUID {watch.get('uuid')} restock check returned '{self.fetcher.instock_data}' from JS scraper.") diff --git a/changedetectionio/processors/text_json_diff/__init__.py b/changedetectionio/processors/text_json_diff/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff/processor.py similarity index 99% rename from changedetectionio/processors/text_json_diff.py rename to changedetectionio/processors/text_json_diff/processor.py index 797b6c2b..3234c422 100644 --- a/changedetectionio/processors/text_json_diff.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -6,8 +6,8 @@ import os import re import urllib3 -from . import difference_detection_processor -from ..html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text +from changedetectionio.processors import difference_detection_processor +from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text from changedetectionio import html_tools, content_fetchers from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT from loguru import logger @@ -16,6 +16,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) name = 'Webpage Text/HTML, JSON and PDF changes' description = 'Detects all text changes where possible' + json_filter_prefixes = ['json:', 'jq:', 'jqraw:'] class FilterNotFoundInResponse(ValueError): diff --git a/changedetectionio/store.py b/changedetectionio/store.py index ac33c2c1..d423f3ee 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -18,6 +18,8 @@ import time import uuid as uuid_builder from loguru import logger +from .processors import get_custom_watch_obj_for_processor + # Because the server will run as a daemon and wont know the URL for notification links when firing off a notification BASE_URL_NOT_SET_TEXT = '("Base URL" not set - see settings - notifications)' @@ -80,9 +82,15 @@ class ChangeDetectionStore: self.__data['settings']['application'].update(from_disk['settings']['application']) # Convert each existing watch back to the Watch.model object + for uuid, watch in self.__data['watching'].items(): - watch['uuid']=uuid - self.__data['watching'][uuid] = Watch.model(datastore_path=self.datastore_path, default=watch) + watch['uuid'] = uuid + watch_class = get_custom_watch_obj_for_processor(watch.get('processor')) + if watch.get('uuid') != 'text_json_diff': + logger.trace(f"Loading Watch object '{watch_class.__module__}.{watch_class.__name__}' for UUID {uuid}") + + self.__data['watching'][uuid] = watch_class(datastore_path=self.datastore_path, default=watch) + logger.info(f"Watching: {uuid} {self.__data['watching'][uuid]['url']}") # First time ran, Create the datastore. @@ -240,34 +248,7 @@ class ChangeDetectionStore: # Remove a watchs data but keep the entry (URL etc) def clear_watch_history(self, uuid): - import pathlib - from .model import Restock - - # JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc - for item in pathlib.Path(os.path.join(self.datastore_path, uuid)).rglob("*.*"): - unlink(item) - - # Force the attr to recalculate - bump = self.__data['watching'][uuid].history - - # Do this last because it will trigger a recheck due to last_checked being zero - self.__data['watching'][uuid].update({ - 'browser_steps_last_error_step' : None, - 'check_count': 0, - 'fetch_time' : 0.0, - 'has_ldjson_price_data': None, - 'in_stock': None, - 'last_checked': 0, - 'last_error': False, - 'last_notification_error': False, - 'last_viewed': 0, - 'previous_md5': False, - 'previous_md5_before_filters': False, - 'remote_server_reply': None, - 'track_ldjson_price_data': None, - 'restock': Restock() - }) - + self.__data['watching'][uuid].clear_watch() self.needs_write_urgent = True def add_watch(self, url, tag='', extras=None, tag_uuids=None, write_to_disk_now=True): @@ -344,11 +325,13 @@ class ChangeDetectionStore: if apply_extras.get('tags'): apply_extras['tags'] = list(set(apply_extras.get('tags'))) - new_watch = Watch.model(datastore_path=self.datastore_path, url=url) + # If the processor also has its own Watch implementation + watch_class = get_custom_watch_obj_for_processor(apply_extras.get('processor')) + new_watch = watch_class(datastore_path=self.datastore_path, url=url) new_uuid = new_watch.get('uuid') - logger.debug(f"Adding URL {url} - {new_uuid}") + logger.debug(f"Adding URL '{url}' - {new_uuid}") for k in ['uuid', 'history', 'last_checked', 'last_changed', 'newest_history_key', 'previous_md5', 'viewed']: if k in apply_extras: diff --git a/changedetectionio/tests/test_history_consistency.py b/changedetectionio/tests/test_history_consistency.py index c20aa34b..96287887 100644 --- a/changedetectionio/tests/test_history_consistency.py +++ b/changedetectionio/tests/test_history_consistency.py @@ -74,3 +74,8 @@ def test_consistent_history(client, live_server): assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot" + + + json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json') + with open(json_db_file, 'r') as f: + assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved" diff --git a/changedetectionio/tests/test_ignore_regex_text.py b/changedetectionio/tests/test_ignore_regex_text.py index 45f73392..f480c91b 100644 --- a/changedetectionio/tests/test_ignore_regex_text.py +++ b/changedetectionio/tests/test_ignore_regex_text.py @@ -9,8 +9,6 @@ def test_setup(live_server): # Unit test of the stripper # Always we are dealing in utf-8 def test_strip_regex_text_func(): - from ..processors import text_json_diff as fetch_site_status - test_content = """ but sometimes we want to remove the lines. diff --git a/changedetectionio/tests/test_ignore_text.py b/changedetectionio/tests/test_ignore_text.py index 5d6d7149..4edead7c 100644 --- a/changedetectionio/tests/test_ignore_text.py +++ b/changedetectionio/tests/test_ignore_text.py @@ -11,9 +11,6 @@ def test_setup(live_server): # Unit test of the stripper # Always we are dealing in utf-8 def test_strip_text_func(): - from ..processors import text_json_diff as fetch_site_status - - test_content = """ Some content is listed here diff --git a/changedetectionio/tests/test_restock_itemprop.py b/changedetectionio/tests/test_restock_itemprop.py index 09125754..5279bbbe 100644 --- a/changedetectionio/tests/test_restock_itemprop.py +++ b/changedetectionio/tests/test_restock_itemprop.py @@ -286,4 +286,19 @@ def test_data_sanity(client, live_server): res = client.get(url_for("index")) assert str(res.data.decode()).count("950.95") == 1, "Price should only show once (for the watch added, no other watches yet)" + ## different test, check the edit page works on an empty request result + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + client.post( + url_for("form_quick_watch_add"), + data={"url": test_url2, "tags": 'restock tests', 'processor': 'restock_diff'}, + follow_redirects=True + ) + wait_for_all_checks(client) + + res = client.get( + url_for("edit_page", uuid="first")) + assert test_url2.encode('utf-8') in res.data + + # @todo look at the url-watches and make sure there is not key called "default" !!! \ No newline at end of file diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 197b7d79..ed591ac6 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -1,8 +1,9 @@ from . import content_fetchers -from .processors.restock_diff import UnableToExtractRestockData -from .processors.text_json_diff import FilterNotFoundInResponse +from .processors.restock_diff.processor import UnableToExtractRestockData +from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse from changedetectionio import html_tools -from copy import deepcopy + +import importlib import os import queue import threading @@ -13,7 +14,6 @@ import time # Requests for checking on a single site(watch) from a queue of watches # (another process inserts watches into the queue that are time-ready for checking) -import sys from loguru import logger class update_worker(threading.Thread): @@ -226,8 +226,6 @@ class update_worker(threading.Thread): os.unlink(full_path) def run(self): - - from .processors import text_json_diff, restock_diff now = time.time() while not self.app.config.exit.is_set(): @@ -258,24 +256,21 @@ class update_worker(threading.Thread): try: # Processor is what we are using for detecting the "Change" processor = watch.get('processor', 'text_json_diff') - # if system... - # Abort processing when the content was the same as the last fetch skip_when_same_checksum = queued_item_data.item.get('skip_when_checksum_same') - # @todo some way to switch by name - # Init a new 'difference_detection_processor' + # Init a new 'difference_detection_processor', first look in processors + processor_module_name = f"changedetectionio.processors.{processor}.processor" + try: + processor_module = importlib.import_module(processor_module_name) + except ModuleNotFoundError as e: + print(f"Processor module '{processor}' not found.") + raise e - if processor == 'restock_diff': - update_handler = restock_diff.perform_site_check(datastore=self.datastore, + update_handler = processor_module.perform_site_check(datastore=self.datastore, watch_uuid=uuid ) - else: - # Used as a default and also by some tests - update_handler = text_json_diff.perform_site_check(datastore=self.datastore, - watch_uuid=uuid - ) update_handler.call_browser()