Compare commits

...

6 Commits

Author     SHA1         Message                                                      Date
dgtlmoon   f35d91e4fb   Cleaner history suffix handling                              2022-08-31 19:22:26 +02:00
dgtlmoon   687cf9beb4   More tidyup                                                  2022-08-31 18:11:18 +02:00
dgtlmoon   f59b198ffb   fetch right class                                            2022-08-31 18:02:22 +02:00
dgtlmoon   518bdf5a3f   move this                                                    2022-08-31 18:00:53 +02:00
dgtlmoon   dcd09359eb   cleanup                                                      2022-08-31 17:57:02 +02:00
dgtlmoon   425f8ea632   Abstract out the fetch handlers for different fetch types   2022-08-31 17:52:32 +02:00
9 changed files with 65 additions and 59 deletions

View File

@@ -500,7 +500,7 @@ def changedetection_app(config=None, datastore_o=None):
        import hashlib
-        from changedetectionio import fetch_site_status
+        from .fetch_processor import json_html_plaintext

        # Get the most recent one
        newest_history_key = datastore.data['watching'][uuid].get('newest_history_key')
@@ -514,7 +514,7 @@ def changedetection_app(config=None, datastore_o=None):
                      encoding='utf-8') as file:
                raw_content = file.read()

-            handler = fetch_site_status.perform_site_check(datastore=datastore)
+            handler = json_html_plaintext.perform_site_check(datastore=datastore)
            stripped_content = html_tools.strip_ignore_text(raw_content,
                                                            datastore.data['watching'][uuid]['ignore_text'])

View File

@@ -0,0 +1,41 @@
class fetch_processor():
    contents = b''
    screenshot = None
    history_artifact_suffix = 'txt'

    """
    base class for all fetch processors
    - json_html_plaintext
    - image (future)
    """

    def __init__(self, *args, datastore, **kwargs):
        super().__init__(*args, **kwargs)
        self.datastore = datastore

    # If there was a proxy list enabled, figure out what proxy_args/which proxy to use
    # if watch.proxy use that
    # fetcher.proxy_override = watch.proxy or main config proxy
    # Allows override the proxy on a per-request basis
    # ALWAYS use the first one is nothing selected
    def set_proxy_from_list(self, watch):
        proxy_args = None
        if self.datastore.proxy_list is None:
            return None

        # If its a valid one
        if any([watch['proxy'] in p for p in self.datastore.proxy_list]):
            proxy_args = watch['proxy']

        # not valid (including None), try the system one
        else:
            system_proxy = self.datastore.data['settings']['requests']['proxy']
            # Is not None and exists
            if any([system_proxy in p for p in self.datastore.proxy_list]):
                proxy_args = system_proxy

        # Fallback - Did not resolve anything, use the first available
        if proxy_args is None:
            proxy_args = self.datastore.proxy_list[0][0]

        return proxy_args
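
For orientation, a minimal sketch of how a concrete processor is meant to build on this base class, assuming the fetch_processor class above is in scope. The my_processor subclass and FakeStore stand-in are hypothetical (the real datastore and the json_html_plaintext processor carry much more state); they only exercise the proxy-resolution logic defined above.

# Illustrative only: a stand-in datastore exposing just the fields
# that set_proxy_from_list() reads.
class FakeStore:
    proxy_list = [('socks5://proxy-a:1080', 'Proxy A'),
                  ('socks5://proxy-b:1080', 'Proxy B')]
    data = {'settings': {'requests': {'proxy': 'socks5://proxy-b:1080'}}}

class my_processor(fetch_processor):
    # A real processor (e.g. json_html_plaintext) fetches the page here,
    # stores the body in self.contents and returns (changed_detected, update_obj)
    def run(self, uuid):
        self.contents = b'fetched body'
        return False, {}

handler = my_processor(datastore=FakeStore())
print(handler.set_proxy_from_list({'proxy': 'socks5://proxy-a:1080'}))  # watch-level proxy wins
print(handler.set_proxy_from_list({'proxy': None}))                     # falls back to the system proxy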

View File

@@ -9,45 +9,14 @@ from changedetectionio import content_fetcher, html_tools
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

+from . import fetch_processor

-# Some common stuff here that can be moved to a base class
-# (set_proxy_from_list)
-class perform_site_check():
-    screenshot = None
+class perform_site_check(fetch_processor):
    xpath_data = None

-    def __init__(self, *args, datastore, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.datastore = datastore
-
-    # If there was a proxy list enabled, figure out what proxy_args/which proxy to use
-    # if watch.proxy use that
-    # fetcher.proxy_override = watch.proxy or main config proxy
-    # Allows override the proxy on a per-request basis
-    # ALWAYS use the first one is nothing selected
-    def set_proxy_from_list(self, watch):
-        proxy_args = None
-        if self.datastore.proxy_list is None:
-            return None
-
-        # If its a valid one
-        if any([watch['proxy'] in p for p in self.datastore.proxy_list]):
-            proxy_args = watch['proxy']
-
-        # not valid (including None), try the system one
-        else:
-            system_proxy = self.datastore.data['settings']['requests']['proxy']
-            # Is not None and exists
-            if any([system_proxy in p for p in self.datastore.proxy_list]):
-                proxy_args = system_proxy
-
-        # Fallback - Did not resolve anything, use the first available
-        if proxy_args is None:
-            proxy_args = self.datastore.proxy_list[0][0]
-
-        return proxy_args

    # Doesn't look like python supports forward slash auto enclosure in re.findall
    # So convert it to inline flag "foobar(?i)" type configuration
    def forward_slash_enclosed_regex_to_options(self, regex):
@@ -315,4 +284,6 @@ class perform_site_check():
        if not watch.get('previous_md5'):
            watch['previous_md5'] = fetched_md5

-        return changed_detected, update_obj, text_content_before_ignored_filter
+        self.contents = text_content_before_ignored_filter
+        return changed_detected, update_obj

View File

@@ -148,9 +148,7 @@ class model(dict):
        bump = self.history
        return self.__newest_history_key

-    # Save some text file to the appropriate path and bump the history
-    # result_obj from fetch_site_status.run()
-    def save_history_text(self, contents, timestamp):
+    def save_history_artifact(self, contents: bytes, timestamp, suffix='txt'):
        import uuid
        import logging
@@ -158,8 +156,8 @@ class model(dict):
        self.ensure_data_dir_exists()

-        snapshot_fname = "{}/{}.stripped.txt".format(output_path, uuid.uuid4())
-        logging.debug("Saving history text {}".format(snapshot_fname))
+        snapshot_fname = "{}/{}.{}".format(output_path, uuid.uuid4(), suffix)
+        logging.debug("Saving history artifact {}".format(snapshot_fname))

        with open(snapshot_fname, 'wb') as f:
            f.write(contents)
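
A rough, self-contained sketch of what the renamed helper now does with its suffix parameter. This is an illustration, not the actual model method: the real save_history_artifact also records the snapshot against the timestamp in the watch's history index.

import os
import uuid

def save_history_artifact(output_path, contents: bytes, suffix='txt'):
    # Write one snapshot as <random-uuid>.<suffix> and return its path
    os.makedirs(output_path, exist_ok=True)
    snapshot_fname = "{}/{}.{}".format(output_path, uuid.uuid4(), suffix)
    with open(snapshot_fname, 'wb') as f:
        f.write(contents)
    return snapshot_fname

# The plaintext processor keeps the old behaviour via suffix='txt';
# a future image processor could pass e.g. suffix='png'.
print(save_history_artifact('/tmp/watch-data', b'page text here'))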

View File

@@ -47,7 +47,6 @@ def set_modified_response():
# Test that the CSS extraction works how we expect, important here is the right placing of new lines \n's
def test_css_filter_output():
-    from changedetectionio import fetch_site_status
    from inscriptis import get_text

    # Check text with sub-parts renders correctly

View File

@@ -71,7 +71,6 @@ def set_modified_response():
def test_element_removal_output():
-    from changedetectionio import fetch_site_status
    from inscriptis import get_text

    # Check text with sub-parts renders correctly

View File

@@ -1,7 +1,5 @@
#!/usr/bin/python3
import time
from flask import url_for
from . util import live_server_setup
from changedetectionio import html_tools
@@ -11,7 +9,7 @@ def test_setup(live_server):
# Unit test of the stripper
# Always we are dealing in utf-8
def test_strip_regex_text_func():
-    from changedetectionio import fetch_site_status
+    from ..fetch_processor import json_html_plaintext

    test_content = """
but sometimes we want to remove the lines.
@@ -23,7 +21,7 @@ def test_strip_regex_text_func():
    ignore_lines = ["sometimes", "/\s\d{2,3}\s/", "/ignore-case text/"]

-    fetcher = fetch_site_status.perform_site_check(datastore=False)
+    fetcher = json_html_plaintext.perform_site_check(datastore=False)
    stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)

    assert b"but 1 lines" in stripped_content

View File

@@ -11,7 +11,7 @@ def test_setup(live_server):
# Unit test of the stripper
# Always we are dealing in utf-8
def test_strip_text_func():
-    from changedetectionio import fetch_site_status
+    from ..fetch_processor import json_html_plaintext

    test_content = """
Some content
@@ -23,7 +23,9 @@ def test_strip_text_func():
    ignore_lines = ["sometimes"]

-    fetcher = fetch_site_status.perform_site_check(datastore=False)
+    from ..fetch_processor import json_html_plaintext
+    fetcher = json_html_plaintext.perform_site_check(datastore=False)
    stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)

    assert b"sometimes" not in stripped_content

View File

@@ -117,9 +117,9 @@ class update_worker(threading.Thread):
            os.unlink(full_path)

    def run(self):
-        from changedetectionio import fetch_site_status
+        from .fetch_processor import json_html_plaintext

-        update_handler = fetch_site_status.perform_site_check(datastore=self.datastore)

        while not self.app.config.exit.is_set():
@@ -132,21 +132,20 @@ class update_worker(threading.Thread):
                self.current_uuid = uuid

                if uuid in list(self.datastore.data['watching'].keys()):

+                    update_handler = None # Interface object
                    changed_detected = False
-                    contents = b''
-                    screenshot = False
                    update_obj= {}
-                    xpath_data = False
                    process_changedetection_results = True

                    print("> Processing UUID {} Priority {} URL {}".format(uuid, priority, self.datastore.data['watching'][uuid]['url']))
                    now = time.time()

                    try:
-                        changed_detected, update_obj, contents = update_handler.run(uuid)
+                        update_handler = json_html_plaintext.perform_site_check(datastore=self.datastore)
+                        changed_detected, update_obj = update_handler.run(uuid)
                        # Re #342
                        # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
                        # We then convert/.decode('utf-8') for the notification etc
-                        if not isinstance(contents, (bytes, bytearray)):
+                        if not isinstance(update_handler.contents, (bytes, bytearray)):
                            raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
                    except PermissionError as e:
                        self.app.logger.error("File permission error updating", uuid, str(e))
@@ -251,12 +250,11 @@ class update_worker(threading.Thread):
                    if process_changedetection_results:
                        try:
                            watch = self.datastore.data['watching'][uuid]

-                            fname = "" # Saved history text filename

                            # For the FIRST time we check a site, or a change detected, save the snapshot.
                            if changed_detected or not watch['last_checked']:
                                # A change was detected
-                                watch.save_history_text(contents=contents, timestamp=str(round(time.time())))
+                                watch.save_history_artifact(contents=update_handler.contents, timestamp=str(round(time.time())), suffix=update_handler.history_artifact_suffix)

                            self.datastore.update_watch(uuid=uuid, update_obj=update_obj)
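
Taken together, the per-check flow on the worker side now looks roughly like this. It is a condensed sketch (error handling, screenshots and notifications omitted); the import is the absolute form of the relative import used in this diff, and check_watch is a hypothetical wrapper, not a function in the codebase.

from changedetectionio.fetch_processor import json_html_plaintext

def check_watch(datastore, watch, uuid, now):
    # One handler per check; the concrete processor class decides how to fetch
    update_handler = json_html_plaintext.perform_site_check(datastore=datastore)
    changed_detected, update_obj = update_handler.run(uuid)

    # The fetched body now lives on the handler rather than in the return value
    if not isinstance(update_handler.contents, (bytes, bytearray)):
        raise Exception("Error - returned data from the fetch handler SHOULD be bytes")

    # Snapshot suffix comes from the processor ('txt' for json_html_plaintext)
    if changed_detected or not watch['last_checked']:
        watch.save_history_artifact(contents=update_handler.contents,
                                    timestamp=str(round(now)),
                                    suffix=update_handler.history_artifact_suffix)

    datastore.update_watch(uuid=uuid, update_obj=update_obj)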