Compare commits

...

6 Commits

Author     SHA1         Message                                                      Date
dgtlmoon   f35d91e4fb   Cleaner history suffix handling                              2022-08-31 19:22:26 +02:00
dgtlmoon   687cf9beb4   More tidyup                                                  2022-08-31 18:11:18 +02:00
dgtlmoon   f59b198ffb   fetch right class                                            2022-08-31 18:02:22 +02:00
dgtlmoon   518bdf5a3f   move this                                                    2022-08-31 18:00:53 +02:00
dgtlmoon   dcd09359eb   cleanup                                                      2022-08-31 17:57:02 +02:00
dgtlmoon   425f8ea632   Abstract out the fetch handlers for different fetch types   2022-08-31 17:52:32 +02:00
9 changed files with 65 additions and 59 deletions

View File

@@ -500,7 +500,7 @@ def changedetection_app(config=None, datastore_o=None):
        import hashlib
-        from changedetectionio import fetch_site_status
+        from .fetch_processor import json_html_plaintext

        # Get the most recent one
        newest_history_key = datastore.data['watching'][uuid].get('newest_history_key')
@@ -514,7 +514,7 @@ def changedetection_app(config=None, datastore_o=None):
                      encoding='utf-8') as file:
                raw_content = file.read()

-            handler = fetch_site_status.perform_site_check(datastore=datastore)
+            handler = json_html_plaintext.perform_site_check(datastore=datastore)
            stripped_content = html_tools.strip_ignore_text(raw_content,
                                                            datastore.data['watching'][uuid]['ignore_text'])

View File

@@ -0,0 +1,41 @@
class fetch_processor():
    contents = b''
    screenshot = None
    history_artifact_suffix = 'txt'

    """
    base class for all fetch processors
    - json_html_plaintext
    - image (future)
    """

    def __init__(self, *args, datastore, **kwargs):
        super().__init__(*args, **kwargs)
        self.datastore = datastore

    # If there was a proxy list enabled, figure out what proxy_args/which proxy to use
    # if watch.proxy use that
    # fetcher.proxy_override = watch.proxy or main config proxy
    # Allows override the proxy on a per-request basis
    # ALWAYS use the first one is nothing selected
    def set_proxy_from_list(self, watch):
        proxy_args = None
        if self.datastore.proxy_list is None:
            return None

        # If its a valid one
        if any([watch['proxy'] in p for p in self.datastore.proxy_list]):
            proxy_args = watch['proxy']

        # not valid (including None), try the system one
        else:
            system_proxy = self.datastore.data['settings']['requests']['proxy']
            # Is not None and exists
            if any([system_proxy in p for p in self.datastore.proxy_list]):
                proxy_args = system_proxy

        # Fallback - Did not resolve anything, use the first available
        if proxy_args is None:
            proxy_args = self.datastore.proxy_list[0][0]

        return proxy_args
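
For orientation, a minimal sketch of how a concrete processor is meant to build on this base class, assuming the fetch_processor class above is in scope. The my_processor subclass and FakeStore stand-in are hypothetical (the real datastore and the json_html_plaintext processor carry much more state); they only exercise the proxy-resolution logic defined above.

# Illustrative only: a stand-in datastore exposing just the fields
# that set_proxy_from_list() reads.
class FakeStore:
    proxy_list = [('socks5://proxy-a:1080', 'Proxy A'),
                  ('socks5://proxy-b:1080', 'Proxy B')]
    data = {'settings': {'requests': {'proxy': 'socks5://proxy-b:1080'}}}

class my_processor(fetch_processor):
    # A real processor (e.g. json_html_plaintext) fetches the page here,
    # stores the body in self.contents and returns (changed_detected, update_obj)
    def run(self, uuid):
        self.contents = b'fetched body'
        return False, {}

handler = my_processor(datastore=FakeStore())
print(handler.set_proxy_from_list({'proxy': 'socks5://proxy-a:1080'}))  # watch-level proxy wins
print(handler.set_proxy_from_list({'proxy': None}))                     # falls back to the system proxy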

View File

@@ -9,45 +9,14 @@ from changedetectionio import content_fetcher, html_tools
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

+from . import fetch_processor

-# Some common stuff here that can be moved to a base class
-# (set_proxy_from_list)
-class perform_site_check():
-    screenshot = None
+class perform_site_check(fetch_processor):
    xpath_data = None

-    def __init__(self, *args, datastore, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.datastore = datastore
-
-    # If there was a proxy list enabled, figure out what proxy_args/which proxy to use
-    # if watch.proxy use that
-    # fetcher.proxy_override = watch.proxy or main config proxy
-    # Allows override the proxy on a per-request basis
-    # ALWAYS use the first one is nothing selected
-    def set_proxy_from_list(self, watch):
-        proxy_args = None
-        if self.datastore.proxy_list is None:
-            return None
-
-        # If its a valid one
-        if any([watch['proxy'] in p for p in self.datastore.proxy_list]):
-            proxy_args = watch['proxy']
-
-        # not valid (including None), try the system one
-        else:
-            system_proxy = self.datastore.data['settings']['requests']['proxy']
-            # Is not None and exists
-            if any([system_proxy in p for p in self.datastore.proxy_list]):
-                proxy_args = system_proxy
-
-        # Fallback - Did not resolve anything, use the first available
-        if proxy_args is None:
-            proxy_args = self.datastore.proxy_list[0][0]
-
-        return proxy_args

    # Doesn't look like python supports forward slash auto enclosure in re.findall
    # So convert it to inline flag "foobar(?i)" type configuration
    def forward_slash_enclosed_regex_to_options(self, regex):
@@ -315,4 +284,6 @@ class perform_site_check():
        if not watch.get('previous_md5'):
            watch['previous_md5'] = fetched_md5

-        return changed_detected, update_obj, text_content_before_ignored_filter
+        self.contents = text_content_before_ignored_filter
+        return changed_detected, update_obj

View File

@@ -148,9 +148,7 @@ class model(dict):
        bump = self.history
        return self.__newest_history_key

-    # Save some text file to the appropriate path and bump the history
-    # result_obj from fetch_site_status.run()
-    def save_history_text(self, contents, timestamp):
+    def save_history_artifact(self, contents: bytes, timestamp, suffix='txt'):
        import uuid
        import logging
@@ -158,8 +156,8 @@ class model(dict):
        self.ensure_data_dir_exists()

-        snapshot_fname = "{}/{}.stripped.txt".format(output_path, uuid.uuid4())
-        logging.debug("Saving history text {}".format(snapshot_fname))
+        snapshot_fname = "{}/{}.{}".format(output_path, uuid.uuid4(), suffix)
+        logging.debug("Saving history artifact {}".format(snapshot_fname))

        with open(snapshot_fname, 'wb') as f:
            f.write(contents)
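
A rough, self-contained sketch of what the renamed helper now does with its suffix parameter. This is an illustration, not the actual model method: the real save_history_artifact also records the snapshot against the timestamp in the watch's history index.

import os
import uuid

def save_history_artifact(output_path, contents: bytes, suffix='txt'):
    # Write one snapshot as <random-uuid>.<suffix> and return its path
    os.makedirs(output_path, exist_ok=True)
    snapshot_fname = "{}/{}.{}".format(output_path, uuid.uuid4(), suffix)
    with open(snapshot_fname, 'wb') as f:
        f.write(contents)
    return snapshot_fname

# The plaintext processor keeps the old behaviour via suffix='txt';
# a future image processor could pass e.g. suffix='png'.
print(save_history_artifact('/tmp/watch-data', b'page text here'))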

View File

@@ -47,7 +47,6 @@ def set_modified_response():
# Test that the CSS extraction works how we expect, important here is the right placing of new lines \n's
def test_css_filter_output():
-    from changedetectionio import fetch_site_status
    from inscriptis import get_text

    # Check text with sub-parts renders correctly

View File

@@ -71,7 +71,6 @@ def set_modified_response():
def test_element_removal_output():
-    from changedetectionio import fetch_site_status
    from inscriptis import get_text

    # Check text with sub-parts renders correctly

View File

@@ -1,7 +1,5 @@
#!/usr/bin/python3
import time
from flask import url_for
from . util import live_server_setup
from changedetectionio import html_tools
@@ -11,7 +9,7 @@ def test_setup(live_server):
# Unit test of the stripper
# Always we are dealing in utf-8
def test_strip_regex_text_func():
-    from changedetectionio import fetch_site_status
+    from ..fetch_processor import json_html_plaintext

    test_content = """
but sometimes we want to remove the lines.
@@ -23,7 +21,7 @@ def test_strip_regex_text_func():
    ignore_lines = ["sometimes", "/\s\d{2,3}\s/", "/ignore-case text/"]

-    fetcher = fetch_site_status.perform_site_check(datastore=False)
+    fetcher = json_html_plaintext.perform_site_check(datastore=False)
    stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)

    assert b"but 1 lines" in stripped_content

View File

@@ -11,7 +11,7 @@ def test_setup(live_server):
# Unit test of the stripper
# Always we are dealing in utf-8
def test_strip_text_func():
-    from changedetectionio import fetch_site_status
+    from ..fetch_processor import json_html_plaintext

    test_content = """
Some content
@@ -23,7 +23,9 @@ def test_strip_text_func():
    ignore_lines = ["sometimes"]

-    fetcher = fetch_site_status.perform_site_check(datastore=False)
+    from ..fetch_processor import json_html_plaintext
+    fetcher = json_html_plaintext.perform_site_check(datastore=False)
    stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)

    assert b"sometimes" not in stripped_content

View File

@@ -117,9 +117,9 @@ class update_worker(threading.Thread):
            os.unlink(full_path)

    def run(self):
-        from changedetectionio import fetch_site_status
+        from .fetch_processor import json_html_plaintext

-        update_handler = fetch_site_status.perform_site_check(datastore=self.datastore)

        while not self.app.config.exit.is_set():
@@ -132,21 +132,20 @@ class update_worker(threading.Thread):
                self.current_uuid = uuid

                if uuid in list(self.datastore.data['watching'].keys()):

+                    update_handler = None # Interface object
                    changed_detected = False
-                    contents = b''
-                    screenshot = False
                    update_obj= {}
-                    xpath_data = False
                    process_changedetection_results = True

                    print("> Processing UUID {} Priority {} URL {}".format(uuid, priority, self.datastore.data['watching'][uuid]['url']))
                    now = time.time()

                    try:
-                        changed_detected, update_obj, contents = update_handler.run(uuid)
+                        update_handler = json_html_plaintext.perform_site_check(datastore=self.datastore)
+                        changed_detected, update_obj = update_handler.run(uuid)
                        # Re #342
                        # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
                        # We then convert/.decode('utf-8') for the notification etc
-                        if not isinstance(contents, (bytes, bytearray)):
+                        if not isinstance(update_handler.contents, (bytes, bytearray)):
                            raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
                    except PermissionError as e:
                        self.app.logger.error("File permission error updating", uuid, str(e))
@@ -251,12 +250,11 @@ class update_worker(threading.Thread):
                    if process_changedetection_results:
                        try:
                            watch = self.datastore.data['watching'][uuid]

-                            fname = "" # Saved history text filename

                            # For the FIRST time we check a site, or a change detected, save the snapshot.
                            if changed_detected or not watch['last_checked']:
                                # A change was detected
-                                watch.save_history_text(contents=contents, timestamp=str(round(time.time())))
+                                watch.save_history_artifact(contents=update_handler.contents, timestamp=str(round(time.time())), suffix=update_handler.history_artifact_suffix)

                            self.datastore.update_watch(uuid=uuid, update_obj=update_obj)
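
Taken together, the per-check flow on the worker side now looks roughly like this. It is a condensed sketch (error handling, screenshots and notifications omitted); the import is the absolute form of the relative import used in this diff, and check_watch is a hypothetical wrapper, not a function in the codebase.

from changedetectionio.fetch_processor import json_html_plaintext

def check_watch(datastore, watch, uuid, now):
    # One handler per check; the concrete processor class decides how to fetch
    update_handler = json_html_plaintext.perform_site_check(datastore=datastore)
    changed_detected, update_obj = update_handler.run(uuid)

    # The fetched body now lives on the handler rather than in the return value
    if not isinstance(update_handler.contents, (bytes, bytearray)):
        raise Exception("Error - returned data from the fetch handler SHOULD be bytes")

    # Snapshot suffix comes from the processor ('txt' for json_html_plaintext)
    if changed_detected or not watch['last_checked']:
        watch.save_history_artifact(contents=update_handler.contents,
                                    timestamp=str(round(now)),
                                    suffix=update_handler.history_artifact_suffix)

    datastore.update_watch(uuid=uuid, update_obj=update_obj)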