mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-10-30 14:17:40 +00:00
Compare commits
6 Commits
1646-clone
...
fetchers-a
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f35d91e4fb | ||
|
|
687cf9beb4 | ||
|
|
f59b198ffb | ||
|
|
518bdf5a3f | ||
|
|
dcd09359eb | ||
|
|
425f8ea632 |
@@ -500,7 +500,7 @@ def changedetection_app(config=None, datastore_o=None):
|
||||
|
||||
import hashlib
|
||||
|
||||
from changedetectionio import fetch_site_status
|
||||
from .fetch_processor import json_html_plaintext
|
||||
|
||||
# Get the most recent one
|
||||
newest_history_key = datastore.data['watching'][uuid].get('newest_history_key')
|
||||
@@ -514,7 +514,7 @@ def changedetection_app(config=None, datastore_o=None):
|
||||
encoding='utf-8') as file:
|
||||
raw_content = file.read()
|
||||
|
||||
handler = fetch_site_status.perform_site_check(datastore=datastore)
|
||||
handler = json_html_plaintext.perform_site_check(datastore=datastore)
|
||||
stripped_content = html_tools.strip_ignore_text(raw_content,
|
||||
datastore.data['watching'][uuid]['ignore_text'])
|
||||
|
||||
|
||||
41
changedetectionio/fetch_processor/__init__.py
Normal file
41
changedetectionio/fetch_processor/__init__.py
Normal file
@@ -0,0 +1,41 @@
|
||||
class fetch_processor():
|
||||
contents = b''
|
||||
screenshot = None
|
||||
history_artifact_suffix = 'txt'
|
||||
|
||||
"""
|
||||
base class for all fetch processors
|
||||
- json_html_plaintext
|
||||
- image (future)
|
||||
"""
|
||||
def __init__(self, *args, datastore, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.datastore = datastore
|
||||
|
||||
# If there was a proxy list enabled, figure out what proxy_args/which proxy to use
|
||||
# if watch.proxy use that
|
||||
# fetcher.proxy_override = watch.proxy or main config proxy
|
||||
# Allows override the proxy on a per-request basis
|
||||
# ALWAYS use the first one is nothing selected
|
||||
|
||||
def set_proxy_from_list(self, watch):
|
||||
proxy_args = None
|
||||
if self.datastore.proxy_list is None:
|
||||
return None
|
||||
|
||||
# If its a valid one
|
||||
if any([watch['proxy'] in p for p in self.datastore.proxy_list]):
|
||||
proxy_args = watch['proxy']
|
||||
|
||||
# not valid (including None), try the system one
|
||||
else:
|
||||
system_proxy = self.datastore.data['settings']['requests']['proxy']
|
||||
# Is not None and exists
|
||||
if any([system_proxy in p for p in self.datastore.proxy_list]):
|
||||
proxy_args = system_proxy
|
||||
|
||||
# Fallback - Did not resolve anything, use the first available
|
||||
if proxy_args is None:
|
||||
proxy_args = self.datastore.proxy_list[0][0]
|
||||
|
||||
return proxy_args
|
||||
@@ -9,45 +9,14 @@ from changedetectionio import content_fetcher, html_tools
|
||||
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
from . import fetch_processor
|
||||
|
||||
# Some common stuff here that can be moved to a base class
|
||||
# (set_proxy_from_list)
|
||||
class perform_site_check():
|
||||
screenshot = None
|
||||
class perform_site_check(fetch_processor):
|
||||
|
||||
xpath_data = None
|
||||
|
||||
def __init__(self, *args, datastore, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.datastore = datastore
|
||||
|
||||
# If there was a proxy list enabled, figure out what proxy_args/which proxy to use
|
||||
# if watch.proxy use that
|
||||
# fetcher.proxy_override = watch.proxy or main config proxy
|
||||
# Allows override the proxy on a per-request basis
|
||||
# ALWAYS use the first one is nothing selected
|
||||
|
||||
def set_proxy_from_list(self, watch):
|
||||
proxy_args = None
|
||||
if self.datastore.proxy_list is None:
|
||||
return None
|
||||
|
||||
# If its a valid one
|
||||
if any([watch['proxy'] in p for p in self.datastore.proxy_list]):
|
||||
proxy_args = watch['proxy']
|
||||
|
||||
# not valid (including None), try the system one
|
||||
else:
|
||||
system_proxy = self.datastore.data['settings']['requests']['proxy']
|
||||
# Is not None and exists
|
||||
if any([system_proxy in p for p in self.datastore.proxy_list]):
|
||||
proxy_args = system_proxy
|
||||
|
||||
# Fallback - Did not resolve anything, use the first available
|
||||
if proxy_args is None:
|
||||
proxy_args = self.datastore.proxy_list[0][0]
|
||||
|
||||
return proxy_args
|
||||
|
||||
# Doesn't look like python supports forward slash auto enclosure in re.findall
|
||||
# So convert it to inline flag "foobar(?i)" type configuration
|
||||
def forward_slash_enclosed_regex_to_options(self, regex):
|
||||
@@ -315,4 +284,6 @@ class perform_site_check():
|
||||
if not watch.get('previous_md5'):
|
||||
watch['previous_md5'] = fetched_md5
|
||||
|
||||
return changed_detected, update_obj, text_content_before_ignored_filter
|
||||
self.contents = text_content_before_ignored_filter
|
||||
|
||||
return changed_detected, update_obj
|
||||
@@ -148,9 +148,7 @@ class model(dict):
|
||||
bump = self.history
|
||||
return self.__newest_history_key
|
||||
|
||||
# Save some text file to the appropriate path and bump the history
|
||||
# result_obj from fetch_site_status.run()
|
||||
def save_history_text(self, contents, timestamp):
|
||||
def save_history_artifact(self, contents: bytes, timestamp, suffix='txt'):
|
||||
import uuid
|
||||
import logging
|
||||
|
||||
@@ -158,8 +156,8 @@ class model(dict):
|
||||
|
||||
self.ensure_data_dir_exists()
|
||||
|
||||
snapshot_fname = "{}/{}.stripped.txt".format(output_path, uuid.uuid4())
|
||||
logging.debug("Saving history text {}".format(snapshot_fname))
|
||||
snapshot_fname = "{}/{}.{}".format(output_path, uuid.uuid4(), suffix)
|
||||
logging.debug("Saving history artifact {}".format(snapshot_fname))
|
||||
|
||||
with open(snapshot_fname, 'wb') as f:
|
||||
f.write(contents)
|
||||
|
||||
@@ -47,7 +47,6 @@ def set_modified_response():
|
||||
|
||||
# Test that the CSS extraction works how we expect, important here is the right placing of new lines \n's
|
||||
def test_css_filter_output():
|
||||
from changedetectionio import fetch_site_status
|
||||
from inscriptis import get_text
|
||||
|
||||
# Check text with sub-parts renders correctly
|
||||
|
||||
@@ -71,7 +71,6 @@ def set_modified_response():
|
||||
|
||||
|
||||
def test_element_removal_output():
|
||||
from changedetectionio import fetch_site_status
|
||||
from inscriptis import get_text
|
||||
|
||||
# Check text with sub-parts renders correctly
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import time
|
||||
from flask import url_for
|
||||
from . util import live_server_setup
|
||||
from changedetectionio import html_tools
|
||||
|
||||
@@ -11,7 +9,7 @@ def test_setup(live_server):
|
||||
# Unit test of the stripper
|
||||
# Always we are dealing in utf-8
|
||||
def test_strip_regex_text_func():
|
||||
from changedetectionio import fetch_site_status
|
||||
from ..fetch_processor import json_html_plaintext
|
||||
|
||||
test_content = """
|
||||
but sometimes we want to remove the lines.
|
||||
@@ -23,7 +21,7 @@ def test_strip_regex_text_func():
|
||||
|
||||
ignore_lines = ["sometimes", "/\s\d{2,3}\s/", "/ignore-case text/"]
|
||||
|
||||
fetcher = fetch_site_status.perform_site_check(datastore=False)
|
||||
fetcher = json_html_plaintext.perform_site_check(datastore=False)
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)
|
||||
|
||||
assert b"but 1 lines" in stripped_content
|
||||
|
||||
@@ -11,7 +11,7 @@ def test_setup(live_server):
|
||||
# Unit test of the stripper
|
||||
# Always we are dealing in utf-8
|
||||
def test_strip_text_func():
|
||||
from changedetectionio import fetch_site_status
|
||||
from ..fetch_processor import json_html_plaintext
|
||||
|
||||
test_content = """
|
||||
Some content
|
||||
@@ -23,7 +23,9 @@ def test_strip_text_func():
|
||||
|
||||
ignore_lines = ["sometimes"]
|
||||
|
||||
fetcher = fetch_site_status.perform_site_check(datastore=False)
|
||||
from ..fetch_processor import json_html_plaintext
|
||||
|
||||
fetcher = json_html_plaintext.perform_site_check(datastore=False)
|
||||
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)
|
||||
|
||||
assert b"sometimes" not in stripped_content
|
||||
|
||||
@@ -117,9 +117,9 @@ class update_worker(threading.Thread):
|
||||
os.unlink(full_path)
|
||||
|
||||
def run(self):
|
||||
from changedetectionio import fetch_site_status
|
||||
from .fetch_processor import json_html_plaintext
|
||||
|
||||
|
||||
update_handler = fetch_site_status.perform_site_check(datastore=self.datastore)
|
||||
|
||||
while not self.app.config.exit.is_set():
|
||||
|
||||
@@ -132,21 +132,20 @@ class update_worker(threading.Thread):
|
||||
self.current_uuid = uuid
|
||||
|
||||
if uuid in list(self.datastore.data['watching'].keys()):
|
||||
update_handler = None # Interface object
|
||||
changed_detected = False
|
||||
contents = b''
|
||||
screenshot = False
|
||||
update_obj= {}
|
||||
xpath_data = False
|
||||
process_changedetection_results = True
|
||||
print("> Processing UUID {} Priority {} URL {}".format(uuid, priority, self.datastore.data['watching'][uuid]['url']))
|
||||
now = time.time()
|
||||
|
||||
try:
|
||||
changed_detected, update_obj, contents = update_handler.run(uuid)
|
||||
update_handler = json_html_plaintext.perform_site_check(datastore=self.datastore)
|
||||
changed_detected, update_obj = update_handler.run(uuid)
|
||||
# Re #342
|
||||
# In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
|
||||
# We then convert/.decode('utf-8') for the notification etc
|
||||
if not isinstance(contents, (bytes, bytearray)):
|
||||
if not isinstance(update_handler.contents, (bytes, bytearray)):
|
||||
raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
|
||||
except PermissionError as e:
|
||||
self.app.logger.error("File permission error updating", uuid, str(e))
|
||||
@@ -251,12 +250,11 @@ class update_worker(threading.Thread):
|
||||
if process_changedetection_results:
|
||||
try:
|
||||
watch = self.datastore.data['watching'][uuid]
|
||||
fname = "" # Saved history text filename
|
||||
|
||||
# For the FIRST time we check a site, or a change detected, save the snapshot.
|
||||
if changed_detected or not watch['last_checked']:
|
||||
# A change was detected
|
||||
watch.save_history_text(contents=contents, timestamp=str(round(time.time())))
|
||||
watch.save_history_artifact(contents=update_handler.contents, timestamp=str(round(time.time())), suffix=update_handler.history_artifact_suffix)
|
||||
|
||||
self.datastore.update_watch(uuid=uuid, update_obj=update_obj)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user