mirror of https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-22 17:36:09 +00:00

Compare commits: update-sel...0.49.17 (7 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | c162ec9d52 |  |
|  | bb7f7f473b |  |
|  | a9ca511004 |  |
|  | 8df61f5eaa |  |
|  | 162f573967 |  |
|  | eada0ef08d |  |
|  | f57bc10973 |  |
Version bump:

```diff
@@ -2,7 +2,7 @@
 # Read more https://github.com/dgtlmoon/changedetection.io/wiki

-__version__ = '0.49.15'
+__version__ = '0.49.17'

 from changedetectionio.strtobool import strtobool
 from json.decoder import JSONDecodeError
```
Browser-steps blueprint:

```diff
@@ -168,9 +168,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
         step_optional_value = request.form.get('optional_value')
         is_last_step = strtobool(request.form.get('is_last_step'))

-        # @todo try.. accept.. nice errors not popups..
         try:
-
             browsersteps_sessions[browsersteps_session_id]['browserstepper'].call_action(action_name=step_operation,
                                                                                          selector=step_selector,
                                                                                          optional_value=step_optional_value)
```
Browser steps interface: the `safe_page_operation` wrapper is removed and the actions call the Playwright page directly again:

```diff
@@ -62,23 +62,6 @@ class steppable_browser_interface():
     def __init__(self, start_url):
         self.start_url = start_url

-    def safe_page_operation(self, operation_fn, default_return=None):
-        """Safely execute a page operation with error handling"""
-        if self.page is None:
-            logger.warning("Attempted operation on None page object")
-            return default_return
-
-        try:
-            return operation_fn()
-        except Exception as e:
-            logger.debug(f"Page operation failed: {str(e)}")
-            # Try to reclaim memory if possible
-            try:
-                self.page.request_gc()
-            except:
-                pass
-            return default_return
-
     # Convert and perform "Click Button" for example
     def call_action(self, action_name, selector=None, optional_value=None):
         if self.page is None:
```
```diff
@@ -109,20 +92,11 @@ class steppable_browser_interface():
         if optional_value and ('{%' in optional_value or '{{' in optional_value):
             optional_value = jinja_render(template_str=optional_value)

-        try:
-            action_handler(selector, optional_value)
-            # Safely wait for timeout
-            def wait_timeout():
-                self.page.wait_for_timeout(1.5 * 1000)
-            self.safe_page_operation(wait_timeout)
-            logger.debug(f"Call action done in {time.time()-now:.2f}s")
-        except Exception as e:
-            logger.error(f"Error executing action '{call_action_name}': {str(e)}")
-            # Request garbage collection to free up resources after error
-            try:
-                self.page.request_gc()
-            except:
-                pass
+        action_handler(selector, optional_value)
+        # Safely wait for timeout
+        self.page.wait_for_timeout(1.5 * 1000)
+        logger.debug(f"Call action done in {time.time()-now:.2f}s")

     def action_goto_url(self, selector=None, value=None):
         if not value:
```
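These and the following hunks all revert one wrapper pattern: previously every action body was packed into a closure and run through `safe_page_operation`, which swallowed any exception and handed back a default. After the revert, failures propagate to `call_action()`'s caller, which can surface them. A minimal, self-contained model of the two shapes (names hypothetical, not the project's exact code):

```python
def safe_page_operation(operation_fn, default_return=None):
    """The removed wrapper: swallow any error and hand back a default."""
    try:
        return operation_fn()
    except Exception:
        return default_return

def click(selector):
    raise TimeoutError(f"element {selector!r} never appeared")

# Before: the failure is invisible to the caller
result = safe_page_operation(lambda: click("#buy-now"))
print(result)  # None -- the timeout was silently swallowed

# After: the same failure propagates and can be reported
try:
    click("#buy-now")
except TimeoutError as e:
    print(f"action failed: {e}")
```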
```diff
@@ -130,11 +104,7 @@ class steppable_browser_interface():
             return None

         now = time.time()
-        def goto_operation():
-            return self.page.goto(value, timeout=0, wait_until='load')
-
-        response = self.safe_page_operation(goto_operation)
+        response = self.page.goto(value, timeout=0, wait_until='load')
         logger.debug(f"Time to goto URL {time.time()-now:.2f}s")
         return response

```
```diff
@@ -147,61 +117,47 @@ class steppable_browser_interface():
         if not value or not len(value.strip()):
             return

-        def click_operation():
-            elem = self.page.get_by_text(value)
-            if elem.count():
-                elem.first.click(delay=randint(200, 500), timeout=self.action_timeout)
-
-        self.safe_page_operation(click_operation)
+        elem = self.page.get_by_text(value)
+        if elem.count():
+            elem.first.click(delay=randint(200, 500), timeout=self.action_timeout)

     def action_click_element_containing_text_if_exists(self, selector=None, value=''):
         logger.debug("Clicking element containing text if exists")
         if not value or not len(value.strip()):
             return

-        def click_if_exists_operation():
-            elem = self.page.get_by_text(value)
-            logger.debug(f"Clicking element containing text - {elem.count()} elements found")
-            if elem.count():
-                elem.first.click(delay=randint(200, 500), timeout=self.action_timeout)
-
-        self.safe_page_operation(click_if_exists_operation)
+        elem = self.page.get_by_text(value)
+        logger.debug(f"Clicking element containing text - {elem.count()} elements found")
+        if elem.count():
+            elem.first.click(delay=randint(200, 500), timeout=self.action_timeout)

     def action_enter_text_in_field(self, selector, value):
         if not selector or not len(selector.strip()):
             return

-        def fill_operation():
-            self.page.fill(selector, value, timeout=self.action_timeout)
-
-        self.safe_page_operation(fill_operation)
+        self.page.fill(selector, value, timeout=self.action_timeout)

     def action_execute_js(self, selector, value):
         if not value:
             return None

-        def evaluate_operation():
-            return self.page.evaluate(value)
-
-        return self.safe_page_operation(evaluate_operation)
+        return self.page.evaluate(value)

     def action_click_element(self, selector, value):
         logger.debug("Clicking element")
         if not selector or not len(selector.strip()):
             return

-        def click_operation():
-            self.page.click(selector=selector, timeout=self.action_timeout + 20 * 1000, delay=randint(200, 500))
-
-        self.safe_page_operation(click_operation)
+        self.page.click(selector=selector, timeout=self.action_timeout + 20 * 1000, delay=randint(200, 500))

     def action_click_element_if_exists(self, selector, value):
         import playwright._impl._errors as _api_types
         logger.debug("Clicking element if exists")
         if not selector or not len(selector.strip()):
             return

-        def click_if_exists_operation():
-            try:
-                self.page.click(selector, timeout=self.action_timeout, delay=randint(200, 500))
-            except _api_types.TimeoutError:
+        try:
+            self.page.click(selector, timeout=self.action_timeout, delay=randint(200, 500))
+        except _api_types.TimeoutError:
```
```diff
@@ -210,7 +166,6 @@ class steppable_browser_interface():
             # Element was there, but page redrew and now its long long gone
             return

-        self.safe_page_operation(click_if_exists_operation)

     def action_click_x_y(self, selector, value):
         if not value or not re.match(r'^\s?\d+\s?,\s?\d+\s?$', value):
```
```diff
@@ -222,10 +177,8 @@ class steppable_browser_interface():
             x = int(float(x.strip()))
             y = int(float(y.strip()))

-            def click_xy_operation():
-                self.page.mouse.click(x=x, y=y, delay=randint(200, 500))
-
-            self.safe_page_operation(click_xy_operation)
+            self.page.mouse.click(x=x, y=y, delay=randint(200, 500))
         except Exception as e:
             logger.error(f"Error parsing x,y coordinates: {str(e)}")

```
```diff
@@ -233,27 +186,17 @@ class steppable_browser_interface():
         if not selector or not len(selector.strip()):
             return

-        def select_operation():
-            self.page.select_option(selector, label=value, timeout=self.action_timeout)
-
-        self.safe_page_operation(select_operation)
+        self.page.select_option(selector, label=value, timeout=self.action_timeout)

     def action_scroll_down(self, selector, value):
-        def scroll_operation():
-            # Some sites this doesnt work on for some reason
-            self.page.mouse.wheel(0, 600)
-            self.page.wait_for_timeout(1000)
-
-        self.safe_page_operation(scroll_operation)
+        # Some sites this doesnt work on for some reason
+        self.page.mouse.wheel(0, 600)
+        self.page.wait_for_timeout(1000)

     def action_wait_for_seconds(self, selector, value):
         try:
             seconds = float(value.strip()) if value else 1.0

-            def wait_operation():
-                self.page.wait_for_timeout(seconds * 1000)
-
-            self.safe_page_operation(wait_operation)
+            self.page.wait_for_timeout(seconds * 1000)
         except (ValueError, TypeError) as e:
             logger.error(f"Invalid value for wait_for_seconds: {str(e)}")

```
```diff
@@ -263,14 +206,11 @@ class steppable_browser_interface():

         import json
         v = json.dumps(value)

-        def wait_for_text_operation():
-            self.page.wait_for_function(
-                f'document.querySelector("body").innerText.includes({v});',
-                timeout=30000
-            )
-
-        self.safe_page_operation(wait_for_text_operation)
+        self.page.wait_for_function(
+            f'document.querySelector("body").innerText.includes({v});',
+            timeout=30000
+        )

     def action_wait_for_text_in_element(self, selector, value):
         if not selector or not value:
```
```diff
@@ -280,68 +220,48 @@ class steppable_browser_interface():
         s = json.dumps(selector)
         v = json.dumps(value)

-        def wait_for_text_in_element_operation():
-            self.page.wait_for_function(
-                f'document.querySelector({s}).innerText.includes({v});',
-                timeout=30000
-            )
-
-        self.safe_page_operation(wait_for_text_in_element_operation)
+        self.page.wait_for_function(
+            f'document.querySelector({s}).innerText.includes({v});',
+            timeout=30000
+        )

     # @todo - in the future make some popout interface to capture what needs to be set
     # https://playwright.dev/python/docs/api/class-keyboard
     def action_press_enter(self, selector, value):
-        def press_operation():
-            self.page.keyboard.press("Enter", delay=randint(200, 500))
-
-        self.safe_page_operation(press_operation)
+        self.page.keyboard.press("Enter", delay=randint(200, 500))

     def action_press_page_up(self, selector, value):
-        def press_operation():
-            self.page.keyboard.press("PageUp", delay=randint(200, 500))
-
-        self.safe_page_operation(press_operation)
+        self.page.keyboard.press("PageUp", delay=randint(200, 500))

     def action_press_page_down(self, selector, value):
-        def press_operation():
-            self.page.keyboard.press("PageDown", delay=randint(200, 500))
-
-        self.safe_page_operation(press_operation)
+        self.page.keyboard.press("PageDown", delay=randint(200, 500))

     def action_check_checkbox(self, selector, value):
         if not selector:
             return

-        def check_operation():
-            self.page.locator(selector).check(timeout=self.action_timeout)
-
-        self.safe_page_operation(check_operation)
+        self.page.locator(selector).check(timeout=self.action_timeout)

     def action_uncheck_checkbox(self, selector, value):
         if not selector:
             return

-        def uncheck_operation():
-            self.page.locator(selector).uncheck(timeout=self.action_timeout)
-
-        self.safe_page_operation(uncheck_operation)
+        self.page.locator(selector).uncheck(timeout=self.action_timeout)

     def action_remove_elements(self, selector, value):
         """Removes all elements matching the given selector from the DOM."""
         if not selector:
             return

-        def remove_operation():
-            self.page.locator(selector).evaluate_all("els => els.forEach(el => el.remove())")
-
-        self.safe_page_operation(remove_operation)
+        self.page.locator(selector).evaluate_all("els => els.forEach(el => el.remove())")

     def action_make_all_child_elements_visible(self, selector, value):
         """Recursively makes all child elements inside the given selector fully visible."""
         if not selector:
             return

-        def make_visible_operation():
-            self.page.locator(selector).locator("*").evaluate_all("""
-                els => els.forEach(el => {
-                    el.style.display = 'block'; // Forces it to be displayed
+        self.page.locator(selector).locator("*").evaluate_all("""
+            els => els.forEach(el => {
+                el.style.display = 'block'; // Forces it to be displayed
```

```diff
@@ -355,8 +275,6 @@ class steppable_browser_interface():
-                })
-            """)
-
-        self.safe_page_operation(make_visible_operation)
+            })
+        """)

 # Responsible for maintaining a live 'context' with the chrome CDP
 # @todo - how long do contexts live for anyway?
 class browsersteps_live_ui(steppable_browser_interface):
```
Playwright/Puppeteer fetcher (`class fetcher(Fetcher)`): the response headers are now read only after the `None` check:

```diff
@@ -194,7 +194,6 @@ class fetcher(Fetcher):
         browsersteps_interface.page = self.page

         response = browsersteps_interface.action_goto_url(value=url)
-        self.headers = response.all_headers()

         if response is None:
             context.close()
```

```diff
@@ -202,6 +201,8 @@ class fetcher(Fetcher):
             logger.debug("Content Fetcher > Response object from the browser communication was none")
             raise EmptyReply(url=url, status_code=None)

+        self.headers = response.all_headers()
+
         try:
             if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
                 browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)
```
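The reordering matters because `action_goto_url()` (a wrapper around Playwright's `page.goto()`) can legitimately return `None`, and the old ordering called `.all_headers()` on that `None`, raising `AttributeError` before the intended `EmptyReply` could fire. A tiny runnable model of the fix (names other than `EmptyReply` are hypothetical):

```python
class EmptyReply(Exception):
    pass

def fetch_headers(goto):
    response = goto()                # may legitimately return None
    if response is None:             # guard first...
        raise EmptyReply("no response object from the browser")
    return response.all_headers()    # ...then it is safe to read headers

class FakeResponse:
    def all_headers(self):
        return {"content-type": "text/html"}

print(fetch_headers(lambda: FakeResponse()))  # {'content-type': 'text/html'}
# fetch_headers(lambda: None) now raises EmptyReply instead of AttributeError
```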
Requests fetcher: the HTTP call is wrapped so proxy failures surface with a clearer message:

```diff
@@ -28,6 +28,7 @@ class fetcher(Fetcher):

         import chardet
         import requests
+        from requests.exceptions import ProxyError, ConnectionError, RequestException

         if self.browser_steps_get_valid_steps():
             raise BrowserStepsInUnsupportedFetcher(url=url)
```

```diff
@@ -52,7 +53,7 @@ class fetcher(Fetcher):
         if strtobool(os.getenv('ALLOW_FILE_URI', 'false')) and url.startswith('file://'):
             from requests_file import FileAdapter
             session.mount('file://', FileAdapter())

-        r = session.request(method=request_method,
-                            data=request_body.encode('utf-8') if type(request_body) is str else request_body,
-                            url=url,
+        try:
+            r = session.request(method=request_method,
+                                data=request_body.encode('utf-8') if type(request_body) is str else request_body,
+                                url=url,
```

```diff
@@ -60,6 +61,11 @@ class fetcher(Fetcher):
-                            timeout=timeout,
-                            proxies=proxies,
-                            verify=False)
+                                timeout=timeout,
+                                proxies=proxies,
+                                verify=False)
+        except Exception as e:
+            msg = str(e)
+            if proxies and 'SOCKSHTTPSConnectionPool' in msg:
+                msg = f"Proxy connection failed? {msg}"
+            raise Exception(msg) from e

         # If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks.
         # For example - some sites don't tell us it's utf-8, but return utf-8 content
```
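A standalone sketch of the same wrap-and-re-raise pattern: `SOCKSHTTPSConnectionPool` is the name urllib3's SOCKS contrib pool embeds in its error text, so matching on it suggests the proxy rather than the target site failed, and `raise ... from e` keeps the original traceback chained for debugging (function name hypothetical):

```python
import requests

def fetch_via_proxy(url, proxies):
    try:
        return requests.get(url, proxies=proxies, timeout=10)
    except Exception as e:
        msg = str(e)
        # urllib3 names its SOCKS pool in the exception text, so a match here
        # strongly suggests the proxy (not the target site) is the problem
        if proxies and 'SOCKSHTTPSConnectionPool' in msg:
            msg = f"Proxy connection failed? {msg}"
        raise Exception(msg) from e  # 'from e' preserves the original cause
```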
Selenium/WebDriver fetcher: the `SeleniumProxy` object and its env-var mapping table are replaced by a single `proxy_url` passed straight to Chrome:

```diff
@@ -10,16 +10,13 @@ class fetcher(Fetcher):
     else:
         fetcher_description = "WebDriver Chrome/Javascript"

-    # Configs for Proxy setup
-    # In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
-    selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
-                                        'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
-                                        'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
     proxy = None
+    proxy_url = None

     def __init__(self, proxy_override=None, custom_browser_connection_url=None):
         super().__init__()
-        from selenium.webdriver.common.proxy import Proxy as SeleniumProxy
+        from urllib.parse import urlparse
+        from selenium.webdriver.common.proxy import Proxy

         # .strip('"') is going to save someone a lot of time when they accidently wrap the env value
         if not custom_browser_connection_url:
```

```diff
@@ -28,25 +25,27 @@ class fetcher(Fetcher):
             self.browser_connection_is_custom = True
             self.browser_connection_url = custom_browser_connection_url

-        # If any proxy settings are enabled, then we should setup the proxy object
-        proxy_args = {}
-        for k in self.selenium_proxy_settings_mappings:
-            v = os.getenv('webdriver_' + k, False)
-            if v:
-                proxy_args[k] = v.strip('"')

-        # Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy
-        if not proxy_args.get('webdriver_httpProxy') and self.system_http_proxy:
-            proxy_args['httpProxy'] = self.system_http_proxy
-        if not proxy_args.get('webdriver_sslProxy') and self.system_https_proxy:
-            proxy_args['httpsProxy'] = self.system_https_proxy
+        ##### PROXY SETUP #####

-        # Allows override the proxy on a per-request basis
-        if proxy_override is not None:
-            proxy_args['httpProxy'] = proxy_override
+        proxy_sources = [
+            self.system_http_proxy,
+            self.system_https_proxy,
+            os.getenv('webdriver_proxySocks'),
+            os.getenv('webdriver_socksProxy'),
+            os.getenv('webdriver_proxyHttp'),
+            os.getenv('webdriver_httpProxy'),
+            os.getenv('webdriver_proxyHttps'),
+            os.getenv('webdriver_httpsProxy'),
+            os.getenv('webdriver_sslProxy'),
+            proxy_override, # last one should override
+        ]
+        # The built in selenium proxy handling is super unreliable!!! so we just grab which ever proxy setting we can find and throw it in --proxy-server=
+        for k in filter(None, proxy_sources):
+            if not k:
+                continue
+            self.proxy_url = k.strip()

-        if proxy_args:
-            self.proxy = SeleniumProxy(raw=proxy_args)

     def run(self,
             url,
```

```diff
@@ -59,9 +58,7 @@ class fetcher(Fetcher):
             is_binary=False,
             empty_pages_are_a_change=False):

-        from selenium import webdriver
         from selenium.webdriver.chrome.options import Options as ChromeOptions
-        from selenium.common.exceptions import WebDriverException
         # request_body, request_method unused for now, until some magic in the future happens.

         options = ChromeOptions()
```
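The `proxy_sources` loop is a last-wins precedence scan: candidates are listed from lowest to highest priority, empty values are skipped, and whatever survives last becomes the proxy URL. A standalone sketch of the idea (function name hypothetical):

```python
import os

def pick_proxy(system_proxy=None, override=None):
    # Lowest priority first; the last non-empty entry wins
    candidates = [
        system_proxy,
        os.getenv('webdriver_httpProxy'),
        os.getenv('webdriver_sslProxy'),
        override,  # per-watch override beats everything
    ]
    proxy_url = None
    for candidate in filter(None, candidates):
        proxy_url = candidate.strip()
    return proxy_url

assert pick_proxy(system_proxy='http://corp:3128', override='socks5://127.0.0.1:1080') == 'socks5://127.0.0.1:1080'
assert pick_proxy() is None
```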
`run()` now builds the driver through a `RemoteConnection` with an explicit timeout, hands the proxy to Chrome via `--proxy-server=`, and makes sure every exit path calls `driver.quit()`; the now-unused `is_ready()` and `quit()` helpers are dropped:

```diff
@@ -76,30 +73,48 @@ class fetcher(Fetcher):
         for opt in CHROME_OPTIONS:
             options.add_argument(opt)

-        if self.proxy:
-            options.proxy = self.proxy
+        # 1. proxy_config /Proxy(proxy_config) selenium object is REALLY unreliable
+        # 2. selenium-wire cant be used because the websocket version conflicts with pypeteer-ng
+        # 3. selenium only allows ONE runner at a time by default!
+        # 4. driver must use quit() or it will continue to block/hold the selenium process!!

-        self.driver = webdriver.Remote(
-            command_executor=self.browser_connection_url,
-            options=options)
+        if self.proxy_url:
+            options.add_argument(f'--proxy-server={self.proxy_url}')

+        from selenium.webdriver.remote.remote_connection import RemoteConnection
+        from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
+        driver = None
+        try:
+            # Create the RemoteConnection and set timeout (e.g., 30 seconds)
+            remote_connection = RemoteConnection(
+                self.browser_connection_url,
+            )
+            remote_connection.set_timeout(30)  # seconds
+
+            # Now create the driver with the RemoteConnection
+            driver = RemoteWebDriver(
+                command_executor=remote_connection,
+                options=options
+            )
+
+            driver.set_page_load_timeout(int(os.getenv("WEBDRIVER_PAGELOAD_TIMEOUT", 45)))
+        except Exception as e:
+            if driver:
+                driver.quit()
+            raise e

         try:
-            self.driver.get(url)
-        except WebDriverException as e:
-            # Be sure we close the session window
-            self.quit()
-            raise
+            driver.get(url)

         if not "--window-size" in os.getenv("CHROME_OPTIONS", ""):
-            self.driver.set_window_size(1280, 1024)
+            driver.set_window_size(1280, 1024)

-        self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
+        driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))

         if self.webdriver_js_execute_code is not None:
-            self.driver.execute_script(self.webdriver_js_execute_code)
+            driver.execute_script(self.webdriver_js_execute_code)
             # Selenium doesn't automatically wait for actions as good as Playwright, so wait again
-            self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
+            driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))

         # @todo - how to check this? is it possible?
         self.status_code = 200
```

```diff
@@ -108,27 +123,12 @@ class fetcher(Fetcher):

         # @todo - dom wait loaded?
         time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
-        self.content = self.driver.page_source
+        self.content = driver.page_source
         self.headers = {}
-        self.screenshot = self.driver.get_screenshot_as_png()
-
-    # Does the connection to the webdriver work? run a test connection.
-    def is_ready(self):
-        from selenium import webdriver
-        from selenium.webdriver.chrome.options import Options as ChromeOptions
-
-        self.driver = webdriver.Remote(
-            command_executor=self.command_executor,
-            options=ChromeOptions())
-
-        # driver.quit() seems to cause better exceptions
-        self.quit()
-        return True
-
-    def quit(self, watch=None):
-        if self.driver:
-            try:
-                self.driver.quit()
-            except Exception as e:
-                logger.debug(f"Content Fetcher > Exception in chrome shutdown/quit {str(e)}")
+        self.screenshot = driver.get_screenshot_as_png()
+        except Exception as e:
+            driver.quit()
+            raise e
+
+        driver.quit()
```
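The key discipline here is that a Selenium session holds the (by default single) runner slot until `quit()` is called, so every path out of `run()` must release it. A minimal sketch of the same connect-with-timeout-and-always-quit pattern, written with `try`/`finally` as an alternative shape (not the commit's exact code, and assuming a Selenium version where `RemoteConnection.set_timeout()` is available, as the commit itself uses):

```python
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.remote.remote_connection import RemoteConnection
from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver

def fetch_page_source(hub_url, target_url):
    remote_connection = RemoteConnection(hub_url)
    remote_connection.set_timeout(30)  # don't hang forever on a dead hub
    driver = RemoteWebDriver(command_executor=remote_connection, options=ChromeOptions())
    try:
        driver.get(target_url)
        return driver.page_source
    finally:
        driver.quit()  # always release the Selenium session/runner slot
```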
Forms: `StringDictKeyValue` (the `Key: Value` header-lines field) gains per-line validation:

```diff
@@ -224,27 +224,37 @@ class StringDictKeyValue(StringField):

     def _value(self):
         if self.data:
-            output = u''
-            for k in self.data.keys():
-                output += "{}: {}\r\n".format(k, self.data[k])
+            output = ''
+            for k, v in self.data.items():
+                output += f"{k}: {v}\r\n"

             return output
         else:
-            return u''
+            return ''

-    # incoming
+    # incoming data processing + validation
     def process_formdata(self, valuelist):
+        self.data = {}
+        errors = []
         if valuelist:
-            self.data = {}
-            # Remove empty strings
-            cleaned = list(filter(None, valuelist[0].split("\n")))
-            for s in cleaned:
-                parts = s.strip().split(':', 1)
-                if len(parts) == 2:
-                    self.data.update({parts[0].strip(): parts[1].strip()})
+            # Remove empty strings (blank lines)
+            cleaned = [line.strip() for line in valuelist[0].split("\n") if line.strip()]
+            for idx, s in enumerate(cleaned, start=1):
+                if ':' not in s:
+                    errors.append(f"Line {idx} is missing a ':' separator.")
+                    continue
+                parts = s.split(':', 1)
+                key = parts[0].strip()
+                value = parts[1].strip()

-        else:
-            self.data = {}
+                if not key:
+                    errors.append(f"Line {idx} has an empty key.")
+                if not value:
+                    errors.append(f"Line {idx} has an empty value.")
+
+                self.data[key] = value
+
+        if errors:
+            raise ValidationError("Invalid input:\n" + "\n".join(errors))

 class ValidateContentFetcherIsReady(object):
     """
```
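A standalone sketch of the same parsing/validation logic outside WTForms, to show which inputs produce which errors (function name hypothetical):

```python
def parse_header_lines(raw: str):
    data, errors = {}, []
    cleaned = [line.strip() for line in raw.split("\n") if line.strip()]
    for idx, s in enumerate(cleaned, start=1):
        if ':' not in s:
            errors.append(f"Line {idx} is missing a ':' separator.")
            continue
        key, value = (part.strip() for part in s.split(':', 1))
        if not key:
            errors.append(f"Line {idx} has an empty key.")
        if not value:
            errors.append(f"Line {idx} has an empty value.")
        data[key] = value
    return data, errors

data, errors = parse_header_lines("User-Agent: test\nno-separator-here\n:orphan-value")
# data   == {'User-Agent': 'test', '': 'orphan-value'}
# errors == ["Line 2 is missing a ':' separator.", "Line 3 has an empty key."]
```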
HTML tools: `extract_json_as_string()` switches from the deprecated `findAll` spelling to `find_all`:

```diff
@@ -309,10 +309,10 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
     soup = BeautifulSoup(content, 'html.parser')

     if ensure_is_ldjson_info_type:
-        bs_result = soup.findAll('script', {"type": "application/ld+json"})
+        bs_result = soup.find_all('script', {"type": "application/ld+json"})
     else:
-        bs_result = soup.findAll('script')
-        bs_result += soup.findAll('body')
+        bs_result = soup.find_all('script')
+        bs_result += soup.find_all('body')

     bs_jsons = []
     for result in bs_result:
```
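`findAll` is the legacy BS3-era camelCase alias that bs4 keeps only for backwards compatibility; `find_all` is the canonical bs4 name, and both accept the same attribute-filter dict:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup('<script type="application/ld+json">{"@type": "Product"}</script>', 'html.parser')
# Filter by tag name plus an attribute dict, exactly as the hunk above does
for tag in soup.find_all('script', {"type": "application/ld+json"}):
    print(tag.string)  # {"@type": "Product"}
```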
`html_to_text` drops the subprocess (`Pipe`/`Process`) worker indirection and returns directly again:

```diff
@@ -436,55 +436,27 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
     return re.sub(pattern, repl, html_content)


-def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
+# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
     from inscriptis import get_text
     from inscriptis.model.config import ParserConfig

-    """Converts html string to a string with just the text. If ignoring
-    rendering anchor tag content is enable, anchor tag content are also
-    included in the text
-
-    :param html_content: string with html content
-    :param render_anchor_tag_content: boolean flag indicating whether to extract
-    hyperlinks (the anchor tag content) together with text. This refers to the
-    'href' inside 'a' tags.
-    Anchor tag content is rendered in the following manner:
-    '[ text ](anchor tag content)'
-    :return: extracted text from the HTML
-    """
-    # if anchor tag content flag is set to True define a config for
-    # extracting this content
     if render_anchor_tag_content:
         parser_config = ParserConfig(
             annotation_rules={"a": ["hyperlink"]},
             display_links=True
         )
-    # otherwise set config to None/default
     else:
         parser_config = None

-    # RSS Mode - Inscriptis will treat `title` as something else.
-    # Make it as a regular block display element (//item/title)
-    # This is a bit of a hack - the real way it to use XSLT to convert it to HTML #1874
     if is_rss:
         html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
         html_content = re.sub(r'</title>', r'</h1>', html_content)

     text_content = get_text(html_content, config=parser_config)
-    conn.send(text_content)
-    conn.close()
+    return text_content

-# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
-    from multiprocessing import Process, Pipe
-
-    parent_conn, child_conn = Pipe()
-    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
-    p.start()
-    text = parent_conn.recv()
-    p.join()
-    return text

 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):
```
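The `is_rss` branch survives the rewrite: Inscriptis does not give `<title>` a block layout of its own, so RSS item titles would run into the body text; rewriting them to `<h1>` keeps each title on its own line. A quick illustration of those two substitutions:

```python
import re

rss_fragment = "<item><title>New release 0.49.17</title>Some summary text</item>"
html = re.sub(r'<title([\s>])', r'<h1\1', rss_fragment)  # opening tag, keeping the '>' or whitespace
html = re.sub(r'</title>', r'</h1>', html)               # matching closing tag
print(html)
# <item><h1>New release 0.49.17</h1>Some summary text</item>
```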
Test runner script: add a high-concurrency run:

```diff
@@ -38,6 +38,9 @@ pytest tests/test_backend.py
 pytest tests/test_rss.py
 pytest tests/test_unique_lines.py

+# Try high concurrency
+FETCH_WORKERS=130 pytest tests/test_history_consistency.py -v -l
+
 # Check file:// will pickup a file when enabled
 echo "Hello world" > /tmp/test-file.txt
 ALLOW_FILE_URI=yes pytest tests/test_security.py
```
...and run the new proxy-failure UI check against every fetcher:

```diff
@@ -82,3 +82,25 @@ done

 docker kill squid-one squid-two squid-custom
+
+# Test that the UI is returning the correct error message when a proxy is not available
+
+# Requests
+docker run --network changedet-network \
+  test-changedetectionio \
+  bash -c 'cd changedetectionio && pytest tests/proxy_list/test_proxy_noconnect.py'
+
+# Playwright
+docker run --network changedet-network \
+  test-changedetectionio \
+  bash -c 'cd changedetectionio && PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000 pytest tests/proxy_list/test_proxy_noconnect.py'
+
+# Puppeteer fast
+docker run --network changedet-network \
+  test-changedetectionio \
+  bash -c 'cd changedetectionio && FAST_PUPPETEER_CHROME_FETCHER=1 PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000 pytest tests/proxy_list/test_proxy_noconnect.py'
+
+# Selenium
+docker run --network changedet-network \
+  test-changedetectionio \
+  bash -c 'cd changedetectionio && WEBDRIVER_URL=http://selenium:4444/wd/hub pytest tests/proxy_list/test_proxy_noconnect.py'
```
`changedetectionio/tests/proxy_list/test_proxy_noconnect.py` (new file, 68 lines):

```python
#!/usr/bin/env python3

from flask import url_for
from ..util import live_server_setup, wait_for_all_checks
import os
from ... import strtobool


# Just to be sure the UI outputs the right error message on proxy connection failed
# docker run -p 4444:4444 --rm --shm-size="2g" selenium/standalone-chrome:4
# PLAYWRIGHT_DRIVER_URL=ws://127.0.0.1:3000 pytest tests/proxy_list/test_proxy_noconnect.py
# FAST_PUPPETEER_CHROME_FETCHER=True PLAYWRIGHT_DRIVER_URL=ws://127.0.0.1:3000 pytest tests/proxy_list/test_proxy_noconnect.py
# WEBDRIVER_URL=http://127.0.0.1:4444/wd/hub pytest tests/proxy_list/test_proxy_noconnect.py

def test_proxy_noconnect_custom(client, live_server, measure_memory_usage):
    live_server_setup(live_server)

    # Goto settings, add our custom one
    res = client.post(
        url_for("settings.settings_page"),
        data={
            "requests-time_between_check-minutes": 180,
            "application-ignore_whitespace": "y",
            "application-fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') or os.getenv("WEBDRIVER_URL") else 'html_requests',
            "requests-extra_proxies-0-proxy_name": "custom-test-proxy",
            # test:awesome is set in tests/proxy_list/squid-passwords.txt
            "requests-extra_proxies-0-proxy_url": "http://127.0.0.1:3128",
        },
        follow_redirects=True
    )

    assert b"Settings updated." in res.data

    test_url = "https://changedetection.io"
    res = client.post(
        url_for("ui.ui_views.form_quick_watch_add"),
        data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
        follow_redirects=True
    )

    assert b"Watch added in Paused state, saving will unpause" in res.data

    options = {
        "url": test_url,
        "fetch_backend": "html_webdriver" if os.getenv('PLAYWRIGHT_DRIVER_URL') or os.getenv("WEBDRIVER_URL") else "html_requests",
        "proxy": "ui-0custom-test-proxy",
    }

    res = client.post(
        url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1),
        data=options,
        follow_redirects=True
    )
    assert b"unpaused" in res.data
    import time
    wait_for_all_checks(client)

    # Requests default
    check_string = b'Cannot connect to proxy'

    if os.getenv('PLAYWRIGHT_DRIVER_URL') or strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')) or os.getenv("WEBDRIVER_URL"):
        check_string = b'ERR_PROXY_CONNECTION_FAILED'

    res = client.get(url_for("watchlist.index"))
    #with open("/tmp/debug.html", 'wb') as f:
    #    f.write(res.data)
    assert check_string in res.data
```
`test_consistent_history`: the watch count now scales with `FETCH_WORKERS`, and the long `os.listdir()` call is joined onto one line:

```diff
@@ -10,8 +10,8 @@ from urllib.parse import urlparse, parse_qs

 def test_consistent_history(client, live_server, measure_memory_usage):
     live_server_setup(live_server)
-
-    r = range(1, 30)
+    workers = int(os.getenv("FETCH_WORKERS", 10))
+    r = range(1, 10+workers)

     for one in r:
         test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)
```

```diff
@@ -46,9 +46,10 @@ def test_consistent_history(client, live_server, measure_memory_usage):

     # assert the right amount of watches was found in the JSON
     assert len(json_obj['watching']) == len(r), "Correct number of watches was found in the JSON"

+    i=0
     # each one should have a history.txt containing just one line
     for w in json_obj['watching'].keys():
+        i+=1
         history_txt_index_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, 'history.txt')
         assert os.path.isfile(history_txt_index_file), f"History.txt should exist where I expect it at {history_txt_index_file}"
```

```diff
@@ -58,8 +59,8 @@ def test_consistent_history(client, live_server, measure_memory_usage):
         assert len(tmp_history) == 1, "History.txt should contain 1 line"

         # Should be two files,. the history.txt , and the snapshot.txt
-        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path,
-                                                     w))
+        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w))

         # Find the snapshot one
         for fname in files_in_watch_dir:
             if fname != 'history.txt' and 'html' not in fname:
```

```diff
@@ -75,7 +76,6 @@ def test_consistent_history(client, live_server, measure_memory_usage):

     assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"

-
     json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
     with open(json_db_file, 'r') as f:
         assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"
```
A new test exercises the header-line validation added in `StringDictKeyValue` above:

```diff
@@ -424,3 +424,27 @@ def test_headers_textfile_in_request(client, live_server, measure_memory_usage):
     # unlink headers.txt on start/stop
     res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
     assert b'Deleted' in res.data
+
+def test_headers_validation(client, live_server):
+    #live_server_setup(live_server)
+
+    test_url = url_for('test_headers', _external=True)
+    res = client.post(
+        url_for("imports.import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    res = client.post(
+        url_for("ui.ui_edit.edit_page", uuid="first"),
+        data={
+            "url": test_url,
+            "fetch_backend": 'html_requests',
+            "headers": "User-AGent agent-from-watch\r\nsadfsadfsadfsdaf\r\n:foobar"},
+        follow_redirects=True
+    )
+
+    assert b"Line 1 is missing a ':' separator." in res.data
+    assert b"Line 3 has an empty key." in res.data
```
The test helper `wait_for_all_checks()` is rewritten to watch the internal update queue and worker threads instead of scraping the watch list page for "Checking now":

```diff
@@ -126,18 +126,51 @@ def extract_UUID_from_client(client):
     uuid = m.group(1)
     return uuid.strip()

-def wait_for_all_checks(client):
-    # actually this is not entirely true, it can still be 'processing' but not in the queue
-    # Loop waiting until done..
+def wait_for_all_checks(client=None):
+    """
+    Waits until the queue is empty and remains empty for at least `required_empty_duration` seconds,
+    and also ensures no running threads have `current_uuid` set.
+    Retries for up to `max_attempts` times, sleeping `wait_between_attempts` seconds between checks.
+    """
+    from changedetectionio.flask_app import update_q as global_update_q, running_update_threads
+
+    # Configuration
     attempt = 0
-    # because sub-second rechecks are problematic in testing, use lots of delays
-    time.sleep(1)
-    while attempt < 60:
-        res = client.get(url_for("watchlist.index"))
-        if not b'Checking now' in res.data:
+    i=0
+    max_attempts = 60
+    wait_between_attempts = 2
+    required_empty_duration = 2
+
+    logger = logging.getLogger()
+    time.sleep(1.2)
+
+    empty_since = None
+
+    while attempt < max_attempts:
+        q_length = global_update_q.qsize()
+
+        # Check if any threads are still processing
+        time.sleep(1.2)
+        any_threads_busy = any(t.current_uuid for t in running_update_threads)
+
+        if q_length == 0 and not any_threads_busy:
+            if empty_since is None:
+                empty_since = time.time()
+                logger.info(f"Queue empty and no active threads at attempt {attempt}, starting empty timer...")
+            elif time.time() - empty_since >= required_empty_duration:
+                logger.info(f"Queue has been empty and threads idle for {required_empty_duration} seconds. Done waiting.")
                 break
-        logging.getLogger().info("Waiting for watch-list to not say 'Checking now'.. {}".format(attempt))
-        time.sleep(1)
+            else:
+                logger.info(f"Still waiting: queue empty and no active threads, but not yet {required_empty_duration} seconds...")
+        else:
+            if q_length != 0:
+                logger.info(f"Queue not empty (size={q_length}), resetting timer.")
+            if any_threads_busy:
+                busy_threads = [t.name for t in running_update_threads if t.current_uuid]
+                logger.info(f"Threads still busy: {busy_threads}, resetting timer.")
+            empty_since = None
         attempt += 1

     time.sleep(1)
```
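The underlying pattern here is a debounce wait: the condition must not merely become true once, it must stay true for a minimum duration, because a momentarily empty queue can refill when a worker re-queues a watch. A generic, self-contained sketch of that idea (names hypothetical):

```python
import time

def wait_until_stably(predicate, hold_seconds=2, max_attempts=60, poll_interval=1.2):
    """Return True once predicate() has held continuously for hold_seconds."""
    held_since = None
    for _ in range(max_attempts):
        if predicate():
            if held_since is None:
                held_since = time.time()           # condition just became true, start the timer
            elif time.time() - held_since >= hold_seconds:
                return True                         # condition stayed true long enough
        else:
            held_since = None                       # condition flapped, reset the timer
        time.sleep(poll_interval)
    return False

# e.g. wait_until_stably(lambda: update_q.qsize() == 0 and not any_worker_busy())
```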
`requirements.txt`:

```diff
@@ -42,7 +42,7 @@ paho-mqtt!=2.0.*
 cryptography~=42.0.8

 # Used for CSS filtering
-beautifulsoup4
+beautifulsoup4>=4.0.0

 # XPath filtering, lxml is required by bs4 anyway, but put it here to be safe.
 # #2328 - 5.2.0 and 5.2.1 had extra CPU flag CFLAGS set which was not compatible on older hardware
```

```diff
@@ -70,7 +70,7 @@ jq~=1.3; python_version >= "3.8" and sys_platform == "linux"

 # playwright is installed at Dockerfile build time because it's not available on all platforms

-pyppeteer-ng==2.0.0rc9
+pyppeteer-ng==2.0.0rc10

 pyppeteerstealth>=0.0.4
```