+
+Once the **Browser Steps** have run, visit the **Visual Selector** tab to refine the content you're interested in.
+Requires Playwright to be enabled.
+
## Installation
### Docker
diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py
index 750c7a48..dceefcc6 100644
--- a/changedetectionio/__init__.py
+++ b/changedetectionio/__init__.py
@@ -1,18 +1,20 @@
#!/usr/bin/python3
import datetime
+import flask_login
+import logging
import os
+import pytz
import queue
import threading
import time
+import timeago
+
from copy import deepcopy
+from distutils.util import strtobool
+from feedgen.feed import FeedGenerator
from threading import Event
-import flask_login
-import logging
-import pytz
-import timeago
-from feedgen.feed import FeedGenerator
from flask import (
Flask,
abort,
@@ -27,7 +29,6 @@ from flask import (
)
from flask_login import login_required
from flask_restful import abort, Api
-
from flask_wtf import CSRFProtect
from changedetectionio import html_tools
@@ -44,7 +45,6 @@ ticker_thread = None
extra_stylesheets = []
update_q = queue.PriorityQueue()
-
notification_q = queue.Queue()
app = Flask(__name__,
@@ -97,7 +97,7 @@ def _jinja2_filter_datetime(watch_obj, format="%Y-%m-%d %H:%M:%S"):
# Worker thread tells us which UUID it is currently processing.
for t in running_update_threads:
if t.current_uuid == watch_obj['uuid']:
- return ' Checking now'
+ return ' Checking now'
if watch_obj['last_checked'] == 0:
return 'Not yet'
@@ -525,6 +525,7 @@ def changedetection_app(config=None, datastore_o=None):
def edit_page(uuid):
from changedetectionio import forms
+ from changedetectionio.blueprint.browser_steps.browser_steps import browser_step_ui_config
using_default_check_time = True
# More for testing, possible to return the first/only
@@ -558,6 +559,8 @@ def changedetection_app(config=None, datastore_o=None):
data=default,
)
+ # form.browser_steps[0] can be assumed that we 'goto url' first
+
if datastore.proxy_list is None:
# @todo - Couldn't get setattr() etc dynamic addition working, so remove it instead
del form.proxy
@@ -650,6 +653,7 @@ def changedetection_app(config=None, datastore_o=None):
watch.get('fetch_backend', None) is None and system_uses_webdriver) else False
output = render_template("edit.html",
+ browser_steps_config=browser_step_ui_config,
current_base_url=datastore.data['settings']['application']['base_url'],
emailprefix=os.getenv('NOTIFICATION_MAIL_BUTTON_PREFIX', False),
form=form,
@@ -661,7 +665,6 @@ def changedetection_app(config=None, datastore_o=None):
settings_application=datastore.data['settings']['application'],
using_global_webdriver_wait=default['webdriver_delay'] is None,
uuid=uuid,
- visualselector_data_is_ready=visualselector_data_is_ready,
visualselector_enabled=visualselector_enabled,
watch=watch
)
@@ -1190,7 +1193,6 @@ def changedetection_app(config=None, datastore_o=None):
else:
# No tag, no uuid, add everything.
for watch_uuid, watch in datastore.data['watching'].items():
-
if watch_uuid not in running_uuids and not datastore.data['watching'][watch_uuid]['paused']:
update_q.put((1, watch_uuid))
i += 1
@@ -1308,9 +1310,11 @@ def changedetection_app(config=None, datastore_o=None):
# paste in etc
return redirect(url_for('index'))
+ import changedetectionio.blueprint.browser_steps as browser_steps
+ app.register_blueprint(browser_steps.construct_blueprint(datastore), url_prefix='/browser-steps')
+
# @todo handle ctrl break
ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
-
threading.Thread(target=notification_runner).start()
# Check for new release version, but not when running in test/build or pytest
diff --git a/changedetectionio/blueprint/__init__.py b/changedetectionio/blueprint/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/changedetectionio/blueprint/browser_steps/__init__.py b/changedetectionio/blueprint/browser_steps/__init__.py
new file mode 100644
index 00000000..b877a20f
--- /dev/null
+++ b/changedetectionio/blueprint/browser_steps/__init__.py
@@ -0,0 +1,226 @@
+
+# HORRIBLE HACK BUT WORKS :-) PR anyone?
+#
+# Why?
+# `browsersteps_playwright_browser_interface.chromium.connect_over_cdp()` will only run once without async()
+# - this flask app is not async()
+# - browserless has a single timeout/keepalive which applies to the session made at .connect_over_cdp()
+#
+# So it means that we must unfortunately for now just keep a single timer since .connect_over_cdp() was run
+# and know when that reaches timeout/keepalive :( when that time is up, restart the connection and tell the user
+# that their time is up, insert another coin. (reload)
+#
+# Bigger picture
+# - It's horrible that we have this click+wait deal, some nice socket.io solution using something similar
+# to what the browserless debug UI already gives us would be smarter..
+#
+# OR
+# - Some API call that should be hacked into browserless or playwright that we can "/api/bump-keepalive/{session_id}/60"
+# So we can tell it that we need more time (run this on each action)
+#
+# OR
+# - use multiprocessing to bump this over to its own process and add some transport layer (queue/pipes)
+
+
+
+
+from distutils.util import strtobool
+from flask import Blueprint, request, make_response
+from flask_login import login_required
+import os
+import logging
+from changedetectionio.store import ChangeDetectionStore
+
+browsersteps_live_ui_o = {}
+browsersteps_playwright_browser_interface = None
+browsersteps_playwright_browser_interface_start_time = None
+browsersteps_playwright_browser_interface_browser = None
+browsersteps_playwright_browser_interface_end_time = None
+
+
+def cleanup_playwright_session():
+    """Stop the shared Playwright interface and forget every live session.
+
+    All module-level ``browsersteps_*`` globals are reset to their initial values,
+    so the next GET request to the blueprint has to start a fresh connection.
+    """
+    print("Cleaning up old playwright session because time was up")
+    global browsersteps_live_ui_o
+    global browsersteps_playwright_browser_interface
+    global browsersteps_playwright_browser_interface_browser
+    global browsersteps_playwright_browser_interface_start_time
+    global browsersteps_playwright_browser_interface_end_time
+
+    import psutil
+
+    current_process = psutil.Process()
+    children = current_process.children(recursive=True)
+    for child in children:
+        print (child)
+        print('Child pid is {}'.format(child.pid))
+
+    # .stop() hangs sometimes if its called when there are no children to process
+    # but how do we know this is our child? dunno
+    # The interface may also be None (never started / already cleaned up), in which
+    # case there is nothing to stop and calling .stop() would raise AttributeError.
+    if children and browsersteps_playwright_browser_interface is not None:
+        browsersteps_playwright_browser_interface.stop()
+
+    browsersteps_live_ui_o = {}
+    browsersteps_playwright_browser_interface = None
+    browsersteps_playwright_browser_interface_start_time = None
+    browsersteps_playwright_browser_interface_browser = None
+    browsersteps_playwright_browser_interface_end_time = None
+    print ("Cleaning up old playwright session because time was up - done")
+
+
+def construct_blueprint(datastore: ChangeDetectionStore):
+    """Create the ``browser_steps`` blueprint used by the interactive Browser Steps editor.
+
+    :param datastore: application datastore; read for the watch URL and proxy settings and
+        written to when the Visual Selector screenshot/xpath data is refreshed.
+    :return: the Flask ``Blueprint`` with the ``/browsersteps_update`` route registered.
+    """
+
+    browser_steps_blueprint = Blueprint('browser_steps', __name__, template_folder="templates")
+
+    # route() has to be the outermost decorator: it registers the function object it is
+    # handed, so login_required must already be wrapped around the view at that point.
+    @browser_steps_blueprint.route("/browsersteps_update", methods=['GET', 'POST'])
+    @login_required
+    def browsersteps_ui_update():
+        """Set up a live browser session (GET) or run one step in it (POST).
+
+        Query arguments: ``uuid`` of the watch and ``browsersteps_session_id``.
+        POST form fields: ``operation``, ``selector``, ``optional_value``, ``step_n``
+        and ``is_last_step``.
+
+        :return: a dict with the page screenshot as a data URI, the scraped xpath data,
+            the session start time and the seconds left on the shared browser connection,
+            or an error response carrying a human readable message.
+        """
+        import base64
+        import playwright._impl._api_types
+        import time
+
+        from changedetectionio.blueprint.browser_steps import browser_steps
+
+        global browsersteps_live_ui_o, browsersteps_playwright_browser_interface_end_time
+        global browsersteps_playwright_browser_interface_browser
+        global browsersteps_playwright_browser_interface
+        global browsersteps_playwright_browser_interface_start_time
+
+        step_n = None
+        remaining = 0
+        uuid = request.args.get('uuid')
+
+        browsersteps_session_id = request.args.get('browsersteps_session_id')
+
+        if not browsersteps_session_id:
+            return make_response('No browsersteps_session_id specified', 500)
+
+        # Because we don't "really" run in a context manager ( we make the playwright interface global/long-living )
+        # We need to manage the shutdown when the time is up
+        if browsersteps_playwright_browser_interface_end_time:
+            remaining = browsersteps_playwright_browser_interface_end_time - time.time()
+            if remaining <= 0:
+                cleanup_playwright_session()
+                return make_response('Browser session expired, please reload the Browser Steps interface', 500)
+
+        # Actions - step/apply/etc, do the thing and return state
+        if request.method == 'POST':
+            # @todo - should always be an existing session
+            step_operation = request.form.get('operation')
+            step_selector = request.form.get('selector')
+            step_optional_value = request.form.get('optional_value')
+            step_n = int(request.form.get('step_n'))
+            is_last_step = strtobool(request.form.get('is_last_step'))
+
+            if step_operation == 'Goto site':
+                # The UI label maps onto action_goto_url(), with the watch's own URL as the target
+                step_operation = 'goto_url'
+                step_optional_value = None
+                step_selector = datastore.data['watching'][uuid].get('url')
+
+            # @todo try.. except.. nice errors not popups..
+            try:
+
+                this_session = browsersteps_live_ui_o.get(browsersteps_session_id)
+                if not this_session:
+                    print("Browser exited")
+                    return make_response('Browser session ran out of time :( Please reload this page.', 401)
+
+                this_session.call_action(action_name=step_operation,
+                                         selector=step_selector,
+                                         optional_value=step_optional_value)
+            except playwright._impl._api_types.TimeoutError as e:
+                print("Element wasnt found :-(", step_operation)
+                return make_response("Element was not found on page", 401)
+
+            except playwright._impl._api_types.Error as e:
+                # Browser/playwright level error
+                print("Browser error - got playwright._impl._api_types.Error, try reloading the session/browser")
+                print (str(e))
+
+                # Try to find something of value to give back to the user
+                for l in str(e).splitlines():
+                    if 'DOMException' in l:
+                        return make_response(l, 401)
+
+                return make_response('Browser session ran out of time :( Please reload this page.', 401)
+
+            # Get visual selector ready/update its data (also use the current filter info from the page?)
+            # When the last 'apply' button was pressed
+            # @todo this adds overhead because the xpath selection is happening twice
+            u = this_session.page.url
+            if is_last_step and u:
+                (screenshot, xpath_data) = this_session.request_visualselector_data()
+                datastore.save_screenshot(watch_uuid=uuid, screenshot=screenshot)
+                datastore.save_xpath_data(watch_uuid=uuid, data=xpath_data)
+
+        # Setup interface
+        if request.method == 'GET':
+
+            if not browsersteps_playwright_browser_interface:
+                print("Starting connection with playwright")
+                logging.debug("browser_steps.py connecting")
+                from playwright.sync_api import sync_playwright
+
+                browsersteps_playwright_browser_interface = sync_playwright().start()
+
+                time.sleep(1)
+                # At 20 minutes, some other variable is closing it
+                # @todo find out what it is and set it
+                seconds_keepalive = int(os.getenv('BROWSERSTEPS_MINUTES_KEEPALIVE', 10)) * 60
+
+                # keep it alive for 3 seconds more than we advertise, sometimes it helps to keep it shutting down cleanly
+                keepalive = "&timeout={}".format(((seconds_keepalive+3) * 1000))
+                try:
+                    browsersteps_playwright_browser_interface_browser = browsersteps_playwright_browser_interface.chromium.connect_over_cdp(
+                        os.getenv('PLAYWRIGHT_DRIVER_URL', '') + keepalive)
+                except Exception as e:
+                    # Drop the half-started interface again, otherwise every later request would
+                    # skip this setup and try to work with a browser that never connected.
+                    cleanup_playwright_session()
+                    if 'ECONNREFUSED' in str(e):
+                        return make_response('Unable to start the Playwright session properly, is it running?', 401)
+                    return make_response('Unable to start the Playwright session: {}'.format(str(e)), 401)
+
+                browsersteps_playwright_browser_interface_end_time = time.time() + (seconds_keepalive-3)
+                print("Starting connection with playwright - done")
+
+            if not browsersteps_live_ui_o.get(browsersteps_session_id):
+                # Boot up a new session
+                proxy_id = datastore.get_preferred_proxy_for_watch(uuid=uuid)
+                proxy = None
+                if proxy_id:
+                    proxy_url = datastore.proxy_list.get(proxy_id).get('url')
+                    if proxy_url:
+                        proxy = {'server': proxy_url}
+                        print("Browser Steps: UUID {} Using proxy {}".format(uuid, proxy_url))
+
+                # Begin the new "Playwright Context" that re-uses the playwright interface
+                # Each session is a "Playwright Context" as a list, that uses the playwright interface
+                browsersteps_live_ui_o[browsersteps_session_id] = browser_steps.browsersteps_live_ui(
+                    playwright_browser=browsersteps_playwright_browser_interface_browser,
+                    proxy=proxy)
+
+            # Bound on every GET, whether the session was just created or already existed
+            this_session = browsersteps_live_ui_o[browsersteps_session_id]
+
+        if not this_session.page:
+            cleanup_playwright_session()
+            return make_response('Browser session ran out of time :( Please reload this page.', 401)
+
+        try:
+            state = this_session.get_current_state()
+        except playwright._impl._api_types.Error as e:
+            return make_response("Browser session ran out of time :( Please reload this page."+str(e), 401)
+
+        # get_current_state() captures the screenshot as JPEG, so label the data URI accordingly
+        p = {'screenshot': "data:image/jpeg;base64,{}".format(
+            base64.b64encode(state[0]).decode('ascii')),
+            'xpath_data': state[1],
+            'session_age_start': this_session.age_start,
+            'browser_time_remaining': round(remaining)
+        }
+
+        # @todo BSON/binary JSON, faster xfer, OR pick it off the disk
+        return p
+
+    return browser_steps_blueprint
+
+
diff --git a/changedetectionio/blueprint/browser_steps/browser_steps.py b/changedetectionio/blueprint/browser_steps/browser_steps.py
new file mode 100644
index 00000000..1207d192
--- /dev/null
+++ b/changedetectionio/blueprint/browser_steps/browser_steps.py
@@ -0,0 +1,268 @@
+#!/usr/bin/python3
+
+import os
+import time
+import re
+from random import randint
+
+# Maps each operation shown in the UI to two space-separated flags that tell the front-end JS
+# whether the "Selector" field (first flag) and the "Value" field (second flag) are enabled.
+# 0 = off, 1 = on
+browser_step_ui_config = {'Choose one': '0 0',
+ # 'Check checkbox': '1 0',
+ # 'Click button containing text': '0 1',
+ # 'Scroll to bottom': '0 0',
+ # 'Scroll to element': '1 0',
+ # 'Scroll to top': '0 0',
+ # 'Switch to iFrame by index number': '0 1'
+ # 'Uncheck checkbox': '1 0',
+ # @todo
+ 'Check checkbox': '1 0',
+ 'Click X,Y': '0 1',
+ 'Click element if exists': '1 0',
+ 'Click element': '1 0',
+ 'Click element containing text': '0 1',
+ 'Enter text in field': '1 1',
+# 'Extract text and use as filter': '1 0',
+ 'Goto site': '0 0',
+ 'Press Enter': '0 0',
+ 'Select by label': '1 1',
+ 'Scroll down': '0 0',
+ 'Uncheck checkbox': '1 0',
+ 'Wait for seconds': '0 1',
+ 'Wait for text': '0 1',
+ # 'Press Page Down': '0 0',
+ # 'Press Page Up': '0 0',
+ # weird bug, come back to it later
+ }
+
+
+# Good reference - https://playwright.dev/python/docs/input
+# https://pythonmana.com/2021/12/202112162236307035.html
+#
+# ONLY Works in Playwright because we need the fullscreen screenshot
+class steppable_browser_interface():
+    """Translate Browser Step operations from the UI into Playwright calls on ``self.page``.
+
+    An operation name such as 'Click element' is normalised to ``click_element`` and
+    dispatched to the matching ``action_click_element(selector, value)`` method.
+    ``page`` has to be assigned by the owner before an action that touches the browser runs.
+    """
+    page = None
+
+    # Convert and perform "Click Button" for example
+    def call_action(self, action_name, selector=None, optional_value=None):
+        """Run the ``action_*`` handler for ``action_name``, then pause for 3 seconds.
+
+        :param action_name: operation label, e.g. 'Click element'; 'Choose one' is a no-op.
+        :param selector: CSS/xpath selector, may be None or empty for actions that need none.
+        :param optional_value: free-form value for the action, may be None.
+        :raises AttributeError: when no handler exists for the operation.
+        """
+        now = time.time()
+        call_action_name = re.sub('[^0-9a-zA-Z]+', '_', action_name.lower())
+        if call_action_name == 'choose_one':
+            return
+
+        print("> action calling", call_action_name)
+        # https://playwright.dev/python/docs/selectors#xpath-selectors
+        # selector is optional, so only inspect it when one was actually supplied
+        if selector and selector.startswith('/') and not selector.startswith('//'):
+            selector = "xpath=" + selector
+
+        action_handler = getattr(self, "action_" + call_action_name)
+
+        # Support for Jinja2 variables in the value and selector
+        # NOTE(review): user-supplied templates are rendered in a non-sandboxed Environment;
+        # consider jinja2.sandbox.SandboxedEnvironment - confirm the trust model first.
+        from jinja2 import Environment
+        jinja2_env = Environment(extensions=['jinja2_time.TimeExtension'])
+
+        if selector and ('{%' in selector or '{{' in selector):
+            selector = str(jinja2_env.from_string(selector).render())
+
+        if optional_value and ('{%' in optional_value or '{{' in optional_value):
+            optional_value = str(jinja2_env.from_string(optional_value).render())
+
+        action_handler(selector, optional_value)
+        # Give the page a moment to react before the caller takes a screenshot
+        self.page.wait_for_timeout(3 * 1000)
+        print("Call action done in", time.time() - now)
+
+    def action_goto_url(self, url, optional_value):
+        """Navigate to ``url`` and wait WEBDRIVER_DELAY_BEFORE_CONTENT_READY (default 5) seconds."""
+        # self.page.set_viewport_size({"width": 1280, "height": 5000})
+        now = time.time()
+        self.page.goto(url, timeout=0, wait_until='domcontentloaded')
+        print("Time to goto URL", time.time() - now)
+
+        # Wait_until = commit
+        # - `'commit'` - consider operation to be finished when network response is received and the document started loading.
+        # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
+        # This seemed to solve nearly all 'TimeoutErrors'
+        extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))
+        self.page.wait_for_timeout(extra_wait * 1000)
+
+    def action_click_element_containing_text(self, selector=None, value=''):
+        """Click the first element whose text matches ``value``; no-op when empty or not found."""
+        if not value or not value.strip():
+            return
+        elem = self.page.get_by_text(value)
+        if elem.count():
+            elem.first.click(delay=randint(200, 500))
+
+    def action_enter_text_in_field(self, selector, value):
+        """Fill the field at ``selector`` with ``value``; no-op without a selector."""
+        if not selector or not selector.strip():
+            return
+
+        self.page.fill(selector, value, timeout=10 * 1000)
+
+    def action_click_element(self, selector, value):
+        """Click the element at ``selector``; no-op without a selector."""
+        print("Clicking element")
+        if not selector or not selector.strip():
+            return
+        self.page.click(selector, timeout=10 * 1000, delay=randint(200, 500))
+
+    def action_click_element_if_exists(self, selector, value):
+        """Like ``action_click_element`` but ignores Playwright errors and timeouts."""
+        print("Clicking element if exists")
+        if not selector or not selector.strip():
+            return
+        import playwright._impl._api_types as _api_types
+        try:
+            self.page.click(selector, timeout=10 * 1000, delay=randint(200, 500))
+        except _api_types.TimeoutError as e:
+            return
+        except _api_types.Error as e:
+            # Element was there, but page redrew and now its long long gone
+            return
+
+    def action_click_x_y(self, selector, value):
+        """Click at the page coordinates given in ``value`` as 'x,y'; no-op when empty."""
+        if not value or not value.strip():
+            return
+        x, y = value.strip().split(',')
+        x = int(float(x.strip()))
+        y = int(float(y.strip()))
+        self.page.mouse.click(x=x, y=y, delay=randint(200, 500))
+
+    def action_scroll_down(self, selector, value):
+        """Scroll the mouse wheel down by 600 and wait one second."""
+        # Some sites this doesnt work on for some reason
+        self.page.mouse.wheel(0, 600)
+        self.page.wait_for_timeout(1000)
+
+    def action_wait_for_seconds(self, selector, value):
+        """Pause for ``value`` seconds (fractions allowed); no-op when empty."""
+        if value is None or not str(value).strip():
+            return
+        self.page.wait_for_timeout(float(value) * 1000)
+
+    def action_wait_for_text(self, selector, value):
+        """Wait until ``value`` appears in the text of the page body; no-op when empty."""
+        if not value:
+            return
+        # Handed over as an argument instead of being formatted into the expression,
+        # so quotes in the text cannot break the JavaScript
+        self.page.wait_for_function("text => document.body.innerText.includes(text)", arg=value)
+
+    def action_select_by_label(self, selector, value):
+        """Pick the option labelled ``value`` in the ``<select>`` at ``selector``."""
+        if not selector or not selector.strip():
+            return
+        self.page.select_option(selector, label=value, timeout=10 * 1000)
+
+    # @todo - in the future make some popout interface to capture what needs to be set
+    # https://playwright.dev/python/docs/api/class-keyboard
+    def action_press_enter(self, selector, value):
+        """Press the Enter key."""
+        self.page.keyboard.press("Enter", delay=randint(200, 500))
+
+    def action_press_page_up(self, selector, value):
+        """Press the PageUp key."""
+        self.page.keyboard.press("PageUp", delay=randint(200, 500))
+
+    def action_press_page_down(self, selector, value):
+        """Press the PageDown key."""
+        self.page.keyboard.press("PageDown", delay=randint(200, 500))
+
+    def action_check_checkbox(self, selector, value):
+        """Check the checkbox at ``selector``."""
+        self.page.locator(selector).check()
+
+    def action_uncheck_checkbox(self, selector, value):
+        """Uncheck the checkbox at ``selector``."""
+        self.page.locator(selector).uncheck()
+
+
+# Responsible for maintaining a live 'context' with browserless
+# @todo - how long do contexts live for anyway?
+class browsersteps_live_ui(steppable_browser_interface):
+    """One live Browser Steps session: a Playwright context and page on a shared browser.
+
+    The context and page are created in ``__init__`` from the ``playwright_browser`` that is
+    handed in; ``age_start`` records when the session was created.
+    """
+    context = None
+    page = None
+    render_extra_delay = 1
+    stale = False
+    # bump and kill this if idle after X sec
+    age_start = 0
+
+    # use a special driver, maybe locally etc
+    command_executor = os.getenv(
+        "PLAYWRIGHT_BROWSERSTEPS_DRIVER_URL"
+    )
+    # if not..
+    if not command_executor:
+        command_executor = os.getenv(
+            "PLAYWRIGHT_DRIVER_URL",
+            'ws://playwright-chrome:3000'
+        ).strip('"')
+
+    browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')
+
+    def __init__(self, playwright_browser, proxy=None):
+        """Remember the shared browser and open a context/page on it.
+
+        :param playwright_browser: connected Playwright browser to create the context on.
+        :param proxy: optional Playwright proxy settings, e.g. ``{'server': url}``.
+        """
+        self.age_start = time.time()
+        self.playwright_browser = playwright_browser
+        if self.context is None:
+            self.connect(proxy=proxy)
+
+    # Connect and setup a new context
+    def connect(self, proxy=None):
+        """Create the browser context and page and install the event listeners."""
+        # Should only get called once - test that
+        keep_open = 1000 * 60 * 5
+        now = time.time()
+
+        # @todo handle multiple contexts, bind a unique id from the browser on each req?
+        self.context = self.playwright_browser.new_context(
+            # @todo
+            #                user_agent=request_headers['User-Agent'] if request_headers.get('User-Agent') else 'Mozilla/5.0',
+            #               proxy=self.proxy,
+            # This is needed to enable JavaScript execution on GitHub and others
+            bypass_csp=True,
+            # Should never be needed
+            accept_downloads=False,
+            proxy=proxy
+        )
+
+        self.page = self.context.new_page()
+
+        # self.page.set_default_navigation_timeout(keep_open)
+        self.page.set_default_timeout(keep_open)
+        # @todo probably this doesnt work
+        self.page.on(
+            "close",
+            self.mark_as_closed,
+        )
+        # Listen for all console events and handle errors
+        self.page.on("console", lambda msg: print(f"Browser steps console - {msg.type}: {msg.text} {msg.args}"))
+
+        print("time to browser setup", time.time() - now)
+        self.page.wait_for_timeout(1 * 1000)
+
+    def mark_as_closed(self, page=None):
+        """Handler for the page "close" event.
+
+        :param page: the page object Playwright passes to "close" listeners; unused.
+        """
+        print("Page closed, cleaning up..")
+
+    @property
+    def has_expired(self):
+        """True when this session has no page, False otherwise."""
+        return not self.page
+
+    def get_current_state(self):
+        """Return the screenshot and interactive elements mapping, generally always called after action_()"""
+        from pkg_resources import resource_string
+        xpath_element_js = resource_string(__name__, "../../res/xpath_element_scraper.js").decode('utf-8')
+        now = time.time()
+        self.page.wait_for_timeout(1 * 1000)
+
+        # The actual screenshot
+        screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=40)
+
+        self.page.evaluate("var include_filters=''")
+        # Go find the interactive elements
+        # @todo in the future, something smarter that can scan for elements with .click/focus etc event handlers?
+        elements = 'a,button,input,select,textarea,i,th,td,p,li,h1,h2,h3,h4'
+        xpath_element_js = xpath_element_js.replace('%ELEMENTS%', elements)
+        xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}")
+        # So the JS will find the smallest one first
+        xpath_data['size_pos'] = sorted(xpath_data['size_pos'], key=lambda k: k['width'] * k['height'], reverse=True)
+        print("Time to complete get_current_state of browser", time.time() - now)
+        # except
+        # playwright._impl._api_types.Error: Browser closed.
+        # @todo show some countdown timer?
+        return (screenshot, xpath_data)
+
+    def request_visualselector_data(self):
+        """
+        Does the same that the playwright operation in content_fetcher does
+        This is used to just bump the VisualSelector data so it's ready to go if they click on the tab
+        @todo refactor and remove duplicate code, add include_filters
+        :return: tuple of (JPEG screenshot bytes, xpath data returned by the element scraper JS)
+        """
+
+        self.page.evaluate("var include_filters=''")
+        from pkg_resources import resource_string
+        # The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector
+        # @todo dont duplicate these selectors, or just let them both use the same data?
+        xpath_element_js = resource_string(__name__, "../../res/xpath_element_scraper.js").decode('utf-8')
+        xpath_element_js = xpath_element_js.replace('%ELEMENTS%',
+                                                    'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section')
+        xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}")
+
+        screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)))
+
+        return (screenshot, xpath_data)
diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py
index fdcd9988..18d40ad2 100644
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -1,7 +1,7 @@
from abc import abstractmethod
-from pkg_resources import resource_string
import chardet
import json
+import logging
import os
import requests
import sys
@@ -30,6 +30,12 @@ class JSActionExceptions(Exception):
self.message = message
return
+class BrowserStepsStepTimout(Exception):
+    """Raised when a Browser Step times out; ``step_n`` is the number of the failing step."""
+
+    def __init__(self, step_n):
+        self.step_n = step_n
+
+
class PageUnloadable(Exception):
def __init__(self, status_code, url, screenshot=False, message=False):
# Set this so we can use it in other parts of the app
@@ -70,6 +76,8 @@ class Fetcher():
status_code = None
content = None
headers = None
+ browser_steps = None
+ browser_steps_screenshot_path = None
fetcher_description = "No description"
webdriver_js_execute_code = None
@@ -86,8 +94,10 @@ class Fetcher():
render_extract_delay = 0
def __init__(self):
+ from pkg_resources import resource_string
# The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector
self.xpath_element_js = resource_string(__name__, "res/xpath_element_scraper.js").decode('utf-8')
+ self.xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section')
@abstractmethod
def get_error(self):
@@ -113,11 +123,62 @@ class Fetcher():
def get_last_status_code(self):
return self.status_code
+    @abstractmethod
+    def screenshot_step(self, step_n):
+        """Hook called around each Browser Step; this base version does nothing and returns None."""
+        return None
+
@abstractmethod
# Return true/false if this checker is ready to run, in the case it needs todo some special config check etc
def is_ready(self):
return True
+    def iterate_browser_steps(self):
+        """Replay the configured Browser Steps against ``self.page``.
+
+        Steps without an operation, the 'Choose one' placeholder and 'Goto site' are skipped.
+        A screenshot and the page HTML are saved before and after every remaining step via
+        ``screenshot_step`` / ``save_step_html``.
+
+        :raises BrowserStepsStepTimout: when a step runs into a Playwright timeout; it carries
+            the 1-based number of the failing step and processing stops there.
+        """
+        from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
+        from playwright._impl._api_types import TimeoutError
+        from jinja2 import Environment
+        jinja2_env = Environment(extensions=['jinja2_time.TimeExtension'])
+
+        if not self.browser_steps:
+            return
+
+        interface = steppable_browser_interface()
+        interface.page = self.page
+
+        valid_steps = [s for s in self.browser_steps
+                       if s['operation'] and s['operation'] not in ('Choose one', 'Goto site')]
+
+        for step_n, step in enumerate(valid_steps, start=1):
+            print(">> Iterating check - browser Step n {} - {}...".format(step_n, step['operation']))
+            self.screenshot_step("before-"+str(step_n))
+            self.save_step_html("before-"+str(step_n))
+            try:
+                # Either field may be stored as None; treat that like an empty string so the
+                # template checks below (and the step handlers) do not fail on it
+                optional_value = step['optional_value'] or ''
+                selector = step['selector'] or ''
+                # Support for jinja2 template in step values, with date module added
+                if '{%' in optional_value or '{{' in optional_value:
+                    optional_value = str(jinja2_env.from_string(optional_value).render())
+                if '{%' in selector or '{{' in selector:
+                    selector = str(jinja2_env.from_string(selector).render())
+
+                interface.call_action(action_name=step['operation'],
+                                      selector=selector,
+                                      optional_value=optional_value)
+                self.screenshot_step(step_n)
+                self.save_step_html(step_n)
+            except TimeoutError as e:
+                # Stop processing here
+                raise BrowserStepsStepTimout(step_n=step_n) from e
+
+
+
+    # It's always good to reset these
+    def delete_browser_steps_screenshots(self):
+        """Remove every ``step_*.jpeg`` file from the browser-steps screenshot directory, if one is set."""
+        import glob
+        if self.browser_steps_screenshot_path is None:
+            return
+        pattern = os.path.join(self.browser_steps_screenshot_path, 'step_*.jpeg')
+        for old_screenshot in glob.glob(pattern):
+            os.unlink(old_screenshot)
# Maybe for the future, each fetcher provides its own diff output, could be used for text, image
# the current one would return javascript output (as we use JS to generate the diff)
@@ -136,7 +197,6 @@ def available_fetchers():
return p
-
class base_html_playwright(Fetcher):
fetcher_description = "Playwright {}/Javascript".format(
os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
@@ -174,15 +234,26 @@ class base_html_playwright(Fetcher):
# allow per-watch proxy selection override
if proxy_override:
- # https://playwright.dev/docs/network#http-proxy
- from urllib.parse import urlparse
- parsed = urlparse(proxy_override)
- proxy_url = "{}://{}:{}".format(parsed.scheme, parsed.hostname, parsed.port)
- self.proxy = {'server': proxy_url}
- if parsed.username:
- self.proxy['username'] = parsed.username
- if parsed.password:
- self.proxy['password'] = parsed.password
+ self.proxy = {'server': proxy_override}
+
+    def screenshot_step(self, step_n=''):
+        """Capture a full-page JPEG of ``self.page`` and, when a screenshot directory is
+        configured, store it there as ``step_<step_n>.jpeg``."""
+
+        # There's a bug where we need to do it twice or it doesnt take the whole page, dont know why.
+        self.page.screenshot(type='jpeg', clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024})
+        image = self.page.screenshot(type='jpeg', full_page=True, quality=85)
+
+        if self.browser_steps_screenshot_path is None:
+            return
+
+        target = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n))
+        logging.debug("Saving step screenshot to {}".format(target))
+        with open(target, 'wb') as fh:
+            fh.write(image)
+
+    def save_step_html(self, step_n):
+        """Write the current HTML of ``self.page`` to ``step_<step_n>.html``.
+
+        The file goes into ``browser_steps_screenshot_path``; nothing is written when that
+        path is not set, the same rule ``screenshot_step`` applies to its JPEG files.
+        """
+        if self.browser_steps_screenshot_path is None:
+            # No directory configured - os.path.join(None, ...) would raise TypeError
+            return
+        content = self.page.content()
+        destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n))
+        logging.debug("Saving step HTML to {}".format(destination))
+        with open(destination, 'w') as f:
+            f.write(content)
def run(self,
url,
@@ -195,9 +266,9 @@ class base_html_playwright(Fetcher):
from playwright.sync_api import sync_playwright
import playwright._impl._api_types
- from playwright._impl._api_types import Error, TimeoutError
- response = None
+ self.delete_browser_steps_screenshots()
+ response = None
with sync_playwright() as p:
browser_type = getattr(p, self.browser_type)
@@ -217,89 +288,86 @@ class base_html_playwright(Fetcher):
accept_downloads=False
)
+ self.page = context.new_page()
if len(request_headers):
context.set_extra_http_headers(request_headers)
- page = context.new_page()
try:
- page.set_default_navigation_timeout(90000)
- page.set_default_timeout(90000)
+ self.page.set_default_navigation_timeout(90000)
+ self.page.set_default_timeout(90000)
# Listen for all console events and handle errors
- page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
+ self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
# Bug - never set viewport size BEFORE page.goto
+
# Waits for the next navigation. Using Python context manager
# prevents a race condition between clicking and waiting for a navigation.
- with page.expect_navigation():
- response = page.goto(url, wait_until='load')
+ with self.page.expect_navigation():
+ response = self.page.goto(url, wait_until='load')
+ # Wait_until = commit
+ # - `'commit'` - consider operation to be finished when network response is received and the document started loading.
+ # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
+ # This seemed to solve nearly all 'TimeoutErrors'
+ extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
+ self.page.wait_for_timeout(extra_wait * 1000)
+ if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
+ self.page.evaluate(self.webdriver_js_execute_code)
except playwright._impl._api_types.TimeoutError as e:
context.close()
browser.close()
# This can be ok, we will try to grab what we could retrieve
pass
-
except Exception as e:
- print("other exception when page.goto")
- print(str(e))
+ print ("other exception when page.goto")
+ print (str(e))
context.close()
browser.close()
- raise PageUnloadable(url=url, status_code=None, message=e.message)
+ raise PageUnloadable(url=url, status_code=None)
+
if response is None:
context.close()
browser.close()
- print("response object was none")
+ print ("response object was none")
raise EmptyReply(url=url, status_code=None)
+ # Bug 2(?) Set the viewport size AFTER loading the page
+ self.page.set_viewport_size({"width": 1280, "height": 1024})
+
+ # Run Browser Steps here
+ self.iterate_browser_steps()
- # Removed browser-set-size, seemed to be needed to make screenshots work reliably in older playwright versions
- # Was causing exceptions like 'waiting for page but content is changing' etc
- # https://www.browserstack.com/docs/automate/playwright/change-browser-window-size 1280x720 should be the default
-
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
time.sleep(extra_wait)
- if self.webdriver_js_execute_code is not None:
- try:
- page.evaluate(self.webdriver_js_execute_code)
- except Exception as e:
- # Is it possible to get a screenshot?
- error_screenshot = False
- try:
- page.screenshot(type='jpeg',
- clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024},
- quality=1)
- # The actual screenshot
- error_screenshot = page.screenshot(type='jpeg',
- full_page=True,
- quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)))
- except Exception as s:
- pass
-
- raise JSActionExceptions(status_code=response.status, screenshot=error_screenshot, message=str(e), url=url)
-
- else:
- # JS eval was run, now we also wait some time if possible to let the page settle
- if self.render_extract_delay:
- page.wait_for_timeout(self.render_extract_delay * 1000)
-
- page.wait_for_timeout(500)
-
- self.content = page.content()
+ self.content = self.page.content()
self.status_code = response.status
+
+ if len(self.page.content().strip()) == 0:
+ context.close()
+ browser.close()
+ print ("Content was empty")
+ raise EmptyReply(url=url, status_code=None)
+
+ # Bug 2(?) Set the viewport size AFTER loading the page
+ self.page.set_viewport_size({"width": 1280, "height": 1024})
+
+ self.status_code = response.status
+ self.content = self.page.content()
self.headers = response.all_headers()
+ # So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
if current_include_filters is not None:
- page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
+ self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
else:
- page.evaluate("var include_filters=''")
+ self.page.evaluate("var include_filters=''")
- self.xpath_data = page.evaluate("async () => {" + self.xpath_element_js + "}")
+ self.xpath_data = self.page.evaluate("async () => {" + self.xpath_element_js.replace('%ELEMENTS%', 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary') + "}")
# Bug 3 in Playwright screenshot handling
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
@@ -310,26 +378,17 @@ class base_html_playwright(Fetcher):
# acceptable screenshot quality here
try:
# Quality set to 1 because it's not used, just used as a work-around for a bug, no need to change this.
- page.screenshot(type='jpeg', clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024}, quality=1)
+ self.page.screenshot(type='jpeg', clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024}, quality=1)
# The actual screenshot
- self.screenshot = page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)))
+ self.screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)))
except Exception as e:
context.close()
browser.close()
raise ScreenshotUnavailable(url=url, status_code=None)
- if len(self.content.strip()) == 0:
- context.close()
- browser.close()
- print("Content was empty")
- raise EmptyReply(url=url, status_code=None, screenshot=self.screenshot)
-
context.close()
browser.close()
- if not ignore_status_codes and self.status_code!=200:
- raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, page_html=self.content, screenshot=self.screenshot)
-
class base_html_webdriver(Fetcher):
if os.getenv("WEBDRIVER_URL"):
fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
@@ -423,7 +482,6 @@ class base_html_webdriver(Fetcher):
def is_ready(self):
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
- from selenium.common.exceptions import WebDriverException
self.driver = webdriver.Remote(
command_executor=self.command_executor,
diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py
index 31c0bb7f..68762f45 100644
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -108,6 +108,11 @@ class perform_site_check():
elif system_webdriver_delay is not None:
fetcher.render_extract_delay = system_webdriver_delay
+ # Possible conflict
+ if prefer_backend == 'html_webdriver':
+ fetcher.browser_steps = watch.get('browser_steps', None)
+ fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, uuid)
+
if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip():
fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code')
diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py
index 2904f461..57dd7c77 100644
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -1,11 +1,10 @@
+import os
import re
from wtforms import (
BooleanField,
- Field,
Form,
IntegerField,
- PasswordField,
RadioField,
SelectField,
StringField,
@@ -13,15 +12,17 @@ from wtforms import (
TextAreaField,
fields,
validators,
- widgets,
+ widgets
)
+from wtforms.fields import FieldList
from wtforms.validators import ValidationError
+# default
+# each select