Compare commits

..

4 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| dgtlmoon | ec77b45e84 | WIP | 2023-04-08 21:14:03 +02:00 |
| dgtlmoon | 138f7fc59c | WIP | 2023-04-08 20:35:13 +02:00 |
| dgtlmoon | 56b768d24f | WIP | 2023-04-08 20:12:30 +02:00 |
| dgtlmoon | a61d7b4284 | Attempt to abstract out each fetch type (requests/playwright/webdriver etc) | 2023-04-08 18:49:27 +02:00 |
56 changed files with 1160 additions and 1997 deletions


@@ -30,7 +30,7 @@ jobs:
# Selenium+browserless
docker run --network changedet-network -d --hostname selenium -p 4444:4444 --rm --shm-size="2g" selenium/standalone-chrome-debug:3.141.59
docker run --network changedet-network -d --hostname browserless -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm -p 3000:3000 --shm-size="2g" browserless/chrome:1.53-chrome-stable
docker run --network changedet-network -d --hostname browserless -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm -p 3000:3000 --shm-size="2g" browserless/chrome:1.53-chrome-stable
- name: Build changedetection.io container for testing
run: |
@@ -55,19 +55,9 @@ jobs:
# Playwright/Browserless fetch
docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
# Settings headers playwright tests - Call back in from Browserless, check headers
docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000?dumpio=true" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py'
docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py'
docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000?dumpio=true" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py'
# restock detection via playwright - added name=changedet here so that playwright/browserless can connect to it
docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py'
- name: Test with puppeteer fetcher and disk cache
run: |
docker run --rm -e "PUPPETEER_DISK_CACHE=/tmp/data/" -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
# Browserless would have had -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" added above
- name: Test proxy interaction
run: |
cd changedetectionio
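
For context on the CI wiring above: the test containers select their fetch backend purely through environment variables, with PLAYWRIGHT_DRIVER_URL pointing at the browserless container and WEBDRIVER_URL at the Selenium one. A minimal standalone sketch (not code from this diff; the URL and target page are illustrative) of what the Playwright side of that wiring does:

```python
# Minimal sketch: attach Playwright to a remote browserless/Chrome instance the way
# the CI jobs above do via PLAYWRIGHT_DRIVER_URL. URL and target page are illustrative.
import os
from playwright.sync_api import sync_playwright

driver_url = os.getenv("PLAYWRIGHT_DRIVER_URL", "ws://browserless:3000")

with sync_playwright() as p:
    # connect_over_cdp() attaches to an already-running Chrome instead of launching one locally
    browser = p.chromium.connect_over_cdp(driver_url, timeout=60000)
    page = browser.new_page()
    page.goto("https://example.com", wait_until="commit")
    print(page.title())
    browser.close()
```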


@@ -1,5 +1,5 @@
# pip dependencies install stage
FROM python:3.10-slim-bookworm as builder
FROM python:3.10-slim as builder
# See `cryptography` pin comment in requirements.txt
ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1
@@ -29,10 +29,10 @@ RUN pip install --target=/dependencies playwright~=1.27.1 \
|| echo "WARN: Failed to install Playwright. The application can still run, but the Playwright option will be disabled."
# Final image stage
FROM python:3.10-slim-bookworm
FROM python:3.10-slim
RUN apt-get update && apt-get install -y --no-install-recommends \
libssl3 \
libssl1.1 \
libxslt1.1 \
# For pdftohtml
poppler-utils \


@@ -2,10 +2,10 @@
Live your data-life pro-actively, track website content changes and receive notifications via Discord, Email, Slack, Telegram and 70+ more
[<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot.png" style="max-width:100%;" alt="Self-hosted web page change monitoring" title="Self-hosted web page change monitoring" />](https://changedetection.io)
[<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot.png" style="max-width:100%;" alt="Self-hosted web page change monitoring" title="Self-hosted web page change monitoring" />](https://lemonade.changedetection.io/start?src=pip)
[**Don't have time? Let us host it for you! try our extremely affordable subscription use our proxies and support!**](https://changedetection.io)
[**Don't have time? Let us host it for you! try our extremely affordable subscription use our proxies and support!**](https://lemonade.changedetection.io/start)
#### Example use cases


@@ -5,13 +5,13 @@
_Live your data-life pro-actively._
[<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot.png" style="max-width:100%;" alt="Self-hosted web page change monitoring" title="Self-hosted web page change monitoring" />](https://changedetection.io?src=github)
[<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot.png" style="max-width:100%;" alt="Self-hosted web page change monitoring" title="Self-hosted web page change monitoring" />](https://lemonade.changedetection.io/start?src=github)
[![Release Version][release-shield]][release-link] [![Docker Pulls][docker-pulls]][docker-link] [![License][license-shield]](LICENSE.md)
![changedetection.io](https://github.com/dgtlmoon/changedetection.io/actions/workflows/test-only.yml/badge.svg?branch=master)
[**Don't have time? Let us host it for you! try our $8.99/month subscription - use our proxies and support!**](https://changedetection.io) , _half the price of other website change monitoring services!_
[**Don't have time? Let us host it for you! try our $8.99/month subscription - use our proxies and support!**](https://lemonade.changedetection.io/start) , _half the price of other website change monitoring services and comes with unlimited watches & checks!_
- Chrome browser included.
- Super fast, no registration needed setup.
@@ -22,11 +22,11 @@ _Live your data-life pro-actively._
Available when connected to a <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Playwright-content-fetcher">playwright content fetcher</a> (included as part of our subscription service)
[<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/visualselector-anim.gif" style="max-width:100%;" alt="Self-hosted web page change monitoring context difference " title="Self-hosted web page change monitoring context difference " />](https://changedetection.io?src=github)
[<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/visualselector-anim.gif" style="max-width:100%;" alt="Self-hosted web page change monitoring context difference " title="Self-hosted web page change monitoring context difference " />](https://lemonade.changedetection.io/start?src=github)
### Easily see what changed, examine by word, line, or individual character.
[<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot-diff.png" style="max-width:100%;" alt="Self-hosted web page change monitoring context difference " title="Self-hosted web page change monitoring context difference " />](https://changedetection.io?src=github)
[<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot-diff.png" style="max-width:100%;" alt="Self-hosted web page change monitoring context difference " title="Self-hosted web page change monitoring context difference " />](https://lemonade.changedetection.io/start?src=github)
### Perform interactive browser steps
@@ -35,7 +35,7 @@ Fill in text boxes, click buttons and more, setup your changedetection scenario.
Using the **Browser Steps** configuration, add basic steps before performing change detection, such as logging into websites, adding a product to a cart, accept cookie logins, entering dates and refining searches.
[<img src="docs/browsersteps-anim.gif" style="max-width:100%;" alt="Self-hosted web page change monitoring context difference " title="Website change detection with interactive browser steps, login, cookies etc" />](https://changedetection.io?src=github)
[<img src="docs/browsersteps-anim.gif" style="max-width:100%;" alt="Self-hosted web page change monitoring context difference " title="Website change detection with interactive browser steps, login, cookies etc" />](https://lemonade.changedetection.io/start?src=github)
After **Browser Steps** have been run, then visit the **Visual Selector** tab to refine the content you're interested in.
Requires Playwright to be enabled.
@@ -66,7 +66,6 @@ Requires Playwright to be enabled.
- Proactively search for jobs, get notified when companies update their careers page, search job portals for keywords.
- Get alerts when new job positions are open on Bamboo HR and other job platforms
- Website defacement monitoring
- Pokémon Card Restock Tracker / Pokémon TCG Tracker
_Need an actual Chrome runner with Javascript support? We support fetching via WebDriver and Playwright!</a>_
@@ -145,7 +144,7 @@ See the wiki for more information https://github.com/dgtlmoon/changedetection.io
## Filters
XPath, JSONPath, jq, and CSS support comes baked in! You can be as specific as you need, use XPath exported from various XPath element query creation tools.
(We support LXML `re:test`, `re:match` and `re:replace`.)
(We support LXML `re:test`, `re:math` and `re:replace`.)
## Notifications
@@ -238,7 +237,7 @@ Supports managing the website watch list [via our API](https://changedetection.i
Do you use changedetection.io to make money? does it save you time or money? Does it make your life easier? less stressful? Remember, we write this software when we should be doing actual paid work, we have to buy food and pay rent just like you.
Firstly, consider taking out a [change detection monthly subscription - unlimited checks and watches](https://changedetection.io?src=github) , even if you don't use it, you still get the warm fuzzy feeling of helping out the project. (And who knows, you might just use it!)
Firstly, consider taking out a [change detection monthly subscription - unlimited checks and watches](https://lemonade.changedetection.io/start) , even if you don't use it, you still get the warm fuzzy feeling of helping out the project. (And who knows, you might just use it!)
Or directly donate an amount PayPal [![Donate](https://img.shields.io/badge/Donate-PayPal-green.svg)](https://www.paypal.com/donate/?hosted_button_id=7CP6HR9ZCNDYJ)
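
A side note on the Filters paragraph above: the EXSLT regular-expression functions it mentions (`re:test`, `re:match`, `re:replace`) come from lxml. Below is a hedged, self-contained illustration of how `re:test` is used inside an XPath filter; the markup and expression are invented for the example, not taken from this repository.

```python
# Standalone illustration of lxml's EXSLT regexp support (re:test etc.) referenced in the
# README hunk above. The HTML snippet and XPath expression are made up for the example.
from lxml import html

EXSLT_RE = {"re": "http://exslt.org/regular-expressions"}

doc = html.fromstring("<div><p>Price: 19.99 EUR</p><p>Out of stock</p></div>")

# Keep only <p> elements whose text matches a price-like pattern
prices = doc.xpath(r"//p[re:test(text(), '\d+\.\d{2}')]", namespaces=EXSLT_RE)
print([p.text for p in prices])  # ['Price: 19.99 EUR']
```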


@@ -33,12 +33,10 @@ from flask import (
url_for,
)
from flask_paginate import Pagination, get_page_parameter
from changedetectionio import html_tools
from changedetectionio.api import api_v1
__version__ = '0.42.3'
__version__ = '0.41.1'
datastore = None
@@ -124,15 +122,6 @@ def _jinja2_filter_datetimestamp(timestamp, format="%Y-%m-%d %H:%M:%S"):
return timeago.format(timestamp, time.time())
@app.template_filter('pagination_slice')
def _jinja2_filter_pagination_slice(arr, skip):
per_page = datastore.data['settings']['application'].get('pager_size', 50)
if per_page:
return arr[skip:skip + per_page]
return arr
@app.template_filter('format_seconds_ago')
def _jinja2_filter_seconds_precise(timestamp):
if timestamp == False:
@@ -412,41 +401,24 @@ def changedetection_app(config=None, datastore_o=None):
# Sort by last_changed and add the uuid which is usually the key..
sorted_watches = []
search_q = request.args.get('q').strip().lower() if request.args.get('q') else False
for uuid, watch in datastore.data['watching'].items():
if limit_tag:
if limit_tag != None:
# Support for comma separated list of tags.
if not watch.get('tag'):
if watch['tag'] is None:
continue
for tag_in_watch in watch.get('tag', '').split(','):
for tag_in_watch in watch['tag'].split(','):
tag_in_watch = tag_in_watch.strip()
if tag_in_watch == limit_tag:
watch['uuid'] = uuid
if search_q:
if (watch.get('title') and search_q in watch.get('title').lower()) or search_q in watch.get('url', '').lower():
sorted_watches.append(watch)
else:
sorted_watches.append(watch)
sorted_watches.append(watch)
else:
#watch['uuid'] = uuid
if search_q:
if (watch.get('title') and search_q in watch.get('title').lower()) or search_q in watch.get('url', '').lower():
sorted_watches.append(watch)
else:
sorted_watches.append(watch)
watch['uuid'] = uuid
sorted_watches.append(watch)
existing_tags = datastore.get_all_tags()
form = forms.quickWatchForm(request.form)
page = request.args.get(get_page_parameter(), type=int, default=1)
total_count = len(sorted_watches)
pagination = Pagination(page=page,
total=total_count,
per_page=datastore.data['settings']['application'].get('pager_size', 50), css_framework="semantic")
output = render_template(
"watch-overview.html",
# Don't link to hosting when we're on the hosting environment
@@ -457,29 +429,16 @@ def changedetection_app(config=None, datastore_o=None):
has_proxies=datastore.proxy_list,
has_unviewed=datastore.has_unviewed,
hosted_sticky=os.getenv("SALTED_PASS", False) == False,
pagination=pagination,
queued_uuids=[q_uuid.item['uuid'] for q_uuid in update_q.queue],
search_q=request.args.get('q','').strip(),
sort_attribute=request.args.get('sort') if request.args.get('sort') else request.cookies.get('sort'),
sort_order=request.args.get('order') if request.args.get('order') else request.cookies.get('order'),
system_default_fetcher=datastore.data['settings']['application'].get('fetch_backend'),
tags=existing_tags,
watches=sorted_watches
)
if session.get('share-link'):
del(session['share-link'])
resp = make_response(output)
# The template can run on cookie or url query info
if request.args.get('sort'):
resp.set_cookie('sort', request.args.get('sort'))
if request.args.get('order'):
resp.set_cookie('order', request.args.get('order'))
return resp
return output
# AJAX endpoint for sending a test
@@ -504,19 +463,11 @@ def changedetection_app(config=None, datastore_o=None):
try:
n_object = {'watch_url': request.form['window_url'],
'notification_urls': request.form['notification_urls'].splitlines()
'notification_urls': request.form['notification_urls'].splitlines(),
'notification_title': request.form['notification_title'].strip(),
'notification_body': request.form['notification_body'].strip(),
'notification_format': request.form['notification_format'].strip()
}
# Only use if present, if not set in n_object it should use the default system value
if 'notification_format' in request.form and request.form['notification_format'].strip():
n_object['notification_format'] = request.form.get('notification_format', '').strip()
if 'notification_title' in request.form and request.form['notification_title'].strip():
n_object['notification_title'] = request.form.get('notification_title', '').strip()
if 'notification_body' in request.form and request.form['notification_body'].strip():
n_object['notification_body'] = request.form.get('notification_body', '').strip()
notification_q.put(n_object)
except Exception as e:
return make_response({'error': str(e)}, 400)
@@ -713,7 +664,6 @@ def changedetection_app(config=None, datastore_o=None):
form=form,
has_default_notification_urls=True if len(datastore.data['settings']['application']['notification_urls']) else False,
has_empty_checktime=using_default_check_time,
has_extra_headers_file=watch.has_extra_headers_file or datastore.has_extra_headers_file,
is_html_webdriver=is_html_webdriver,
jq_support=jq_support,
playwright_enabled=os.getenv('PLAYWRIGHT_DRIVER_URL', False),
@@ -729,7 +679,7 @@ def changedetection_app(config=None, datastore_o=None):
@app.route("/settings", methods=['GET', "POST"])
@login_optionally_required
def settings_page():
from changedetectionio import content_fetcher, forms
from . import forms
default = deepcopy(datastore.data['settings'])
if datastore.proxy_list is not None:
@@ -1308,13 +1258,6 @@ def changedetection_app(config=None, datastore_o=None):
datastore.data['watching'][uuid.strip()]['paused'] = False
flash("{} watches unpaused".format(len(uuids)))
elif (op == 'mark-viewed'):
for uuid in uuids:
uuid = uuid.strip()
if datastore.data['watching'].get(uuid):
datastore.set_last_viewed(uuid, int(time.time()))
flash("{} watches updated".format(len(uuids)))
elif (op == 'mute'):
for uuid in uuids:
uuid = uuid.strip()
@@ -1337,13 +1280,6 @@ def changedetection_app(config=None, datastore_o=None):
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
flash("{} watches queued for rechecking".format(len(uuids)))
elif (op == 'clear-history'):
for uuid in uuids:
uuid = uuid.strip()
if datastore.data['watching'].get(uuid):
datastore.clear_watch_history(uuid)
flash("{} watches cleared/reset.".format(len(uuids)))
elif (op == 'notification-default'):
from changedetectionio.notification import (
default_notification_format_for_watch
@@ -1458,7 +1394,6 @@ def check_for_new_version():
# Check daily
app.config.exit.wait(86400)
def notification_runner():
global notification_debug_log
from datetime import datetime
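
For readers following the index-page hunks above: the `pagination_slice` template filter and the `flask_paginate` `Pagination` object work together, with the filter trimming the watch list to the current page while `Pagination` renders the pager. A minimal sketch of that pattern under assumed defaults (the route, data and per-page value are illustrative, not lifted from this diff):

```python
# Minimal sketch of the pagination pattern touched in the hunks above:
# a Jinja filter that slices the list, plus flask_paginate for the pager widget.
from flask import Flask, request
from flask_paginate import Pagination, get_page_parameter

app = Flask(__name__)
PER_PAGE = 50  # stand-in for the 'pager_size' application setting

@app.template_filter('pagination_slice')
def pagination_slice(arr, skip):
    # Return only the slice of items that belongs to the requested page
    return arr[skip:skip + PER_PAGE]

@app.route("/")
def index():
    watches = [f"watch-{i}" for i in range(137)]  # stand-in data
    page = request.args.get(get_page_parameter(), type=int, default=1)
    pagination = Pagination(page=page, total=len(watches),
                            per_page=PER_PAGE, css_framework="semantic")
    skip = (page - 1) * PER_PAGE
    # In a template this would be used as: watches|pagination_slice(skip)
    return {"page": page, "items": pagination_slice(watches, skip)}
```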


@@ -27,106 +27,58 @@ import os
import logging
from changedetectionio.store import ChangeDetectionStore
from changedetectionio import login_optionally_required
browsersteps_live_ui_o = {}
browsersteps_playwright_browser_interface = None
browsersteps_playwright_browser_interface_browser = None
browsersteps_playwright_browser_interface_context = None
browsersteps_playwright_browser_interface_end_time = None
browsersteps_playwright_browser_interface_start_time = None
browsersteps_sessions = {}
io_interface_context = None
def cleanup_playwright_session():
global browsersteps_live_ui_o
global browsersteps_playwright_browser_interface
global browsersteps_playwright_browser_interface_browser
global browsersteps_playwright_browser_interface_context
global browsersteps_playwright_browser_interface_end_time
global browsersteps_playwright_browser_interface_start_time
browsersteps_live_ui_o = {}
browsersteps_playwright_browser_interface = None
browsersteps_playwright_browser_interface_browser = None
browsersteps_playwright_browser_interface_end_time = None
browsersteps_playwright_browser_interface_start_time = None
print("Cleaning up old playwright session because time was up, calling .goodbye()")
try:
browsersteps_playwright_browser_interface_context.goodbye()
except Exception as e:
print ("Got exception in shutdown, probably OK")
print (str(e))
browsersteps_playwright_browser_interface_context = None
print ("Cleaning up old playwright session because time was up - done")
def construct_blueprint(datastore: ChangeDetectionStore):
browser_steps_blueprint = Blueprint('browser_steps', __name__, template_folder="templates")
def start_browsersteps_session(watch_uuid):
from . import nonContext
from . import browser_steps
import time
global browsersteps_sessions
global io_interface_context
# We keep the playwright session open for many minutes
seconds_keepalive = int(os.getenv('BROWSERSTEPS_MINUTES_KEEPALIVE', 10)) * 60
browsersteps_start_session = {'start_time': time.time()}
# You can only have one of these running
# This should be very fine to leave running for the life of the application
# @idea - Make it global so the pool of watch fetchers can use it also
if not io_interface_context:
io_interface_context = nonContext.c_sync_playwright()
# Start the Playwright context, which is actually a nodejs sub-process and communicates over STDIN/STDOUT pipes
io_interface_context = io_interface_context.start()
# keep it alive for 10 seconds more than we advertise, sometimes it helps to keep it shutting down cleanly
keepalive = "&timeout={}".format(((seconds_keepalive + 3) * 1000))
try:
browsersteps_start_session['browser'] = io_interface_context.chromium.connect_over_cdp(
os.getenv('PLAYWRIGHT_DRIVER_URL', '') + keepalive)
except Exception as e:
if 'ECONNREFUSED' in str(e):
return make_response('Unable to start the Playwright Browser session, is it running?', 401)
else:
return make_response(str(e), 401)
proxy_id = datastore.get_preferred_proxy_for_watch(uuid=watch_uuid)
proxy = None
if proxy_id:
proxy_url = datastore.proxy_list.get(proxy_id).get('url')
if proxy_url:
# Playwright needs separate username and password values
from urllib.parse import urlparse
parsed = urlparse(proxy_url)
proxy = {'server': proxy_url}
if parsed.username:
proxy['username'] = parsed.username
if parsed.password:
proxy['password'] = parsed.password
print("Browser Steps: UUID {} selected proxy {}".format(watch_uuid, proxy_url))
# Tell Playwright to connect to Chrome and setup a new session via our stepper interface
browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui(
playwright_browser=browsersteps_start_session['browser'],
proxy=proxy)
# For test
#browsersteps_start_session['browserstepper'].action_goto_url(value="http://example.com?time="+str(time.time()))
return browsersteps_start_session
@login_optionally_required
@browser_steps_blueprint.route("/browsersteps_start_session", methods=['GET'])
def browsersteps_start_session():
# A new session was requested, return sessionID
import uuid
global browsersteps_sessions
browsersteps_session_id = str(uuid.uuid4())
watch_uuid = request.args.get('uuid')
if not watch_uuid:
return make_response('No Watch UUID specified', 500)
print("Starting connection with playwright")
logging.debug("browser_steps.py connecting")
browsersteps_sessions[browsersteps_session_id] = start_browsersteps_session(watch_uuid)
print("Starting connection with playwright - done")
return {'browsersteps_session_id': browsersteps_session_id}
# A request for an action was received
@login_optionally_required
@browser_steps_blueprint.route("/browsersteps_update", methods=['POST'])
@browser_steps_blueprint.route("/browsersteps_update", methods=['GET', 'POST'])
def browsersteps_ui_update():
import base64
import playwright._impl._api_types
global browsersteps_sessions
import time
from changedetectionio.blueprint.browser_steps import browser_steps
global browsersteps_live_ui_o, browsersteps_playwright_browser_interface_end_time
global browsersteps_playwright_browser_interface_browser
global browsersteps_playwright_browser_interface
global browsersteps_playwright_browser_interface_start_time
step_n = None
remaining =0
uuid = request.args.get('uuid')
@@ -135,9 +87,13 @@ def construct_blueprint(datastore: ChangeDetectionStore):
if not browsersteps_session_id:
return make_response('No browsersteps_session_id specified', 500)
if not browsersteps_sessions.get(browsersteps_session_id):
return make_response('No session exists under that ID', 500)
# Because we don't "really" run in a context manager ( we make the playwright interface global/long-living )
# We need to manage the shutdown when the time is up
if browsersteps_playwright_browser_interface_end_time:
remaining = browsersteps_playwright_browser_interface_end_time-time.time()
if browsersteps_playwright_browser_interface_end_time and remaining <= 0:
cleanup_playwright_session()
return make_response('Browser session expired, please reload the Browser Steps interface', 401)
# Actions - step/apply/etc, do the thing and return state
if request.method == 'POST':
@@ -156,7 +112,12 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# @todo try.. accept.. nice errors not popups..
try:
browsersteps_sessions[browsersteps_session_id]['browserstepper'].call_action(action_name=step_operation,
this_session = browsersteps_live_ui_o.get(browsersteps_session_id)
if not this_session:
print("Browser exited")
return make_response('Browser session ran out of time :( Please reload this page.', 401)
this_session.call_action(action_name=step_operation,
selector=step_selector,
optional_value=step_optional_value)
@@ -168,43 +129,99 @@ def construct_blueprint(datastore: ChangeDetectionStore):
# Get visual selector ready/update its data (also use the current filter info from the page?)
# When the last 'apply' button was pressed
# @todo this adds overhead because the xpath selection is happening twice
u = browsersteps_sessions[browsersteps_session_id]['browserstepper'].page.url
u = this_session.page.url
if is_last_step and u:
(screenshot, xpath_data) = browsersteps_sessions[browsersteps_session_id]['browserstepper'].request_visualselector_data()
(screenshot, xpath_data) = this_session.request_visualselector_data()
datastore.save_screenshot(watch_uuid=uuid, screenshot=screenshot)
datastore.save_xpath_data(watch_uuid=uuid, data=xpath_data)
# if not this_session.page:
# cleanup_playwright_session()
# return make_response('Browser session ran out of time :( Please reload this page.', 401)
# Setup interface
if request.method == 'GET':
# Screenshots and other info only needed on requesting a step (POST)
try:
state = browsersteps_sessions[browsersteps_session_id]['browserstepper'].get_current_state()
except playwright._impl._api_types.Error as e:
return make_response("Browser session ran out of time :( Please reload this page."+str(e), 401)
if not browsersteps_playwright_browser_interface:
print("Starting connection with playwright")
logging.debug("browser_steps.py connecting")
# Use send_file() which is way faster than read/write loop on bytes
import json
from tempfile import mkstemp
from flask import send_file
tmp_fd, tmp_file = mkstemp(text=True, suffix=".json", prefix="changedetectionio-")
global browsersteps_playwright_browser_interface_context
from . import nonContext
browsersteps_playwright_browser_interface_context = nonContext.c_sync_playwright()
browsersteps_playwright_browser_interface = browsersteps_playwright_browser_interface_context.start()
output = json.dumps({'screenshot': "data:image/jpeg;base64,{}".format(
base64.b64encode(state[0]).decode('ascii')),
'xpath_data': state[1],
'session_age_start': browsersteps_sessions[browsersteps_session_id]['browserstepper'].age_start,
'browser_time_remaining': round(remaining)
})
time.sleep(1)
# At 20 minutes, some other variable is closing it
# @todo find out what it is and set it
seconds_keepalive = int(os.getenv('BROWSERSTEPS_MINUTES_KEEPALIVE', 10)) * 60
with os.fdopen(tmp_fd, 'w') as f:
f.write(output)
# keep it alive for 10 seconds more than we advertise, sometimes it helps to keep it shutting down cleanly
keepalive = "&timeout={}".format(((seconds_keepalive+3) * 1000))
try:
browsersteps_playwright_browser_interface_browser = browsersteps_playwright_browser_interface.chromium.connect_over_cdp(
os.getenv('PLAYWRIGHT_DRIVER_URL', '') + keepalive)
except Exception as e:
if 'ECONNREFUSED' in str(e):
return make_response('Unable to start the Playwright session properly, is it running?', 401)
response = make_response(send_file(path_or_file=tmp_file,
mimetype='application/json; charset=UTF-8',
etag=True))
# No longer needed
os.unlink(tmp_file)
browsersteps_playwright_browser_interface_end_time = time.time() + (seconds_keepalive-3)
print("Starting connection with playwright - done")
if not browsersteps_live_ui_o.get(browsersteps_session_id):
# Boot up a new session
proxy_id = datastore.get_preferred_proxy_for_watch(uuid=uuid)
proxy = None
if proxy_id:
proxy_url = datastore.proxy_list.get(proxy_id).get('url')
if proxy_url:
proxy = {'server': proxy_url}
print("Browser Steps: UUID {} Using proxy {}".format(uuid, proxy_url))
# Begin the new "Playwright Context" that re-uses the playwright interface
# Each session is a "Playwright Context" as a list, that uses the playwright interface
browsersteps_live_ui_o[browsersteps_session_id] = browser_steps.browsersteps_live_ui(
playwright_browser=browsersteps_playwright_browser_interface_browser,
proxy=proxy)
this_session = browsersteps_live_ui_o[browsersteps_session_id]
if not this_session.page:
cleanup_playwright_session()
return make_response('Browser session ran out of time :( Please reload this page.', 401)
response = None
if request.method == 'POST':
# Screenshots and other info only needed on requesting a step (POST)
try:
state = this_session.get_current_state()
except playwright._impl._api_types.Error as e:
return make_response("Browser session ran out of time :( Please reload this page."+str(e), 401)
# Use send_file() which is way faster than read/write loop on bytes
import json
from tempfile import mkstemp
from flask import send_file
tmp_fd, tmp_file = mkstemp(text=True, suffix=".json", prefix="changedetectionio-")
output = json.dumps({'screenshot': "data:image/jpeg;base64,{}".format(
base64.b64encode(state[0]).decode('ascii')),
'xpath_data': state[1],
'session_age_start': this_session.age_start,
'browser_time_remaining': round(remaining)
})
with os.fdopen(tmp_fd, 'w') as f:
f.write(output)
response = make_response(send_file(path_or_file=tmp_file,
mimetype='application/json; charset=UTF-8',
etag=True))
# No longer needed
os.unlink(tmp_file)
elif request.method == 'GET':
# Just enough to get the session rolling, it will call for goto-site via POST next
response = make_response({
'session_age_start': this_session.age_start,
'browser_time_remaining': round(remaining)
})
return response
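
One implementation detail from the handler above worth spelling out: the JSON state (screenshot plus xpath data) is written to a temporary file and returned with `send_file()` rather than streamed byte-by-byte, and the temp file path is unlinked once the response has been built. A stripped-down sketch of that same pattern, with an illustrative route name and payload:

```python
# Standalone sketch of the "dump JSON to a temp file, return it via send_file()" pattern
# used by browsersteps_ui_update() above. Payload and route are illustrative.
import json
import os
from tempfile import mkstemp
from flask import Flask, make_response, send_file

app = Flask(__name__)

@app.route("/state")
def state():
    payload = {"screenshot": "data:image/jpeg;base64,...", "xpath_data": {}}  # stand-in data
    tmp_fd, tmp_file = mkstemp(text=True, suffix=".json", prefix="changedetectionio-")
    with os.fdopen(tmp_fd, 'w') as f:
        f.write(json.dumps(payload))
    response = make_response(send_file(path_or_file=tmp_file,
                                       mimetype='application/json; charset=UTF-8',
                                       etag=True))
    # Mirror the handler above: remove the temp file once the response has been built
    os.unlink(tmp_file)
    return response
```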


@@ -71,10 +71,10 @@ class steppable_browser_interface():
optional_value = str(jinja2_env.from_string(optional_value).render())
action_handler(selector, optional_value)
self.page.wait_for_timeout(1.5 * 1000)
self.page.wait_for_timeout(3 * 1000)
print("Call action done in", time.time() - now)
def action_goto_url(self, selector=None, value=None):
def action_goto_url(self, selector, value):
# self.page.set_viewport_size({"width": 1280, "height": 5000})
now = time.time()
response = self.page.goto(value, timeout=0, wait_until='commit')
@@ -105,8 +105,7 @@ class steppable_browser_interface():
print("Clicking element")
if not len(selector.strip()):
return
self.page.click(selector=selector, timeout=30 * 1000, delay=randint(200, 500))
self.page.click(selector, timeout=10 * 1000, delay=randint(200, 500))
def action_click_element_if_exists(self, selector, value):
import playwright._impl._api_types as _api_types
@@ -133,18 +132,18 @@ class steppable_browser_interface():
self.page.wait_for_timeout(1000)
def action_wait_for_seconds(self, selector, value):
self.page.wait_for_timeout(float(value.strip()) * 1000)
self.page.wait_for_timeout(int(value) * 1000)
def action_wait_for_text(self, selector, value):
import json
v = json.dumps(value)
self.page.wait_for_function(f'document.querySelector("body").innerText.includes({v});', timeout=90000)
self.page.wait_for_function(f'document.querySelector("body").innerText.includes({v});', timeout=30000)
def action_wait_for_text_in_element(self, selector, value):
import json
s = json.dumps(selector)
v = json.dumps(value)
self.page.wait_for_function(f'document.querySelector({s}).innerText.includes({v});', timeout=90000)
self.page.wait_for_function(f'document.querySelector({s}).innerText.includes({v});', timeout=30000)
# @todo - in the future make some popout interface to capture what needs to be set
# https://playwright.dev/python/docs/api/class-keyboard
@@ -238,7 +237,7 @@ class browsersteps_live_ui(steppable_browser_interface):
def get_current_state(self):
"""Return the screenshot and interactive elements mapping, generally always called after action_()"""
from pkg_resources import resource_string
xpath_element_js = resource_string(__name__, "../../res/xpath_element_scraper.js").decode('utf-8')
xpath_element_js = resource_string(__name__, "../res/xpath_element_scraper.js").decode('utf-8')
now = time.time()
self.page.wait_for_timeout(1 * 1000)
@@ -273,8 +272,8 @@ class browsersteps_live_ui(steppable_browser_interface):
self.page.evaluate("var include_filters=''")
from pkg_resources import resource_string
# The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector
xpath_element_js = resource_string(__name__, "../../res/xpath_element_scraper.js").decode('utf-8')
from changedetectionio.content_fetcher import visualselector_xpath_selectors
xpath_element_js = resource_string(__name__, "../res/xpath_element_scraper.js").decode('utf-8')
from changedetectionio.fetchers import visualselector_xpath_selectors
xpath_element_js = xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}")
screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)))
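
A brief note on the `wait_for_text` / `wait_for_text_in_element` actions above: the selector and value are passed through `json.dumps()` before being interpolated into the `wait_for_function()` expression, which turns them into properly quoted JavaScript string literals. A tiny self-contained demonstration (the example strings are made up):

```python
# Why the actions above run values through json.dumps(): it yields a quoted, escaped
# JavaScript string literal, so arbitrary text cannot break out of the expression.
import json

selector = "#price > span"
value = 'He said "back in stock" today'

s = json.dumps(selector)   # '"#price > span"'
v = json.dumps(value)      # inner quotes are escaped

js_expression = f'document.querySelector({s}).innerText.includes({v});'
print(js_expression)
# document.querySelector("#price > span").innerText.includes("He said \"back in stock\" today");
```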


@@ -13,7 +13,7 @@ import signal
import socket
import sys
from . import store, changedetection_app, content_fetcher
from . import store, changedetection_app
from . import __version__
# Only global so we can access it in the signal handler


@@ -1,746 +0,0 @@
import hashlib
from abc import abstractmethod
import chardet
import json
import logging
import os
import requests
import sys
import time
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary'
class Non200ErrorCodeReceived(Exception):
def __init__(self, status_code, url, screenshot=None, xpath_data=None, page_html=None):
# Set this so we can use it in other parts of the app
self.status_code = status_code
self.url = url
self.screenshot = screenshot
self.xpath_data = xpath_data
self.page_text = None
if page_html:
from changedetectionio import html_tools
self.page_text = html_tools.html_to_text(page_html)
return
class checksumFromPreviousCheckWasTheSame(Exception):
def __init__(self):
return
class JSActionExceptions(Exception):
def __init__(self, status_code, url, screenshot, message=''):
self.status_code = status_code
self.url = url
self.screenshot = screenshot
self.message = message
return
class BrowserStepsStepTimout(Exception):
def __init__(self, step_n):
self.step_n = step_n
return
class PageUnloadable(Exception):
def __init__(self, status_code, url, message, screenshot=False):
# Set this so we can use it in other parts of the app
self.status_code = status_code
self.url = url
self.screenshot = screenshot
self.message = message
return
class EmptyReply(Exception):
def __init__(self, status_code, url, screenshot=None):
# Set this so we can use it in other parts of the app
self.status_code = status_code
self.url = url
self.screenshot = screenshot
return
class ScreenshotUnavailable(Exception):
def __init__(self, status_code, url, page_html=None):
# Set this so we can use it in other parts of the app
self.status_code = status_code
self.url = url
if page_html:
from html_tools import html_to_text
self.page_text = html_to_text(page_html)
return
class ReplyWithContentButNoText(Exception):
def __init__(self, status_code, url, screenshot=None):
# Set this so we can use it in other parts of the app
self.status_code = status_code
self.url = url
self.screenshot = screenshot
return
class Fetcher():
browser_steps = None
browser_steps_screenshot_path = None
content = None
error = None
fetcher_description = "No description"
headers = {}
status_code = None
webdriver_js_execute_code = None
xpath_data = None
xpath_element_js = ""
instock_data = None
instock_data_js = ""
# Will be needed in the future by the VisualSelector, always get this where possible.
screenshot = False
system_http_proxy = os.getenv('HTTP_PROXY')
system_https_proxy = os.getenv('HTTPS_PROXY')
# Time ONTOP of the system defined env minimum time
render_extract_delay = 0
def __init__(self):
from pkg_resources import resource_string
# The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector
self.xpath_element_js = resource_string(__name__, "res/xpath_element_scraper.js").decode('utf-8')
self.instock_data_js = resource_string(__name__, "res/stock-not-in-stock.js").decode('utf-8')
@abstractmethod
def get_error(self):
return self.error
@abstractmethod
def run(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
# Should set self.error, self.status_code and self.content
pass
@abstractmethod
def quit(self):
return
@abstractmethod
def get_last_status_code(self):
return self.status_code
@abstractmethod
def screenshot_step(self, step_n):
return None
@abstractmethod
# Return true/false if this checker is ready to run, in the case it needs todo some special config check etc
def is_ready(self):
return True
def get_all_headers(self):
"""
Get all headers but ensure all keys are lowercase
:return:
"""
return {k.lower(): v for k, v in self.headers.items()}
def iterate_browser_steps(self):
from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
from playwright._impl._api_types import TimeoutError
from jinja2 import Environment
jinja2_env = Environment(extensions=['jinja2_time.TimeExtension'])
step_n = 0
if self.browser_steps is not None and len(self.browser_steps):
interface = steppable_browser_interface()
interface.page = self.page
valid_steps = filter(
lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'),
self.browser_steps)
for step in valid_steps:
step_n += 1
print(">> Iterating check - browser Step n {} - {}...".format(step_n, step['operation']))
self.screenshot_step("before-" + str(step_n))
self.save_step_html("before-" + str(step_n))
try:
optional_value = step['optional_value']
selector = step['selector']
# Support for jinja2 template in step values, with date module added
if '{%' in step['optional_value'] or '{{' in step['optional_value']:
optional_value = str(jinja2_env.from_string(step['optional_value']).render())
if '{%' in step['selector'] or '{{' in step['selector']:
selector = str(jinja2_env.from_string(step['selector']).render())
getattr(interface, "call_action")(action_name=step['operation'],
selector=selector,
optional_value=optional_value)
self.screenshot_step(step_n)
self.save_step_html(step_n)
except TimeoutError as e:
print(str(e))
# Stop processing here
raise BrowserStepsStepTimout(step_n=step_n)
# It's always good to reset these
def delete_browser_steps_screenshots(self):
import glob
if self.browser_steps_screenshot_path is not None:
dest = os.path.join(self.browser_steps_screenshot_path, 'step_*.jpeg')
files = glob.glob(dest)
for f in files:
os.unlink(f)
# Maybe for the future, each fetcher provides its own diff output, could be used for text, image
# the current one would return javascript output (as we use JS to generate the diff)
#
def available_fetchers():
# See the if statement at the bottom of this file for how we switch between playwright and webdriver
import inspect
p = []
for name, obj in inspect.getmembers(sys.modules[__name__], inspect.isclass):
if inspect.isclass(obj):
# @todo html_ is maybe better as fetcher_ or something
# In this case, make sure to edit the default one in store.py and fetch_site_status.py
if name.startswith('html_'):
t = tuple([name, obj.fetcher_description])
p.append(t)
return p
class base_html_playwright(Fetcher):
fetcher_description = "Playwright {}/Javascript".format(
os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
)
if os.getenv("PLAYWRIGHT_DRIVER_URL"):
fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL"))
browser_type = ''
command_executor = ''
# Configs for Proxy setup
# In the ENV vars, is prefixed with "playwright_proxy_", so it is for example "playwright_proxy_server"
playwright_proxy_settings_mappings = ['bypass', 'server', 'username', 'password']
proxy = None
def __init__(self, proxy_override=None):
super().__init__()
# .strip('"') is going to save someone a lot of time when they accidently wrap the env value
self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')
self.command_executor = os.getenv(
"PLAYWRIGHT_DRIVER_URL",
'ws://playwright-chrome:3000'
).strip('"')
# If any proxy settings are enabled, then we should setup the proxy object
proxy_args = {}
for k in self.playwright_proxy_settings_mappings:
v = os.getenv('playwright_proxy_' + k, False)
if v:
proxy_args[k] = v.strip('"')
if proxy_args:
self.proxy = proxy_args
# allow per-watch proxy selection override
if proxy_override:
self.proxy = {'server': proxy_override}
if self.proxy:
# Playwright needs separate username and password values
from urllib.parse import urlparse
parsed = urlparse(self.proxy.get('server'))
if parsed.username:
self.proxy['username'] = parsed.username
self.proxy['password'] = parsed.password
def screenshot_step(self, step_n=''):
screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=85)
if self.browser_steps_screenshot_path is not None:
destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n))
logging.debug("Saving step screenshot to {}".format(destination))
with open(destination, 'wb') as f:
f.write(screenshot)
def save_step_html(self, step_n):
content = self.page.content()
destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n))
logging.debug("Saving step HTML to {}".format(destination))
with open(destination, 'w') as f:
f.write(content)
def run_fetch_browserless_puppeteer(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
from pkg_resources import resource_string
extra_wait_ms = (int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) * 1000
self.xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
code = resource_string(__name__, "res/puppeteer_fetch.js").decode('utf-8')
# In the future inject this is a proper JS package
code = code.replace('%xpath_scrape_code%', self.xpath_element_js)
code = code.replace('%instock_scrape_code%', self.instock_data_js)
from requests.exceptions import ConnectTimeout, ReadTimeout
wait_browserless_seconds = 240
browserless_function_url = os.getenv('BROWSERLESS_FUNCTION_URL')
from urllib.parse import urlparse
if not browserless_function_url:
# Convert/try to guess from PLAYWRIGHT_DRIVER_URL
o = urlparse(os.getenv('PLAYWRIGHT_DRIVER_URL'))
browserless_function_url = o._replace(scheme="http")._replace(path="function").geturl()
# Append proxy connect string
if self.proxy:
import urllib.parse
# Remove username/password if it exists in the URL or you will receive "ERR_NO_SUPPORTED_PROXIES" error
# Actual authentication handled by Puppeteer/node
o = urlparse(self.proxy.get('server'))
proxy_url = urllib.parse.quote(o._replace(netloc="{}:{}".format(o.hostname, o.port)).geturl())
browserless_function_url = f"{browserless_function_url}&--proxy-server={proxy_url}&dumpio=true"
try:
amp = '&' if '?' in browserless_function_url else '?'
response = requests.request(
method="POST",
json={
"code": code,
"context": {
# Very primitive disk cache - USE WITH EXTREME CAUTION
# Run browserless container with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
'disk_cache_dir': os.getenv("PUPPETEER_DISK_CACHE", False), # or path to disk cache ending in /, ie /tmp/cache/
'execute_js': self.webdriver_js_execute_code,
'extra_wait_ms': extra_wait_ms,
'include_filters': current_include_filters,
'req_headers': request_headers,
'screenshot_quality': int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)),
'url': url,
'user_agent': request_headers.get('User-Agent', 'Mozilla/5.0'),
'proxy_username': self.proxy.get('username','') if self.proxy else False,
'proxy_password': self.proxy.get('password', '') if self.proxy else False,
'no_cache_list': [
'twitter',
'.pdf'
],
# Could use https://github.com/easylist/easylist here, or install a plugin
'block_url_list': [
'adnxs.com',
'analytics.twitter.com',
'doubleclick.net',
'google-analytics.com',
'googletagmanager',
'trustpilot.com'
]
}
},
# @todo /function needs adding ws:// to http:// rebuild this
url=browserless_function_url+f"{amp}--disable-features=AudioServiceOutOfProcess&dumpio=true&--disable-remote-fonts",
timeout=wait_browserless_seconds)
except ReadTimeout:
raise PageUnloadable(url=url, status_code=None, message=f"No response from browserless in {wait_browserless_seconds}s")
except ConnectTimeout:
raise PageUnloadable(url=url, status_code=None, message=f"Timed out connecting to browserless, retrying..")
else:
# 200 Here means that the communication to browserless worked only, not the page state
if response.status_code == 200:
import base64
x = response.json()
if not x.get('screenshot'):
# https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips
# https://github.com/puppeteer/puppeteer/issues/1834
# https://github.com/puppeteer/puppeteer/issues/1834#issuecomment-381047051
# Check your memory is shared and big enough
raise ScreenshotUnavailable(url=url, status_code=None)
if not x.get('content', '').strip():
raise EmptyReply(url=url, status_code=None)
if x.get('status_code', 200) != 200 and not ignore_status_codes:
raise Non200ErrorCodeReceived(url=url, status_code=x.get('status_code', 200), page_html=x['content'])
self.content = x.get('content')
self.headers = x.get('headers')
self.instock_data = x.get('instock_data')
self.screenshot = base64.b64decode(x.get('screenshot'))
self.status_code = x.get('status_code')
self.xpath_data = x.get('xpath_data')
else:
# Some other error from browserless
raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8'))
def run(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
# For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps (for now!)
has_browser_steps = self.browser_steps and list(filter(
lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'),
self.browser_steps))
if not has_browser_steps:
if os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
# Temporary backup solution until we rewrite the playwright code
return self.run_fetch_browserless_puppeteer(
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes,
current_include_filters,
is_binary)
from playwright.sync_api import sync_playwright
import playwright._impl._api_types
self.delete_browser_steps_screenshots()
response = None
with sync_playwright() as p:
browser_type = getattr(p, self.browser_type)
# Seemed to cause a connection Exception even tho I can see it connect
# self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000)
# 60,000 connection timeout only
browser = browser_type.connect_over_cdp(self.command_executor, timeout=60000)
# Set user agent to prevent Cloudflare from blocking the browser
# Use the default one configured in the App.py model that's passed from fetch_site_status.py
context = browser.new_context(
user_agent=request_headers.get('User-Agent', 'Mozilla/5.0'),
proxy=self.proxy,
# This is needed to enable JavaScript execution on GitHub and others
bypass_csp=True,
# Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),
# Should never be needed
accept_downloads=False
)
self.page = context.new_page()
if len(request_headers):
context.set_extra_http_headers(request_headers)
self.page.set_default_navigation_timeout(90000)
self.page.set_default_timeout(90000)
# Listen for all console events and handle errors
self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
# Goto page
try:
# Wait_until = commit
# - `'commit'` - consider operation to be finished when network response is received and the document started loading.
# Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
# This seemed to solve nearly all 'TimeoutErrors'
response = self.page.goto(url, wait_until='commit')
except playwright._impl._api_types.Error as e:
# Retry once - https://github.com/browserless/chrome/issues/2485
# Sometimes errors related to invalid cert's and other can be random
print("Content Fetcher > retrying request got error - ", str(e))
time.sleep(1)
response = self.page.goto(url, wait_until='commit')
except Exception as e:
print("Content Fetcher > Other exception when page.goto", str(e))
context.close()
browser.close()
raise PageUnloadable(url=url, status_code=None, message=str(e))
# Execute any browser steps
try:
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
self.page.wait_for_timeout(extra_wait * 1000)
if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
self.page.evaluate(self.webdriver_js_execute_code)
except playwright._impl._api_types.TimeoutError as e:
context.close()
browser.close()
# This can be ok, we will try to grab what we could retrieve
pass
except Exception as e:
print("Content Fetcher > Other exception when executing custom JS code", str(e))
context.close()
browser.close()
raise PageUnloadable(url=url, status_code=None, message=str(e))
if response is None:
context.close()
browser.close()
print("Content Fetcher > Response object was none")
raise EmptyReply(url=url, status_code=None)
# Run Browser Steps here
self.iterate_browser_steps()
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
time.sleep(extra_wait)
self.content = self.page.content()
self.status_code = response.status
if len(self.page.content().strip()) == 0:
context.close()
browser.close()
print("Content Fetcher > Content was empty")
raise EmptyReply(url=url, status_code=response.status)
self.status_code = response.status
self.headers = response.all_headers()
# So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
if current_include_filters is not None:
self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
else:
self.page.evaluate("var include_filters=''")
self.xpath_data = self.page.evaluate(
"async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}")
self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}")
# Bug 3 in Playwright screenshot handling
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
# JPEG is better here because the screenshots can be very very large
# Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
# which will significantly increase the IO size between the server and client, it's recommended to use the lowest
# acceptable screenshot quality here
try:
# The actual screenshot
self.screenshot = self.page.screenshot(type='jpeg', full_page=True,
quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)))
except Exception as e:
context.close()
browser.close()
raise ScreenshotUnavailable(url=url, status_code=None)
context.close()
browser.close()
class base_html_webdriver(Fetcher):
if os.getenv("WEBDRIVER_URL"):
fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
else:
fetcher_description = "WebDriver Chrome/Javascript"
command_executor = ''
# Configs for Proxy setup
# In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
proxy = None
def __init__(self, proxy_override=None):
super().__init__()
from selenium.webdriver.common.proxy import Proxy as SeleniumProxy
# .strip('"') is going to save someone a lot of time when they accidently wrap the env value
self.command_executor = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')
# If any proxy settings are enabled, then we should setup the proxy object
proxy_args = {}
for k in self.selenium_proxy_settings_mappings:
v = os.getenv('webdriver_' + k, False)
if v:
proxy_args[k] = v.strip('"')
# Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy
if not proxy_args.get('webdriver_httpProxy') and self.system_http_proxy:
proxy_args['httpProxy'] = self.system_http_proxy
if not proxy_args.get('webdriver_sslProxy') and self.system_https_proxy:
proxy_args['httpsProxy'] = self.system_https_proxy
# Allows override the proxy on a per-request basis
if proxy_override is not None:
proxy_args['httpProxy'] = proxy_override
if proxy_args:
self.proxy = SeleniumProxy(raw=proxy_args)
def run(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import WebDriverException
# request_body, request_method unused for now, until some magic in the future happens.
# check env for WEBDRIVER_URL
self.driver = webdriver.Remote(
command_executor=self.command_executor,
desired_capabilities=DesiredCapabilities.CHROME,
proxy=self.proxy)
try:
self.driver.get(url)
except WebDriverException as e:
# Be sure we close the session window
self.quit()
raise
self.driver.set_window_size(1280, 1024)
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
if self.webdriver_js_execute_code is not None:
self.driver.execute_script(self.webdriver_js_execute_code)
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
# @todo - how to check this? is it possible?
self.status_code = 200
# @todo somehow we should try to get this working for WebDriver
# raise EmptyReply(url=url, status_code=r.status_code)
# @todo - dom wait loaded?
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
self.content = self.driver.page_source
self.headers = {}
self.screenshot = self.driver.get_screenshot_as_png()
# Does the connection to the webdriver work? run a test connection.
def is_ready(self):
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
self.driver = webdriver.Remote(
command_executor=self.command_executor,
desired_capabilities=DesiredCapabilities.CHROME)
# driver.quit() seems to cause better exceptions
self.quit()
return True
def quit(self):
if self.driver:
try:
self.driver.quit()
except Exception as e:
print("Content Fetcher > Exception in chrome shutdown/quit" + str(e))
# "html_requests" is listed as the default fetcher in store.py!
class html_requests(Fetcher):
fetcher_description = "Basic fast Plaintext/HTTP Client"
def __init__(self, proxy_override=None):
self.proxy_override = proxy_override
def run(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
# Make requests use a more modern looking user-agent
if not 'User-Agent' in request_headers:
request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT",
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36')
proxies = {}
# Allows override the proxy on a per-request basis
if self.proxy_override:
proxies = {'http': self.proxy_override, 'https': self.proxy_override, 'ftp': self.proxy_override}
else:
if self.system_http_proxy:
proxies['http'] = self.system_http_proxy
if self.system_https_proxy:
proxies['https'] = self.system_https_proxy
r = requests.request(method=request_method,
data=request_body,
url=url,
headers=request_headers,
timeout=timeout,
proxies=proxies,
verify=False)
# If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks.
# For example - some sites don't tell us it's utf-8, but return utf-8 content
# This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably.
# https://github.com/psf/requests/issues/1604 good info about requests encoding detection
if not is_binary:
# Don't run this for PDF (and requests identified as binary) takes a _long_ time
if not r.headers.get('content-type') or not 'charset=' in r.headers.get('content-type'):
encoding = chardet.detect(r.content)['encoding']
if encoding:
r.encoding = encoding
if not r.content or not len(r.content):
raise EmptyReply(url=url, status_code=r.status_code)
# @todo test this
# @todo maybe you really want to test zero-byte return pages?
if r.status_code != 200 and not ignore_status_codes:
# maybe checking the content as well would work?
raise Non200ErrorCodeReceived(url=url, status_code=r.status_code, page_html=r.text)
self.status_code = r.status_code
if is_binary:
# Binary files just return their checksum until we add something smarter
self.content = hashlib.md5(r.content).hexdigest()
else:
self.content = r.text
self.headers = r.headers
self.raw_content = r.content
# Decide which is the 'real' HTML webdriver, this is more a system-wide config
# rather than site-specific.
use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
if use_playwright_as_chrome_fetcher:
html_webdriver = base_html_playwright
else:
html_webdriver = base_html_webdriver


@@ -0,0 +1,150 @@
from abc import abstractmethod
import os
from . import exceptions
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary'
class Fetcher():
browser_steps = None
browser_steps_screenshot_path = None
content = None
error = None
fetcher_description = "No description"
headers = None
status_code = None
webdriver_js_execute_code = None
xpath_data = None
xpath_element_js = ""
instock_data = None
instock_data_js = ""
# Will be needed in the future by the VisualSelector, always get this where possible.
screenshot = False
system_http_proxy = os.getenv('HTTP_PROXY')
system_https_proxy = os.getenv('HTTPS_PROXY')
# Time ON TOP of the system-defined env minimum time
render_extract_delay = 0
def __init__(self):
from pkg_resources import resource_string
# The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector
self.xpath_element_js = resource_string(__name__, "../res/xpath_element_scraper.js").decode('utf-8')
self.instock_data_js = resource_string(__name__, "../res/stock-not-in-stock.js").decode('utf-8')
@abstractmethod
def get_error(self):
return self.error
@abstractmethod
def run(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
# Should set self.error, self.status_code and self.content
pass
@abstractmethod
def quit(self):
return
@abstractmethod
def get_last_status_code(self):
return self.status_code
@abstractmethod
def screenshot_step(self, step_n):
return None
@abstractmethod
# Return true/false if this checker is ready to run, in case it needs to do some special config check etc
def is_ready(self):
return True
def iterate_browser_steps(self):
from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
from playwright._impl._api_types import TimeoutError
from jinja2 import Environment
jinja2_env = Environment(extensions=['jinja2_time.TimeExtension'])
step_n = 0
if self.browser_steps is not None and len(self.browser_steps):
interface = steppable_browser_interface()
interface.page = self.page
valid_steps = filter(lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'), self.browser_steps)
for step in valid_steps:
step_n += 1
print(">> Iterating check - browser Step n {} - {}...".format(step_n, step['operation']))
self.screenshot_step("before-"+str(step_n))
self.save_step_html("before-"+str(step_n))
try:
optional_value = step['optional_value']
selector = step['selector']
# Support for jinja2 template in step values, with date module added
if '{%' in step['optional_value'] or '{{' in step['optional_value']:
optional_value = str(jinja2_env.from_string(step['optional_value']).render())
if '{%' in step['selector'] or '{{' in step['selector']:
selector = str(jinja2_env.from_string(step['selector']).render())
getattr(interface, "call_action")(action_name=step['operation'],
selector=selector,
optional_value=optional_value)
self.screenshot_step(step_n)
self.save_step_html(step_n)
except TimeoutError:
# Stop processing here
raise exceptions.BrowserStepsStepTimout(step_n=step_n)
# It's always good to reset these
def delete_browser_steps_screenshots(self):
import glob
if self.browser_steps_screenshot_path is not None:
dest = os.path.join(self.browser_steps_screenshot_path, 'step_*.jpeg')
files = glob.glob(dest)
for f in files:
os.unlink(f)
# Maybe in the future each fetcher can provide its own diff output, which could be used for text or images,
# the current one would return javascript output (as we use JS to generate the diff)
#
def available_fetchers():
from . import playwright, html_requests, webdriver
p = []
p.append(tuple(['html_requests', html_requests.fetcher.fetcher_description]))
# Prefer playwright
if os.getenv('PLAYWRIGHT_DRIVER_URL', False):
p.append(tuple(['html_webdriver', playwright.fetcher.fetcher_description]))
elif os.getenv('WEBDRIVER_URL'):
p.append(tuple(['html_webdriver', webdriver.fetcher.fetcher_description]))
return p
html_webdriver = None
# Decide which is the 'real' HTML webdriver, this is more a system-wide config rather than site-specific.
use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
if use_playwright_as_chrome_fetcher:
from . import playwright
html_webdriver = getattr(playwright, "fetcher")
else:
from . import webdriver
html_webdriver = getattr(webdriver, "fetcher")
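As a side note, a minimal sketch of what a custom fetcher built on this base class would need to provide (illustrative only, not part of this changeset; it assumes the new package is importable as changedetectionio.fetchers):

from changedetectionio.fetchers import Fetcher

class dummy_fetcher(Fetcher):
    # Hypothetical fetcher, used only to illustrate the abstract interface above
    fetcher_description = "Static test fetcher"

    def run(self, url, timeout, request_headers, request_body, request_method,
            ignore_status_codes=False, current_include_filters=None, is_binary=False):
        # run() is expected to populate content, status_code and headers (and error on failure)
        self.content = "<html><body>hello</body></html>"
        self.status_code = 200
        self.headers = {'content-type': 'text/html'}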


@@ -0,0 +1,71 @@
from . import Fetcher
import os
import requests
# Exploit the debugging API to get screenshot and HTML without needing playwright
# https://www.browserless.io/docs/scrape#debugging
class fetcher(Fetcher):
fetcher_description = "Browserless Chrome/Javascript via '{}'".format(os.getenv("BROWSERLESS_DRIVER_URL"))
command_executor = ''
proxy = None
def __init__(self, proxy_override=None, command_executor=None):
super().__init__()
self.proxy = proxy_override
def run(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
proxy = ""
if self.proxy:
proxy = f"--proxy-server={self.proxy}"
import json
r = requests.request(method='POST',
data=json.dumps({
"url": f"{url}?{proxy}",
"elements": [],
"debug": {
"screenshot": True,
"console": False,
"network": True,
"cookies": False,
"html": True
}
}),
url=os.getenv("BROWSERLESS_DRIVER_URL"),
headers={'Content-Type': 'application/json'},
timeout=timeout,
verify=False)
# "waitFor": "() => document.querySelector('h1')"
# extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
# self.page.wait_for_timeout(extra_wait * 1000)
if r.status_code == 200:
# the basic request to browserless was OK, but how was the internal request to the site?
result = r.json()
if result['debug']['network'].get('inbound') and len(result['debug']['network']['inbound']):
self.status_code = result['debug']['network']['inbound'][0]['status']
self.content = result['debug']['html']
self.headers = {}
if result['debug'].get('screenshot'):
import base64
self.screenshot = base64.b64decode(result['debug']['screenshot'])
def is_ready(self):
# Try ping?
return os.getenv("BROWSERLESS_DRIVER_URL", False)


@@ -0,0 +1,66 @@
class Non200ErrorCodeReceived(Exception):
def __init__(self, status_code, url, screenshot=None, xpath_data=None, page_html=None):
# Set this so we can use it in other parts of the app
self.status_code = status_code
self.url = url
self.screenshot = screenshot
self.xpath_data = xpath_data
self.page_text = None
if page_html:
from changedetectionio import html_tools
self.page_text = html_tools.html_to_text(page_html)
return
class checksumFromPreviousCheckWasTheSame(Exception):
def __init__(self):
return
class JSActionExceptions(Exception):
def __init__(self, status_code, url, screenshot, message=''):
self.status_code = status_code
self.url = url
self.screenshot = screenshot
self.message = message
return
class BrowserStepsStepTimout(Exception):
def __init__(self, step_n):
self.step_n = step_n
return
class PageUnloadable(Exception):
def __init__(self, status_code, url, message, screenshot=False):
# Set this so we can use it in other parts of the app
self.status_code = status_code
self.url = url
self.screenshot = screenshot
self.message = message
return
class EmptyReply(Exception):
def __init__(self, status_code, url, screenshot=None):
# Set this so we can use it in other parts of the app
self.status_code = status_code
self.url = url
self.screenshot = screenshot
return
class ScreenshotUnavailable(Exception):
def __init__(self, status_code, url, page_html=None):
# Set this so we can use it in other parts of the app
self.status_code = status_code
self.url = url
if page_html:
from ..html_tools import html_to_text
self.page_text = html_to_text(page_html)
return
class ReplyWithContentButNoText(Exception):
def __init__(self, status_code, url, screenshot=None):
# Set this so we can use it in other parts of the app
self.status_code = status_code
self.url = url
self.screenshot = screenshot
return
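As an illustration of how these exceptions surface to callers (the calling code is not part of this hunk, so this is only a sketch, assuming the changedetectionio.fetchers import path used elsewhere in this changeset):

from changedetectionio.fetchers import exceptions, html_requests

f = html_requests.fetcher()
try:
    f.run(url='https://example.com/missing-page', timeout=15, request_headers={},
          request_body=None, request_method='GET')
except exceptions.Non200ErrorCodeReceived as e:
    # page_text is populated from html_to_text() when a body was returned
    print('got HTTP', e.status_code, 'for', e.url)
except exceptions.EmptyReply as e:
    print('empty reply from', e.url)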


@@ -0,0 +1,80 @@
from . import Fetcher
from . import exceptions
# "html_requests" is listed as the default fetcher in store.py!
class fetcher(Fetcher):
fetcher_description = "Basic fast Plaintext/HTTP Client"
def __init__(self, proxy_override=None):
self.proxy_override = proxy_override
def run(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
import chardet
import hashlib
import os
import requests
# Make requests use a more modern-looking user-agent
if 'User-Agent' not in request_headers:
request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT",
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36')
proxies = {}
# Allows overriding the proxy on a per-request basis
if self.proxy_override:
proxies = {'http': self.proxy_override, 'https': self.proxy_override, 'ftp': self.proxy_override}
else:
if self.system_http_proxy:
proxies['http'] = self.system_http_proxy
if self.system_https_proxy:
proxies['https'] = self.system_https_proxy
r = requests.request(method=request_method,
data=request_body,
url=url,
headers=request_headers,
timeout=timeout,
proxies=proxies,
verify=False)
# If the response did not tell us what encoding format to expect, then use chardet to override what `requests` thinks.
# For example - some sites don't tell us it's utf-8, but return utf-8 content
# This does not seem to occur when using webdriver/selenium, which detects the text encoding more reliably.
# https://github.com/psf/requests/issues/1604 good info about requests encoding detection
if not is_binary:
# Don't run this for PDFs (or anything requests identified as binary) - it takes a _long_ time
if not r.headers.get('content-type') or 'charset=' not in r.headers.get('content-type'):
encoding = chardet.detect(r.content)['encoding']
if encoding:
r.encoding = encoding
if not r.content or not len(r.content):
raise exceptions.EmptyReply(url=url, status_code=r.status_code)
# @todo test this
# @todo maybe you really want to test zero-byte return pages?
if r.status_code != 200 and not ignore_status_codes:
# maybe checking the content as well would work?
raise exceptions.Non200ErrorCodeReceived(url=url, status_code=r.status_code, page_html=r.text)
self.status_code = r.status_code
if is_binary:
# Binary files just return their checksum until we add something smarter
self.content = hashlib.md5(r.content).hexdigest()
else:
self.content = r.text
self.headers = r.headers
self.raw_content = r.content
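A minimal usage sketch of this fetcher (assuming the new package is importable as changedetectionio.fetchers, matching the relative imports above; the URL is just an example):

from changedetectionio.fetchers import html_requests

f = html_requests.fetcher(proxy_override=None)
f.run(url='https://example.com', timeout=15, request_headers={},
      request_body=None, request_method='GET')
print(f.status_code)   # e.g. 200
print(len(f.content))  # page text, or an md5 hex digest when is_binary=True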


@@ -0,0 +1,208 @@
from . import Fetcher
from . import exceptions
from . import visualselector_xpath_selectors
import os
import logging
import time
class fetcher(Fetcher):
fetcher_description = "Playwright {}/Javascript".format(
os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
)
if os.getenv("PLAYWRIGHT_DRIVER_URL"):
fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL"))
browser_type = ''
command_executor = ''
# Configs for Proxy setup
# In the ENV vars, each setting is prefixed with "playwright_proxy_", so it is for example "playwright_proxy_server"
playwright_proxy_settings_mappings = ['bypass', 'server', 'username', 'password']
proxy = None
def __init__(self, proxy_override=None):
super().__init__()
import json
# .strip('"') is going to save someone a lot of time when they accidentally wrap the env value
self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')
self.command_executor = os.getenv(
"PLAYWRIGHT_DRIVER_URL",
'ws://playwright-chrome:3000'
).strip('"')
# If any proxy settings are enabled, then we should setup the proxy object
proxy_args = {}
for k in self.playwright_proxy_settings_mappings:
v = os.getenv('playwright_proxy_' + k, False)
if v:
proxy_args[k] = v.strip('"')
if proxy_args:
self.proxy = proxy_args
# allow per-watch proxy selection override
if proxy_override:
self.proxy = {'server': proxy_override}
if self.proxy:
# Playwright needs separate username and password values
from urllib.parse import urlparse
parsed = urlparse(self.proxy.get('server'))
if parsed.username:
self.proxy['username'] = parsed.username
self.proxy['password'] = parsed.password
def screenshot_step(self, step_n=''):
screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=85)
if self.browser_steps_screenshot_path is not None:
destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n))
logging.debug("Saving step screenshot to {}".format(destination))
with open(destination, 'wb') as f:
f.write(screenshot)
def save_step_html(self, step_n):
content = self.page.content()
destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n))
logging.debug("Saving step HTML to {}".format(destination))
with open(destination, 'w') as f:
f.write(content)
def run(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
from playwright.sync_api import sync_playwright
import playwright._impl._api_types
import json
self.delete_browser_steps_screenshots()
response = None
with sync_playwright() as p:
browser_type = getattr(p, self.browser_type)
# Seemed to cause a connection Exception even though I can see it connect
# self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000)
# 60,000 ms connection timeout only
browser = browser_type.connect_over_cdp(self.command_executor, timeout=60000)
# Set user agent to prevent Cloudflare from blocking the browser
# Use the default one configured in the App.py model that's passed from fetch_site_status.py
context = browser.new_context(
user_agent=request_headers['User-Agent'] if request_headers.get('User-Agent') else 'Mozilla/5.0',
proxy=self.proxy,
# This is needed to enable JavaScript execution on GitHub and others
bypass_csp=True,
# Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),
# Should never be needed
accept_downloads=False
)
self.page = context.new_page()
if len(request_headers):
context.set_extra_http_headers(request_headers)
self.page.set_default_navigation_timeout(90000)
self.page.set_default_timeout(90000)
# Listen for all console events and handle errors
self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
# Goto page
try:
# wait_until='commit'
# - `'commit'` - consider the operation to be finished when the network response is received and the document has started loading.
# Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
# This seemed to solve nearly all 'TimeoutErrors'
response = self.page.goto(url, wait_until='commit')
except playwright._impl._api_types.Error as e:
# Retry once - https://github.com/browserless/chrome/issues/2485
# Sometimes errors related to invalid certs and others can be random
print ("Content Fetcher > retrying request got error - ", str(e))
time.sleep(1)
response = self.page.goto(url, wait_until='commit')
except Exception as e:
print ("Content Fetcher > Other exception when page.goto", str(e))
context.close()
browser.close()
raise exceptions.PageUnloadable(url=url, status_code=None, message=str(e))
# Execute any browser steps
try:
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
self.page.wait_for_timeout(extra_wait * 1000)
if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
self.page.evaluate(self.webdriver_js_execute_code)
except playwright._impl._api_types.TimeoutError as e:
context.close()
browser.close()
# This can be ok, we will try to grab what we could retrieve
pass
except Exception as e:
print ("Content Fetcher > Other exception when executing custom JS code", str(e))
context.close()
browser.close()
raise exceptions.PageUnloadable(url=url, status_code=None, message=str(e))
if response is None:
context.close()
browser.close()
print ("Content Fetcher > Response object was none")
raise exceptions.EmptyReply(url=url, status_code=None)
# Run Browser Steps here
self.iterate_browser_steps()
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
time.sleep(extra_wait)
self.content = self.page.content()
self.status_code = response.status
if len(self.page.content().strip()) == 0:
context.close()
browser.close()
print ("Content Fetcher > Content was empty")
raise exceptions.EmptyReply(url=url, status_code=response.status)
self.status_code = response.status
self.headers = response.all_headers()
# So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
if current_include_filters is not None:
self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
else:
self.page.evaluate("var include_filters=''")
self.xpath_data = self.page.evaluate("async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}")
self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}")
# Bug 3 in Playwright screenshot handling
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
# JPEG is better here because the screenshots can be very very large
# Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
# which will significantly increase the IO size between the server and client, so it's recommended to use the lowest
# acceptable screenshot quality here
try:
# The actual screenshot
self.screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)))
except Exception as e:
context.close()
browser.close()
raise exceptions.ScreenshotUnavailable(url=url, status_code=None)
context.close()
browser.close()
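A rough sketch of how the playwright_proxy_ settings above get picked up (the env names come from playwright_proxy_settings_mappings; the proxy and driver URLs are made-up example values):

import os

# must be set before the fetcher module is imported / the fetcher is constructed
os.environ['PLAYWRIGHT_DRIVER_URL'] = 'ws://browserless:3000'
os.environ['playwright_proxy_server'] = 'http://user:pass@squid:3128'

from changedetectionio.fetchers import playwright

f = playwright.fetcher()
# urlparse() in __init__ splits the credentials out for Playwright
print(f.proxy)  # {'server': 'http://user:pass@squid:3128', 'username': 'user', 'password': 'pass'}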


@@ -0,0 +1,103 @@
from . import Fetcher
import os
import time
class fetcher(Fetcher):
if os.getenv("WEBDRIVER_URL"):
fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
else:
fetcher_description = "WebDriver Chrome/Javascript"
command_executor = ''
# Configs for Proxy setup
# In the ENV vars, each setting is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
proxy = None
def __init__(self, proxy_override=None, command_executor=None):
super().__init__()
from selenium.webdriver.common.proxy import Proxy as SeleniumProxy
# .strip('"') is going to save someone a lot of time when they accidentally wrap the env value
if command_executor:
self.command_executor = command_executor
else:
self.command_executor = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')
# If any proxy settings are enabled, then we should setup the proxy object
proxy_args = {}
for k in self.selenium_proxy_settings_mappings:
v = os.getenv('webdriver_' + k, False)
if v:
proxy_args[k] = v.strip('"')
# Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy
if not proxy_args.get('httpProxy') and self.system_http_proxy:
proxy_args['httpProxy'] = self.system_http_proxy
if not proxy_args.get('sslProxy') and self.system_https_proxy:
proxy_args['sslProxy'] = self.system_https_proxy
# Allows override the proxy on a per-request basis
if proxy_override is not None:
proxy_args['httpProxy'] = proxy_override
if proxy_args:
self.proxy = SeleniumProxy(raw=proxy_args)
def run(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import WebDriverException
# request_body, request_method unused for now, until some magic in the future happens.
# check env for WEBDRIVER_URL
self.driver = webdriver.Remote(
command_executor=self.command_executor,
desired_capabilities=DesiredCapabilities.CHROME,
proxy=self.proxy
)
try:
self.driver.get(url)
except WebDriverException as e:
# Be sure we close the session window
self.quit()
raise
self.driver.set_window_size(1280, 1024)
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
if self.webdriver_js_execute_code is not None:
self.driver.execute_script(self.webdriver_js_execute_code)
# Selenium doesn't automatically wait for actions as well as Playwright does, so wait again
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
# @todo - how to check this? is it possible?
self.status_code = 200
# @todo somehow we should try to get this working for WebDriver
# raise EmptyReply(url=url, status_code=r.status_code)
# @todo - dom wait loaded?
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
self.content = self.driver.page_source
self.headers = {}
self.screenshot = self.driver.get_screenshot_as_png()
# Try something with requests?
def is_ready(self):
return True
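Similarly, a rough sketch of how the webdriver_ and system proxy variables above combine (the hosts are made-up example values, and this assumes the changedetectionio.fetchers import path used elsewhere in this changeset):

import os

os.environ['WEBDRIVER_URL'] = 'http://selenium:4444/wd/hub'
os.environ['webdriver_sslProxy'] = 'squid:3128'    # picked up via the webdriver_ prefix
os.environ['HTTP_PROXY'] = 'http://squid:3128'     # mapped to httpProxy when not set explicitly

from changedetectionio.fetchers import webdriver as webdriver_fetcher

f = webdriver_fetcher.fetcher()
print(f.proxy)  # selenium Proxy object built from {'sslProxy': ..., 'httpProxy': ...}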


@@ -21,7 +21,6 @@ from wtforms.validators import ValidationError
# each select <option data-enabled="enabled-0-0"
from changedetectionio.blueprint.browser_steps.browser_steps import browser_step_ui_config
from changedetectionio import content_fetcher
from changedetectionio.notification import (
valid_notification_formats,
)
@@ -135,30 +134,31 @@ class ValidateContentFetcherIsReady(object):
def __call__(self, form, field):
import urllib3.exceptions
from changedetectionio import content_fetcher
import importlib
# Better would be a radiohandler that keeps a reference to each class
if field.data is not None and field.data != 'system':
klass = getattr(content_fetcher, field.data)
some_object = klass()
try:
ready = some_object.is_ready()
from . import fetchers
if fetchers.html_webdriver is not None:
try:
driver = fetchers.html_webdriver()
driver.is_ready()
except urllib3.exceptions.MaxRetryError as e:
driver_url = some_object.command_executor
message = field.gettext('Content fetcher \'%s\' did not respond.' % (field.data))
message += '<br>' + field.gettext(
'Be sure that the selenium/webdriver runner is running and accessible via network from this container/host.')
message += '<br>' + field.gettext('Did you follow the instructions in the wiki?')
message += '<br><br>' + field.gettext('WebDriver Host: %s' % (driver_url))
message += '<br><a href="https://github.com/dgtlmoon/changedetection.io/wiki/Fetching-pages-with-WebDriver">Go here for more information</a>'
message += '<br>'+field.gettext('Content fetcher did not respond properly, unable to use it.\n %s' % (str(e)))
except urllib3.exceptions.MaxRetryError as e:
driver_url = fetchers.html_webdriver.command_executor
message = field.gettext('Content fetcher \'%s\' did not respond.' % (field.data))
message += '<br>' + field.gettext(
'Be sure that the selenium/webdriver runner is running and accessible via network from this container/host.')
message += '<br>' + field.gettext('Did you follow the instructions in the wiki?')
message += '<br><br>' + field.gettext('WebDriver Host: %s' % (driver_url))
message += '<br><a href="https://github.com/dgtlmoon/changedetection.io/wiki/Fetching-pages-with-WebDriver">Go here for more information</a>'
message += '<br>'+field.gettext('Content fetcher did not respond properly, unable to use it.\n %s' % (str(e)))
raise ValidationError(message)
raise ValidationError(message)
except Exception as e:
message = field.gettext('Content fetcher \'%s\' did not respond properly, unable to use it.\n %s')
raise ValidationError(message % (field.data, e))
except Exception as e:
message = field.gettext('Content fetcher \'%s\' did not respond properly, unable to use it.\n %s')
raise ValidationError(message % (field.data, e))
class ValidateNotificationBodyAndTitleWhenURLisSet(object):
@@ -355,11 +355,12 @@ class quickWatchForm(Form):
# Common to a single watch and the global settings
class commonSettingsForm(Form):
from .fetchers import available_fetchers
notification_urls = StringListField('Notification URL List', validators=[validators.Optional(), ValidateAppRiseServers()])
notification_title = StringField('Notification Title', default='ChangeDetection.io Notification - {{ watch_url }}', validators=[validators.Optional(), ValidateJinja2Template()])
notification_body = TextAreaField('Notification Body', default='{{ watch_url }} had a change.', validators=[validators.Optional(), ValidateJinja2Template()])
notification_format = SelectField('Notification format', choices=valid_notification_formats.keys())
fetch_backend = RadioField(u'Fetch Method', choices=content_fetcher.available_fetchers(), validators=[ValidateContentFetcherIsReady()])
fetch_backend = RadioField(u'Fetch Method', choices=available_fetchers(), validators=[ValidateContentFetcherIsReady()])
extract_title_as_title = BooleanField('Extract <title> from document and use as watch title', default=False)
webdriver_delay = IntegerField('Wait seconds before extracting text', validators=[validators.Optional(), validators.NumberRange(min=1,
message="Should contain one or more seconds")])
@@ -472,19 +473,15 @@ class globalSettingsRequestForm(Form):
# datastore.data['settings']['application']..
class globalSettingsApplicationForm(commonSettingsForm):
from .fetchers import available_fetchers
api_access_token_enabled = BooleanField('API access token security check enabled', default=True, validators=[validators.Optional()])
base_url = StringField('Base URL', validators=[validators.Optional()])
empty_pages_are_a_change = BooleanField('Treat empty pages as a change?', default=False)
fetch_backend = RadioField('Fetch Method', default="html_requests", choices=content_fetcher.available_fetchers(), validators=[ValidateContentFetcherIsReady()])
fetch_backend = RadioField('Fetch Method', default="html_requests", choices=available_fetchers(), validators=[ValidateContentFetcherIsReady()])
global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
ignore_whitespace = BooleanField('Ignore whitespace')
password = SaltyPasswordField()
pager_size = IntegerField('Pager size',
render_kw={"style": "width: 5em;"},
validators=[validators.NumberRange(min=0,
message="Should be atleast zero (disabled)")])
removepassword_button = SubmitField('Remove password', render_kw={"class": "pure-button pure-button-primary"})
render_anchor_tag_content = BooleanField('Render anchor tag content', default=False)
shared_diff_access = BooleanField('Allow access to view diff page when password is enabled', default=False, validators=[validators.Optional()])


@@ -137,13 +137,12 @@ def _get_stripped_text_from_json_match(match):
def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
stripped_text_from_html = False
# Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags
# Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
try:
stripped_text_from_html = _parse_json(json.loads(content), json_filter)
except json.JSONDecodeError:
# For each <script json></script> blob.. just return the first that matches json_filter
# As a last resort, try to parse the whole <body>
s = []
soup = BeautifulSoup(content, 'html.parser')
@@ -151,34 +150,32 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
bs_result = soup.findAll('script', {"type": "application/ld+json"})
else:
bs_result = soup.findAll('script')
bs_result += soup.findAll('body')
bs_jsons = []
if not bs_result:
raise JSONNotFound("No parsable JSON found in this document")
for result in bs_result:
# Skip empty tags, and things that don't even look like JSON
if not result.text or '{' not in result.text:
if not result.string or not '{' in result.string:
continue
try:
json_data = json.loads(result.text)
bs_jsons.append(json_data)
json_data = json.loads(result.string)
except json.JSONDecodeError:
# Skip objects which cannot be parsed
# Just skip it
continue
if not bs_jsons:
raise JSONNotFound("No parsable JSON found in this document")
for json_data in bs_jsons:
stripped_text_from_html = _parse_json(json_data, json_filter)
if ensure_is_ldjson_info_type:
# Could sometimes be list, string or something else random
if isinstance(json_data, dict):
# If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
# (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html:
break
elif stripped_text_from_html:
break
else:
stripped_text_from_html = _parse_json(json_data, json_filter)
if ensure_is_ldjson_info_type:
# Could sometimes be list, string or something else random
if isinstance(json_data, dict):
# If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
# (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html:
break
elif stripped_text_from_html:
break
if not stripped_text_from_html:
# Re 265 - Just return an empty string when filter not found
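One detail in the hunk above is the difference between BeautifulSoup's result.string and result.text: .string returns None as soon as a tag has more than one child node, while .text concatenates all of the text content. A quick standalone illustration (the example HTML is made up):

from bs4 import BeautifulSoup

html = '<script type="application/ld+json">{"@type": "Product", "price": "9.99"}</script>'
tag = BeautifulSoup(html, 'html.parser').find('script')
print(tag.string)  # the JSON text here, but None whenever the tag has mixed/multiple children
print(tag.text)    # always the concatenated text content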


@@ -52,8 +52,7 @@ class import_url_list(Importer):
# Flask wtform validators won't work with basic auth, use the validators package
# Up to 5000 per batch so we don't flood the server
# @todo validators.url failed on local hostnames (such as referring to ourselves when using browserless)
if len(url) and 'http' in url.lower() and good < 5000:
if len(url) and validators.url(url.replace('source:', '')) and good < 5000:
extras = None
if processor:
extras = {'processor': processor}


@@ -23,26 +23,25 @@ class model(dict):
'workers': int(getenv("DEFAULT_SETTINGS_REQUESTS_WORKERS", "10")), # Number of threads, lower is better for slow connections
},
'application': {
# Custom notification content
'api_access_token_enabled': True,
'password': False,
'base_url' : None,
'empty_pages_are_a_change': False,
'extract_title_as_title': False,
'empty_pages_are_a_change': False,
'fetch_backend': getenv("DEFAULT_FETCH_BACKEND", "html_requests"),
'filter_failure_notification_threshold_attempts': _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT,
'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
'global_subtractive_selectors': [],
'ignore_whitespace': True,
'render_anchor_tag_content': False,
'notification_urls': [], # Apprise URL list
# Custom notification content
'notification_title': default_notification_title,
'notification_body': default_notification_body,
'notification_format': default_notification_format,
'notification_title': default_notification_title,
'notification_urls': [], # Apprise URL list
'pager_size': 50,
'password': False,
'render_anchor_tag_content': False,
'schema_version' : 0,
'shared_diff_access': False,
'webdriver_delay': None , # Extra delay in seconds before extracting text
'webdriver_delay': None # Extra delay in seconds before extracting text
}
}
}
@@ -50,15 +49,3 @@ class model(dict):
def __init__(self, *arg, **kw):
super(model, self).__init__(*arg, **kw)
self.update(self.base_config)
def parse_headers_from_text_file(filepath):
headers = {}
with open(filepath, 'r') as f:
for l in f.readlines():
l = l.strip()
if not l.startswith('#') and ':' in l:
(k, v) = l.split(':')
headers[k.strip()] = v.strip()
return headers
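For illustration, the headers.txt format that parse_headers_from_text_file() parses (lines starting with # are skipped, everything else is split on ':'; note that a value containing another ':' would break the two-value unpack). The import path is assumed from the Watch model's from .App import above, and the temp file stands in for the real per-watch path:

import tempfile

from changedetectionio.model.App import parse_headers_from_text_file  # assumed import path

with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
    f.write("# extra headers for this watch\n")
    f.write("User-Agent: Mozilla/5.0 (compatible; my-watcher)\n")
    f.write("Cookie: session=abc123\n")

print(parse_headers_from_text_file(f.name))
# {'User-Agent': 'Mozilla/5.0 (compatible; my-watcher)', 'Cookie': 'session=abc123'}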


@@ -20,7 +20,6 @@ base_config = {
'body': None,
'check_unique_lines': False, # On change-detected, compare against all history if it's something new
'check_count': 0,
'date_created': None,
'consecutive_filter_failures': 0, # Every time the CSS/xPath filter cannot be located, reset when all is fine.
'extract_text': [], # Extract text by regex after filters
'extract_title_as_title': False,
@@ -473,40 +472,6 @@ class model(dict):
# None is set
return False
@property
def has_extra_headers_file(self):
if os.path.isfile(os.path.join(self.watch_data_dir, 'headers.txt')):
return True
for f in self.all_tags:
fname = "headers-"+re.sub(r'[\W_]', '', f).lower().strip() + ".txt"
filepath = os.path.join(self.__datastore_path, fname)
if os.path.isfile(filepath):
return True
return False
def get_all_headers(self):
from .App import parse_headers_from_text_file
headers = self.get('headers', {}).copy()
# Headers available on disk could be 'headers.txt' in the watch data dir
filepath = os.path.join(self.watch_data_dir, 'headers.txt')
try:
if os.path.isfile(filepath):
headers.update(parse_headers_from_text_file(filepath))
except Exception as e:
print(f"ERROR reading headers.txt at {filepath}", str(e))
# Or per tag, as headers-<tagname>.txt in the main datadir
for f in self.all_tags:
fname = "headers-"+re.sub(r'[\W_]', '', f).lower().strip() + ".txt"
filepath = os.path.join(self.__datastore_path, fname)
try:
if os.path.isfile(filepath):
headers.update(parse_headers_from_text_file(filepath))
except Exception as e:
print(f"ERROR reading headers.txt at {filepath}", str(e))
return headers
def get_last_fetched_before_filters(self):
import brotli


@@ -89,7 +89,7 @@ def process_notification(n_object, datastore):
n_body = jinja2_env.from_string(n_object.get('notification_body', default_notification_body)).render(**notification_parameters)
n_title = jinja2_env.from_string(n_object.get('notification_title', default_notification_title)).render(**notification_parameters)
n_format = valid_notification_formats.get(
n_object.get('notification_format', default_notification_format),
n_object['notification_format'],
valid_notification_formats[default_notification_format],
)


@@ -4,20 +4,14 @@ import os
import re
import urllib3
from . import difference_detection_processor
from changedetectionio import content_fetcher
from copy import deepcopy
from .. import fetchers
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
name = 'Re-stock detection for single product pages'
description = 'Detects if the product goes back to in-stock'
class UnableToExtractRestockData(Exception):
def __init__(self, status_code):
# Set this so we can use it in other parts of the app
self.status_code = status_code
return
class perform_site_check(difference_detection_processor):
screenshot = None
xpath_data = None
@@ -67,11 +61,12 @@ class perform_site_check(difference_detection_processor):
if not prefer_backend or prefer_backend == 'system':
prefer_backend = self.datastore.data['settings']['application']['fetch_backend']
if hasattr(content_fetcher, prefer_backend):
klass = getattr(content_fetcher, prefer_backend)
if prefer_backend == 'html_webdriver':
preferred_fetcher = fetchers.html_webdriver
else:
# If the klass doesn't exist, just use a default
klass = getattr(content_fetcher, "html_requests")
from ..fetchers import html_requests
preferred_fetcher = html_requests
proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=uuid)
proxy_url = None
@@ -79,7 +74,7 @@ class perform_site_check(difference_detection_processor):
proxy_url = self.datastore.proxy_list.get(proxy_id).get('url')
print("UUID {} Using proxy {}".format(uuid, proxy_url))
fetcher = klass(proxy_override=proxy_url)
fetcher = preferred_fetcher(proxy_override=proxy_url)
# Configurable per-watch or global extra delay before extracting text (for webDriver types)
system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None)
@@ -111,8 +106,7 @@ class perform_site_check(difference_detection_processor):
fetched_md5 = hashlib.md5(fetcher.instock_data.encode('utf-8')).hexdigest()
# 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold.
update_obj["in_stock"] = True if fetcher.instock_data == 'Possibly in stock' else False
else:
raise UnableToExtractRestockData(status_code=fetcher.status_code)
# The main thing that all this at the moment comes down to :)
changed_detected = False


@@ -7,10 +7,11 @@ import os
import re
import urllib3
from changedetectionio import content_fetcher, html_tools
from changedetectionio import html_tools
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
from copy import deepcopy
from . import difference_detection_processor
from .. import fetchers
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -70,9 +71,10 @@ class perform_site_check(difference_detection_processor):
# Unset any existing notification error
update_obj = {'last_notification_error': False, 'last_error': False}
extra_headers = watch.get('headers', [])
# Tweak the base config with the per-watch ones
extra_headers = watch.get_all_headers()
request_headers = self.datastore.get_all_headers()
request_headers = deepcopy(self.datastore.data['settings']['headers'])
request_headers.update(extra_headers)
# https://github.com/psf/requests/issues/4525
@@ -100,11 +102,12 @@ class perform_site_check(difference_detection_processor):
if not prefer_backend or prefer_backend == 'system':
prefer_backend = self.datastore.data['settings']['application']['fetch_backend']
if hasattr(content_fetcher, prefer_backend):
klass = getattr(content_fetcher, prefer_backend)
if prefer_backend == 'html_webdriver':
preferred_fetcher = fetchers.html_webdriver
else:
# If the klass doesn't exist, just use a default
klass = getattr(content_fetcher, "html_requests")
from ..fetchers import html_requests
preferred_fetcher = html_requests
proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=uuid)
proxy_url = None
@@ -112,7 +115,7 @@ class perform_site_check(difference_detection_processor):
proxy_url = self.datastore.proxy_list.get(proxy_id).get('url')
print("UUID {} Using proxy {}".format(uuid, proxy_url))
fetcher = klass(proxy_override=proxy_url)
fetcher = preferred_fetcher(proxy_override=proxy_url)
# Configurable per-watch or global extra delay before extracting text (for webDriver types)
system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None)
@@ -139,14 +142,14 @@ class perform_site_check(difference_detection_processor):
self.xpath_data = fetcher.xpath_data
# Track the content type
update_obj['content_type'] = fetcher.get_all_headers().get('content-type', '').lower()
update_obj['content_type'] = fetcher.headers.get('Content-Type', '')
# Watches added automatically in the queue manager will skip if it's the same checksum as the previous run
# Saves a lot of CPU
update_obj['previous_md5_before_filters'] = hashlib.md5(fetcher.content.encode('utf-8')).hexdigest()
if skip_when_checksum_same:
if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
raise content_fetcher.checksumFromPreviousCheckWasTheSame()
raise fetchers.exceptions.checksumFromPreviousCheckWasTheSame()
# Fetching complete, now filters
@@ -159,7 +162,7 @@ class perform_site_check(difference_detection_processor):
# https://stackoverflow.com/questions/41817578/basic-method-chaining ?
# return content().textfilter().jsonextract().checksumcompare() ?
is_json = 'application/json' in fetcher.get_all_headers().get('content-type', '').lower()
is_json = 'application/json' in fetcher.headers.get('Content-Type', '')
is_html = not is_json
# source: support, basically treat it as plaintext
@@ -167,7 +170,7 @@ class perform_site_check(difference_detection_processor):
is_html = False
is_json = False
if watch.is_pdf or 'application/pdf' in fetcher.get_all_headers().get('content-type', '').lower():
if watch.is_pdf or 'application/pdf' in fetcher.headers.get('Content-Type', '').lower():
from shutil import which
tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml")
if not which(tool):
@@ -235,7 +238,7 @@ class perform_site_check(difference_detection_processor):
html_content = fetcher.content
# If not JSON, and if it's not text/plain..
if 'text/plain' in fetcher.get_all_headers().get('content-type', '').lower():
if 'text/plain' in fetcher.headers.get('Content-Type', '').lower():
# Don't run get_text or xpath/css filters on plaintext
stripped_text_from_html = html_content
else:
@@ -309,7 +312,7 @@ class perform_site_check(difference_detection_processor):
# Treat pages with no renderable text content as a change? No by default
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
raise content_fetcher.ReplyWithContentButNoText(url=url, status_code=fetcher.get_last_status_code(), screenshot=screenshot)
raise fetchers.exceptions.ReplyWithContentButNoText(url=url, status_code=fetcher.get_last_status_code(), screenshot=screenshot)
# We rely on the actual text in the html output.. many sites have random script vars etc,
# in the future we'll implement other mechanisms.


@@ -1,183 +0,0 @@
module.exports = async ({page, context}) => {
var {
url,
execute_js,
user_agent,
extra_wait_ms,
req_headers,
include_filters,
xpath_element_js,
screenshot_quality,
proxy_username,
proxy_password,
disk_cache_dir,
no_cache_list,
block_url_list,
} = context;
await page.setBypassCSP(true)
await page.setExtraHTTPHeaders(req_headers);
await page.setUserAgent(user_agent);
// https://ourcodeworld.com/articles/read/1106/how-to-solve-puppeteer-timeouterror-navigation-timeout-of-30000-ms-exceeded
await page.setDefaultNavigationTimeout(0);
if (proxy_username) {
await page.authenticate({
username: proxy_username,
password: proxy_password
});
}
await page.setViewport({
width: 1024,
height: 768,
deviceScaleFactor: 1,
});
await page.setRequestInterception(true);
if (disk_cache_dir) {
console.log(">>>>>>>>>>>>>>> LOCAL DISK CACHE ENABLED <<<<<<<<<<<<<<<<<<<<<");
}
const fs = require('fs');
const crypto = require('crypto');
function file_is_expired(file_path) {
if (!fs.existsSync(file_path)) {
return true;
}
var stats = fs.statSync(file_path);
const now_date = new Date();
const expire_seconds = 300;
if ((now_date / 1000) - (stats.mtime.getTime() / 1000) > expire_seconds) {
console.log("CACHE EXPIRED: " + file_path);
return true;
}
return false;
}
page.on('request', async (request) => {
// General blocking of requests that waste traffic
if (block_url_list.some(substring => request.url().toLowerCase().includes(substring))) return request.abort();
if (disk_cache_dir) {
const url = request.url();
const key = crypto.createHash('md5').update(url).digest("hex");
const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
// https://stackoverflow.com/questions/4482686/check-synchronously-if-file-directory-exists-in-node-js
if (fs.existsSync(dir_path + key)) {
console.log("* CACHE HIT , using - " + dir_path + key + " - " + url);
const cached_data = fs.readFileSync(dir_path + key);
// @todo headers can come from dir_path+key+".meta" json file
request.respond({
status: 200,
//contentType: 'text/html', //@todo
body: cached_data
});
return;
}
}
request.continue();
});
if (disk_cache_dir) {
page.on('response', async (response) => {
const url = response.url();
// Basic filtering for sane responses
if (response.request().method() != 'GET' || response.request().resourceType() == 'xhr' || response.request().resourceType() == 'document' || response.status() != 200) {
console.log("Skipping (not useful) - Status:" + response.status() + " Method:" + response.request().method() + " ResourceType:" + response.request().resourceType() + " " + url);
return;
}
if (no_cache_list.some(substring => url.toLowerCase().includes(substring))) {
console.log("Skipping (no_cache_list) - " + url);
return;
}
if (url.toLowerCase().includes('data:')) {
console.log("Skipping (embedded-data) - " + url);
return;
}
response.buffer().then(buffer => {
if (buffer.length > 100) {
console.log("Cache - Saving " + response.request().method() + " - " + url + " - " + response.request().resourceType());
const key = crypto.createHash('md5').update(url).digest("hex");
const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
if (!fs.existsSync(dir_path)) {
fs.mkdirSync(dir_path, {recursive: true})
}
if (fs.existsSync(dir_path + key)) {
if (file_is_expired(dir_path + key)) {
fs.writeFileSync(dir_path + key, buffer);
}
} else {
fs.writeFileSync(dir_path + key, buffer);
}
}
});
});
}
const r = await page.goto(url, {
waitUntil: 'load'
});
await page.waitForTimeout(1000);
await page.waitForTimeout(extra_wait_ms);
if (execute_js) {
await page.evaluate(execute_js);
await page.waitForTimeout(200);
}
var xpath_data;
var instock_data;
try {
// Not sure the best way here, in the future this should be a new package added to npm then run in browserless
// (Once the old playwright is removed)
xpath_data = await page.evaluate((include_filters) => {%xpath_scrape_code%}, include_filters);
instock_data = await page.evaluate(() => {%instock_scrape_code%});
} catch (e) {
console.log(e);
}
// Protocol error (Page.captureScreenshot): Cannot take screenshot with 0 width can come from a proxy auth failure
// Wrap it here (for now)
var b64s = false;
try {
b64s = await page.screenshot({encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg'});
} catch (e) {
console.log(e);
}
// May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw'
if (!b64s) {
// @todo after text extract, we can place some overlay text with red background to say 'cropped'
console.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot');
try {
b64s = await page.screenshot({encoding: "base64", quality: screenshot_quality, type: 'jpeg'});
} catch (e) {
console.log(e);
}
}
var html = await page.content();
return {
data: {
'content': html,
'headers': r.headers(),
'instock_data': instock_data,
'screenshot': b64s,
'status_code': r.status(),
'xpath_data': xpath_data
},
type: 'application/json',
};
};
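The disk cache in the script above keys each response by the md5 of its URL and shards it into three nested single-character directories; the equivalent path computation in Python (the cache root is just an example value) would be:

import hashlib
import os

def cache_path(url, disk_cache_dir='/tmp/cache/'):
    # same scheme as the script: md5(url) hex digest, sharded as <a>/<b>/<c>/<md5>
    key = hashlib.md5(url.encode('utf-8')).hexdigest()
    return os.path.join(disk_cache_dir, key[0], key[1], key[2], key)

print(cache_path('https://example.com/style.css'))
# entries older than 300 seconds (see file_is_expired) are rewritten on the next response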


@@ -10,7 +10,6 @@ function isItemInStock() {
'brak na stanie',
'brak w magazynie',
'coming soon',
'currently have any tickets for this',
'currently unavailable',
'en rupture de stock',
'item is no longer available',
@@ -21,9 +20,7 @@ function isItemInStock() {
'nicht zur verfügung',
'no disponible temporalmente',
'no longer in stock',
'no tickets available',
'not available',
'not currently available',
'not in stock',
'notify me when available',
'não estamos a aceitar encomendas',
@@ -33,8 +30,6 @@ function isItemInStock() {
'sold out',
'temporarily out of stock',
'temporarily unavailable',
'tickets unavailable',
'unavailable tickets',
'we do not currently have an estimate of when this product will be back in stock.',
'zur zeit nicht an lager',
];


@@ -8,14 +8,8 @@
// Some pages like https://www.londonstockexchange.com/stock/NCCL/ncondezi-energy-limited/analysis
// will automatically force a scroll somewhere, so include the position offset
// Let's hope the position doesn't change while we iterate the bboxes, but this is better than nothing
var scroll_y = 0;
try {
scroll_y = +document.documentElement.scrollTop || document.body.scrollTop
} catch (e) {
console.log(e);
}
var scroll_y=+document.documentElement.scrollTop || document.body.scrollTop
// Include the getXpath script directly, easier than fetching
function getxpath(e) {
@@ -44,15 +38,15 @@ const findUpTag = (el) => {
if (el.name !== undefined && el.name.length) {
var proposed = el.tagName + "[name=" + el.name + "]";
var proposed_element = window.document.querySelectorAll(proposed);
if (proposed_element.length) {
if(proposed_element.length) {
if (proposed_element.length === 1) {
return proposed;
} else {
// Some sites change ID but name= stays the same, we can hit it if we know the index
// Find all the elements that match and work out the input[n]
var n = Array.from(proposed_element).indexOf(el);
var n=Array.from(proposed_element).indexOf(el);
// Return a Playwright selector for the nth match, e.g. input[name=zipcode] >> nth=n
return proposed + " >> nth=" + n;
return proposed+" >> nth="+n;
}
}
}


@@ -1,37 +0,0 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Uploaded to: SVG Repo, www.svgrepo.com, Generator: SVG Repo Mixer Tools -->
<svg
fill="#FFFFFF"
height="7.5005589"
width="11.248507"
version="1.1"
id="Layer_1"
viewBox="0 0 7.1975545 4.7993639"
xml:space="preserve"
xmlns="http://www.w3.org/2000/svg"
xmlns:svg="http://www.w3.org/2000/svg"><defs
id="defs19" />
<g
id="g14"
transform="matrix(-0.01406065,0,0,0.01406065,7.1975543,-1.1990922)">
<g
id="g12">
<g
id="g10">
<path
d="M 468.373,85.28 H 45.333 C 21.227,85.28 0,105.76 0,129.014 V 383.2 c 0,23.147 21.227,43.413 45.333,43.413 h 422.933 c 23.68,0 43.627,-19.84 43.627,-43.413 V 129.014 C 512,105.334 492.053,85.28 468.373,85.28 Z m 0,320 H 45.333 c -12.373,0 -24,-10.773 -24,-22.08 V 129.014 c 0,-11.307 11.84,-22.4 24,-22.4 h 422.933 c 11.733,0 22.293,10.667 22.293,22.4 V 383.2 h 0.107 c 10e-4,11.734 -10.453,22.08 -22.293,22.08 z"
id="path2" />
<path
d="m 440.853,153.974 c -3.307,-4.907 -9.92,-6.187 -14.827,-2.987 L 256,264.48 85.973,151.094 c -4.907,-3.2 -11.52,-1.707 -14.72,3.2 -3.093,4.8 -1.813,11.307 2.88,14.507 l 176,117.333 c 3.627,2.347 8.213,2.347 11.84,0 l 176,-117.333 c 4.8,-3.201 6.187,-9.921 2.88,-14.827 z"
id="path4" />
<path
d="m 143.573,257.654 c -0.107,0.107 -0.32,0.213 -0.427,0.32 L 68.48,311.307 c -4.907,3.307 -6.187,9.92 -2.88,14.827 3.307,4.907 9.92,6.187 14.827,2.88 0.107,-0.107 0.32,-0.213 0.427,-0.32 l 74.667,-53.333 c 4.907,-3.307 6.187,-9.92 2.88,-14.827 -3.308,-4.907 -9.921,-6.187 -14.828,-2.88 z"
id="path6" />
<path
d="m 443.947,311.627 c -0.107,-0.107 -0.32,-0.213 -0.427,-0.32 l -74.667,-53.333 c -4.693,-3.52 -11.413,-2.56 -14.933,2.133 -3.52,4.693 -2.56,11.413 2.133,14.933 0.107,0.107 0.32,0.213 0.427,0.32 l 74.667,53.333 c 4.693,3.52 11.413,2.56 14.933,-2.133 3.52,-4.693 2.56,-11.413 -2.133,-14.933 z"
id="path8" />
</g>
</g>
</g>
</svg>



@@ -1,3 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg width="61.649mm" height="61.649mm" version="1.1" viewBox="0 0 61.649 61.649" xml:space="preserve" xmlns="http://www.w3.org/2000/svg"><g transform="translate(66.269 -15.463)" fill="#3056d3"><g transform="matrix(1.423 0 0 1.423 101.16 69.23)" fill="#3056d3"><g transform="matrix(.8229 0 0 .8229 -23.378 -2.3935)" fill="#3056d3"><path d="m-88.248-43.007a26.323 26.323 0 0 0-26.323 26.323 26.323 26.323 0 0 0 26.323 26.323 26.323 26.323 0 0 0 26.323-26.323 26.323 26.323 0 0 0-26.323-26.323zm0 2.8417a23.482 23.482 0 0 1 23.482 23.482 23.482 23.482 0 0 1-23.482 23.482 23.482 23.482 0 0 1-23.482-23.482 23.482 23.482 0 0 1 23.482-23.482z"/><g transform="matrix(.26458 0 0 .26458 -115.65 -44.085)"><path d="m33.02 64.43c0.35-0.05 2.04-0.13 2.04-0.13h25.53s3.17 0.32 3.67 0.53c2.5 1.05 3.98 1.89 6.04 3.57 0.72 0.58 4.12 4.01 4.12 4.01l51.67 57.39s1.61 1.65 1.97 1.94c1.2 0.97 2.48 1.96 3.98 2.32 0.5 0.12 2.72 0.21 2.72 0.21h27.32l-8.83-9.04s-1.31-1.65-1.44-1.94c-0.45-0.93-0.59-2.59-0.13-3.51 0.35-0.69 1.46-1.87 2.23-1.98 1.03-0.14 2.12-0.39 3.02 0.14 0.33 0.2 1.64 1.32 1.64 1.32l17.49 17.49s1.35 1.09 1.6 1.6c0.17 0.34 0.29 0.82 0.15 1.18-0.17 0.42-1.42 1.63-1.42 1.63l-0.94 0.98-15.69 16.37s-1.44 1.4-1.79 1.67c-0.76 0.6-1.99 0.89-2.96 0.9-1.03 0-2.62-1.11-3.26-1.91-0.6-0.76-1.1-2.22-0.77-3.13 0.16-0.45 1.28-1.85 1.28-1.85l11.36-11.3-29.47-0.02-1.68 0.09s-4.16-0.66-5.26-1.03c-1.63-0.56-3.44-1.82-4.75-2.93-0.39-0.33-1.8-1.92-1.8-1.92l-51.7-59.28s-2-2.06-2.43-2.43c-1.37-1.17-2-1.62-3.76-2.34-0.44-0.18-3.45-0.55-3.45-0.55l-24.13-0.22s-2.23-0.15-2.61-0.22c-1.08-0.21-2.16-1.07-2.81-1.83-0.79-0.92-0.59-3.06 0.06-4.09 0.57-0.89 2.14-1.52 3.19-1.66z"/><path d="m86.1 109.7-17.13 19.65s-2 2.06-2.43 2.43c-1.37 1.17-2 1.62-3.76 2.34-0.44 0.18-3.45 0.55-3.45 0.55l-24.13 0.22s-2.23 0.15-2.61 0.22c-1.08 0.21-2.16 1.07-2.81 1.83-0.79 0.92-0.59 3.06 0.06 4.09 0.57 0.89 2.14 1.52 3.19 1.66 0.35 0.05 2.04 0.13 2.04 0.13h25.53s3.17-0.32 3.67-0.53c2.5-1.05 3.98-1.89 6.04-3.57 0.72-0.58 4.12-4.01 4.12-4.01l17.38-19.3z"/><path d="m177.81 67.6c-0.17-0.42-1.42-1.63-1.42-1.63l-0.94-0.98-15.69-16.37s-1.44-1.4-1.79-1.67c-0.76-0.6-1.99-0.89-2.96-0.9-1.03 0-2.62 1.11-3.26 1.91-0.6 0.76-1.1 2.22-0.77 3.13 0.16 0.45 1.28 1.85 1.28 1.85l11.36 11.3-29.47 0.02-1.68-0.09s-4.16 0.66-5.26 1.03c-1.63 0.56-3.44 1.82-4.75 2.93-0.39 0.33-1.8 1.92-1.8 1.92l-18.91 21.69 5.98 5.98 18.38-20.41s1.61-1.65 1.97-1.94c1.2-0.97 2.48-1.96 3.98-2.32 0.5-0.12 2.72-0.21 2.72-0.21h27.32l-8.83 9.04s-1.31 1.65-1.44 1.94c-0.45 0.93-0.59 2.59-0.13 3.51 0.35 0.69 1.46 1.87 2.23 1.98 1.03 0.14 2.12 0.39 3.02-0.14 0.33-0.2 1.64-1.32 1.64-1.32l17.49-17.49s1.35-1.09 1.6-1.6c0.17-0.34 0.29-0.82 0.15-1.18z"/></g></g></g></g></svg>



@@ -114,11 +114,11 @@ $(document).ready(function () {
e.preventDefault()
});
// When the mouse moves we know which element it should be above
// mousedown will link that to the UI (select the right action, highlight etc)
$('#browsersteps-selector-canvas').bind('mousedown', function (e) {
// https://developer.mozilla.org/en-US/docs/Web/API/MouseEvent
e.preventDefault()
console.log(e);
console.log("current xpath in index is " + current_selected_i);
last_click_xy = {'x': parseInt((1 / x_scale) * e.offsetX), 'y': parseInt((1 / y_scale) * e.offsetY)}
process_selected(current_selected_i);
current_selected_i = false;
@@ -132,7 +132,6 @@ $(document).ready(function () {
}
});
// Debounce and find the current most 'interesting' element we are hovering above
$('#browsersteps-selector-canvas').bind('mousemove', function (e) {
if (!xpath_data) {
return;
@@ -152,40 +151,41 @@ $(document).ready(function () {
current_selected_i = false;
// Reverse order - the most specific one should be deeper/"later"
// Basically, find the 'deepest' one
var possible_elements = [];
xpath_data['size_pos'].forEach(function (item, index) {
//$('#browsersteps-selector-canvas').css('cursor', 'pointer');
for (var i = xpath_data['size_pos'].length; i !== 0; i--) {
// draw all of them? let them choose somehow?
var sel = xpath_data['size_pos'][i - 1];
// If we are in a bounding-box
if (e.offsetY > item.top * y_scale && e.offsetY < item.top * y_scale + item.height * y_scale
if (e.offsetY > sel.top * y_scale && e.offsetY < sel.top * y_scale + sel.height * y_scale
&&
e.offsetX > item.left * y_scale && e.offsetX < item.left * y_scale + item.width * y_scale
e.offsetX > sel.left * y_scale && e.offsetX < sel.left * y_scale + sel.width * y_scale
) {
// There could be many elements here, record them all and then we'll find out which is the most 'useful'
// (input, textarea, button, A etc)
if (item.width < xpath_data['browser_width']) {
possible_elements.push(item);
// Only highlight these interesting types
if (1) {
ctx.strokeRect(sel.left * x_scale, sel.top * y_scale, sel.width * x_scale, sel.height * y_scale);
ctx.fillRect(sel.left * x_scale, sel.top * y_scale, sel.width * x_scale, sel.height * y_scale);
current_selected_i = i - 1;
break;
// find the smallest one at this x,y
// does it mean sort the xpath list by size (w*h) i think so!
} else {
if (include_text_elements[0].checked === true) {
// blue one with background instead?
ctx.fillStyle = 'rgba(0,0,255, 0.1)';
ctx.strokeStyle = 'rgba(0,0,200, 0.7)';
$('#browsersteps-selector-canvas').css('cursor', 'grab');
ctx.strokeRect(sel.left * x_scale, sel.top * y_scale, sel.width * x_scale, sel.height * y_scale);
ctx.fillRect(sel.left * x_scale, sel.top * y_scale, sel.width * x_scale, sel.height * y_scale);
current_selected_i = i - 1;
break;
}
}
}
});
// Find the best one
if (possible_elements.length) {
possible_elements.forEach(function (item, index) {
if (["a", "input", "textarea", "button"].includes(item['tagName'])) {
current_selected_i = item;
}
});
if (!current_selected_i) {
current_selected_i = possible_elements[0];
}
sel = xpath_data['size_pos'][current_selected_i];
ctx.strokeRect(current_selected_i.left * x_scale, current_selected_i.top * y_scale, current_selected_i.width * x_scale, current_selected_i.height * y_scale);
ctx.fillRect(current_selected_i.left * x_scale, current_selected_i.top * y_scale, current_selected_i.width * x_scale, current_selected_i.height * y_scale);
}
}.debounce(10));
});
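
One side of this hunk collects every element whose bounding box contains the cursor (skipping elements as wide as the whole page) and then prefers interactive tags such as a, input, textarea and button. A minimal Python sketch of that selection logic, assuming each entry of xpath_data['size_pos'] carries left, top, width, height and tagName keys as the JavaScript does:

```python
INTERACTIVE_TAGS = {"a", "input", "textarea", "button"}

def pick_element(size_pos, offset_x, offset_y, x_scale, y_scale, browser_width):
    """Return the index of the most useful element under the cursor, or None."""
    candidates = []
    for i, el in enumerate(size_pos):
        inside_y = el['top'] * y_scale < offset_y < (el['top'] + el['height']) * y_scale
        inside_x = el['left'] * x_scale < offset_x < (el['left'] + el['width']) * x_scale
        # Ignore page-wide wrappers (body, outer divs), like the JS width check
        if inside_x and inside_y and el['width'] < browser_width:
            candidates.append(i)
    if not candidates:
        return None
    chosen = candidates[0]
    for i in candidates:
        # As in the JS forEach, the last interactive hit wins; otherwise keep the first hit
        if size_pos[i]['tagName'] in INTERACTIVE_TAGS:
            chosen = i
    return chosen
```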
@@ -195,16 +195,16 @@ $(document).ready(function () {
// callback for clicking on an xpath on the canvas
function process_selected(selected_in_xpath_list) {
function process_selected(xpath_data_index) {
found_something = false;
var first_available = $("ul#browser_steps li.empty").first();
if (selected_in_xpath_list !== false) {
if (xpath_data_index !== false) {
// Nothing focused, so fill in a new one
// if inpt type button or <button>
// from the top, find the next not used one and use it
var x = selected_in_xpath_list;
var x = xpath_data['size_pos'][xpath_data_index];
console.log(x);
if (x && first_available.length) {
// @todo will it let you click shit that has a layer on top? probably not.
@@ -214,18 +214,26 @@ $(document).ready(function () {
$('input[placeholder="Value"]', first_available).addClass('ok').click().focus();
found_something = true;
} else {
// There's no good way (that I know) to find if this element really has a click event listener attached
// see https://stackoverflow.com/questions/446892/how-to-find-event-listeners-on-a-dom-node-in-javascript-or-in-debugging
// https://codepen.io/azaslavsky/pen/DEJVWv
// So we don't know if it's really a clickable element or not :-(
// Assume it is - then we don't fill the pages with unreliable "Click X,Y" selections
// If you switch to "Click X,y" after an element here is set up, it will give the last co-ords anyway
//if (x['isClickable'] || x['tagName'].startsWith('h') || x['tagName'] === 'a' || x['tagName'] === 'button' || x['tagtype'] === 'submit' || x['tagtype'] === 'checkbox' || x['tagtype'] === 'radio' || x['tagtype'] === 'li') {
if (x['isClickable'] || x['tagName'].startsWith('h') || x['tagName'] === 'a' || x['tagName'] === 'button' || x['tagtype'] === 'submit' || x['tagtype'] === 'checkbox' || x['tagtype'] === 'radio' || x['tagtype'] === 'li') {
$('select', first_available).val('Click element').change();
$('input[type=text]', first_available).first().val(x['xpath']);
found_something = true;
//}
}
}
first_available.xpath_data_index = xpath_data_index;
if (!found_something) {
if (include_text_elements[0].checked === true) {
// Suggest that we use as filter?
// @todo filters should always be in the last steps, nothing non-filter after it
found_something = true;
ctx.strokeStyle = 'rgba(0,0,255, 0.9)';
ctx.fillStyle = 'rgba(0,0,255, 0.1)';
$('select', first_available).val('Extract text and use as filter').change();
$('input[type=text]', first_available).first().val(x['xpath']);
include_text_elements[0].checked = false;
}
}
}
}
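
One version of process_selected() receives an index into xpath_data['size_pos'] and decides which browser step to pre-fill: clickable-looking elements become a 'Click element' step, and otherwise, when the text-elements checkbox is ticked, the xpath is offered as an 'Extract text and use as filter' step. A rough Python restatement of that decision (the function name and return shape are illustrative):

```python
def choose_step(el, include_text_elements_checked):
    """Map a selected element record to a browser-steps action (sketch only)."""
    clickable = (
        el.get('isClickable')
        or el['tagName'].startswith('h')
        or el['tagName'] in ('a', 'button')
        or el.get('tagtype') in ('submit', 'checkbox', 'radio', 'li')
    )
    if clickable:
        return ('Click element', el['xpath'])
    if include_text_elements_checked:
        # Fall back to using the element's text content as a filter
        return ('Extract text and use as filter', el['xpath'])
    return None
```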
@@ -240,7 +248,7 @@ $(document).ready(function () {
function start() {
console.log("Starting browser-steps UI");
browsersteps_session_id = false;
browsersteps_session_id = Date.now();
// @todo This setting of the first one should be done at the data layer, but wtforms doesn't want to play nice
$('#browser_steps >li:first-child').removeClass('empty');
set_first_gotosite_disabled();
@@ -248,7 +256,7 @@ $(document).ready(function () {
$('.clear,.remove', $('#browser_steps >li:first-child')).hide();
$.ajax({
type: "GET",
url: browser_steps_start_url,
url: browser_steps_sync_url + "&browsersteps_session_id=" + browsersteps_session_id,
statusCode: {
400: function () {
// More than likely the CSRF token was lost when the server restarted
@@ -256,12 +264,12 @@ $(document).ready(function () {
}
}
}).done(function (data) {
xpath_data = data.xpath_data;
$("#loading-status-text").fadeIn();
browsersteps_session_id = data.browsersteps_session_id;
// This should trigger 'Goto site'
console.log("Got startup response, requesting Goto-Site (first) step fake click");
$('#browser_steps >li:first-child .apply').click();
browserless_seconds_remaining = 500;
browserless_seconds_remaining = data.browser_time_remaining;
set_first_gotosite_disabled();
}).fail(function (data) {
console.log(data);
@@ -422,6 +430,7 @@ $(document).ready(function () {
apply_buttons_disabled = false;
$("#browsersteps-img").css('opacity', 1);
$('ul#browser_steps li .control .apply').css('opacity', 1);
browserless_seconds_remaining = data.browser_time_remaining;
$("#loading-status-text").hide();
set_first_gotosite_disabled();
}).fail(function (data) {
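
One side of these hunks has the client generate its own browsersteps_session_id (a millisecond timestamp, like Date.now()), pass it to the sync endpoint, and take the remaining browser time from the server's browser_time_remaining field instead of a hard-coded 500 seconds. A hypothetical Python client showing the same round trip; the requests usage and function name are purely illustrative:

```python
import time
import requests

def start_browsersteps_session(sync_url):
    """Sketch of the startup handshake the jQuery start() function performs."""
    session_id = int(time.time() * 1000)  # like Date.now() in the browser
    resp = requests.get(sync_url, params={'browsersteps_session_id': session_id})
    resp.raise_for_status()
    data = resp.json()
    # The server reports how long the remote browser session may still run
    return session_id, data['browser_time_remaining']
```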

View File

@@ -26,6 +26,9 @@ $(document).ready(function() {
data = {
window_url : window.location.href,
notification_urls : $('.notification-urls').val(),
notification_title : $('.notification-title').val(),
notification_body : $('.notification-body').val(),
notification_format : $('.notification-format').val(),
}
for (key in data) {
if (!data[key].length) {
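
The added fields post the notification title, body and format alongside the URL list, and the truncated loop at the end of the hunk appears to skip fields left empty. The same payload shaping as a small Python sketch (field names mirror the JavaScript; the filtering behaviour is an assumption based on the visible length check):

```python
def build_notification_test_payload(window_url, urls, title, body, fmt):
    """Collect the notification test fields and drop anything left blank (sketch)."""
    data = {
        'window_url': window_url,
        'notification_urls': urls,
        'notification_title': title,
        'notification_body': body,
        'notification_format': fmt,
    }
    # Assumed: only keep fields the user actually filled in, like the JS length check
    return {k: v for k, v in data.items() if v}
```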

View File

@@ -12,7 +12,7 @@ window.addEventListener('hashchange', function () {
var has_errors = document.querySelectorAll(".messages .error");
if (!has_errors.length) {
if (document.location.hash == "") {
location.replace(document.querySelector(".tabs ul li:first-child a").hash);
document.querySelector(".tabs ul li:first-child a").click();
} else {
set_active_tab();
}

View File

@@ -3,7 +3,7 @@
* Toggles theme between light and dark mode.
*/
$(document).ready(function () {
const button = document.getElementById("toggle-light-mode");
const button = document.getElementsByClassName("toggle-theme")[0];
button.onclick = () => {
const htmlElement = document.getElementsByTagName("html");
@@ -21,33 +21,4 @@ $(document).ready(function () {
const setCookieValue = (value) => {
document.cookie = `css_dark_mode=${value};max-age=31536000;path=/`
}
// Search input box behaviour
const toggle_search = document.getElementById("toggle-search");
const search_q = document.getElementById("search-q");
window.addEventListener('keydown', function (e) {
if (e.altKey == true && e.keyCode == 83)
search_q.classList.toggle('expanded');
search_q.focus();
});
search_q.onkeydown = (e) => {
var key = e.keyCode || e.which;
if (key === 13) {
document.searchForm.submit();
}
};
toggle_search.onclick = () => {
// Could be that they want to search something once text is in there
if (search_q.value.length) {
document.searchForm.submit();
} else {
// If not..
search_q.classList.toggle('expanded');
search_q.focus();
}
};
});

View File

@@ -61,12 +61,7 @@ $(document).ready(function () {
function bootstrap_visualselector() {
if (1) {
// bootstrap it, this will trigger everything else
$("img#selector-background").on("error", function () {
$('.fetching-update-notice').html("<strong>Ooops!</strong> The VisualSelector tool needs atleast one fetched page, please unpause the watch and/or wait for the watch to complete fetching and then reload this page.");
$('.fetching-update-notice').css('color','#bb0000');
$('#selector-current-xpath').hide();
$('#clear-selector').hide();
}).bind('load', function () {
$("img#selector-background").bind('load', function () {
console.log("Loaded background...");
c = document.getElementById("selector-canvas");
// greyed out fill context
@@ -84,11 +79,10 @@ $(document).ready(function () {
}).attr("src", screenshot_url);
}
// Tell visualSelector that the image should update
var s = $("img#selector-background").attr('src') + "?" + new Date().getTime();
$("img#selector-background").attr('src', s)
var s = $("img#selector-background").attr('src')+"?"+ new Date().getTime();
$("img#selector-background").attr('src',s)
}
// This is fired once the img src is loaded in bootstrap_visualselector()
function fetch_data() {
// Image is ready
$('.fetching-update-notice').html("Fetching element data..");
@@ -105,8 +99,7 @@ $(document).ready(function () {
reflow_selector();
$('.fetching-update-notice').fadeOut();
});
}
};
function set_scale() {

View File

@@ -1,37 +0,0 @@
.pagination-page-info {
color: #fff;
font-size: 0.85rem;
text-transform: capitalize;
}
.pagination.menu {
> * {
display: inline-block;
}
li {
display: inline-block;
}
a {
padding: 0.65rem;
margin: 3px;
border: none;
background: #444;
border-radius: 2px;
color: var(--color-text-button);
&.disabled {
display: none;
}
&.active {
font-weight: bold;
background: #888;
}
&:hover {
background: #999;
}
}
}

View File

@@ -5,7 +5,6 @@
@import "parts/_arrows";
@import "parts/_browser-steps";
@import "parts/_extra_proxies";
@import "parts/_pagination";
@import "parts/_spinners";
@import "parts/_variables";
@@ -54,47 +53,8 @@ a.github-link {
}
}
#toggle-light-mode {
width: 3rem;
.icon-dark {
display: none;
}
&.dark {
.icon-light {
display: none;
}
.icon-dark {
display: block;
}
}
}
#toggle-search {
width: 2rem;
}
#search-q {
opacity: 0;
-webkit-transition: all .9s ease;
-moz-transition: all .9s ease;
transition: all .9s ease;
width: 0;
display: none;
&.expanded {
width: auto;
display: inline-block;
opacity: 1;
}
}
#search-result-info {
color: #fff;
}
button.toggle-button {
vertical-align: middle;
button.toggle-theme {
width: 4rem;
background: transparent;
border: none;
cursor: pointer;
@@ -113,7 +73,19 @@ button.toggle-button {
display: block;
}
.icon-dark {
display: none;
}
&.dark {
.icon-light {
display: none;
}
.icon-dark {
display: block;
}
}
}
.pure-menu-horizontal {

View File

@@ -95,32 +95,6 @@ ul#requests-extra_proxies {
ul#requests-extra_proxies table tr {
display: inline; }
.pagination-page-info {
color: #fff;
font-size: 0.85rem;
text-transform: capitalize; }
.pagination.menu > * {
display: inline-block; }
.pagination.menu li {
display: inline-block; }
.pagination.menu a {
padding: 0.65rem;
margin: 3px;
border: none;
background: #444;
border-radius: 2px;
color: var(--color-text-button); }
.pagination.menu a.disabled {
display: none; }
.pagination.menu a.active {
font-weight: bold;
background: #888; }
.pagination.menu a:hover {
background: #999; }
/* spinner */
.spinner,
.spinner:after {
@@ -331,44 +305,23 @@ a.github-link {
a.github-link:hover {
color: var(--color-icon-github-hover); }
#toggle-light-mode {
width: 3rem; }
#toggle-light-mode .icon-dark {
display: none; }
#toggle-light-mode.dark .icon-light {
display: none; }
#toggle-light-mode.dark .icon-dark {
display: block; }
#toggle-search {
width: 2rem; }
#search-q {
opacity: 0;
-webkit-transition: all .9s ease;
-moz-transition: all .9s ease;
transition: all .9s ease;
width: 0;
display: none; }
#search-q.expanded {
width: auto;
display: inline-block;
opacity: 1; }
#search-result-info {
color: #fff; }
button.toggle-button {
vertical-align: middle;
button.toggle-theme {
width: 4rem;
background: transparent;
border: none;
cursor: pointer;
color: var(--color-icon-github); }
button.toggle-button:hover {
button.toggle-theme:hover {
color: var(--color-icon-github-hover); }
button.toggle-button svg {
button.toggle-theme svg {
fill: currentColor; }
button.toggle-button .icon-light {
button.toggle-theme .icon-light {
display: block; }
button.toggle-theme .icon-dark {
display: none; }
button.toggle-theme.dark .icon-light {
display: none; }
button.toggle-theme.dark .icon-dark {
display: block; }
.pure-menu-horizontal {

View File

@@ -3,7 +3,7 @@ from flask import (
)
from . model import App, Watch
from copy import deepcopy, copy
from copy import deepcopy
from os import path, unlink
from threading import Lock
import json
@@ -204,16 +204,15 @@ class ChangeDetectionStore:
# GitHub #30 also delete history records
for uuid in self.data['watching']:
path = pathlib.Path(os.path.join(self.datastore_path, uuid))
if os.path.exists(path):
shutil.rmtree(path)
shutil.rmtree(path)
self.needs_write_urgent = True
else:
path = pathlib.Path(os.path.join(self.datastore_path, uuid))
if os.path.exists(path):
shutil.rmtree(path)
shutil.rmtree(path)
del self.data['watching'][uuid]
self.needs_write_urgent = True
self.needs_write_urgent = True
# Clone a watch by UUID
def clone(self, uuid):
@@ -317,8 +316,7 @@ class ChangeDetectionStore:
# #Re 569
new_watch = Watch.model(datastore_path=self.datastore_path, default={
'url': url,
'tag': tag,
'date_created': int(time.time())
'tag': tag
})
new_uuid = new_watch['uuid']
@@ -367,21 +365,19 @@ class ChangeDetectionStore:
def save_error_text(self, watch_uuid, contents):
if not self.data['watching'].get(watch_uuid):
return
self.data['watching'][watch_uuid].ensure_data_dir_exists()
target_path = os.path.join(self.datastore_path, watch_uuid, "last-error.txt")
with open(target_path, 'w') as f:
f.write(contents)
def save_xpath_data(self, watch_uuid, data, as_error=False):
if not self.data['watching'].get(watch_uuid):
return
if as_error:
target_path = os.path.join(self.datastore_path, watch_uuid, "elements-error.json")
else:
target_path = os.path.join(self.datastore_path, watch_uuid, "elements.json")
self.data['watching'][watch_uuid].ensure_data_dir_exists()
with open(target_path, 'w') as f:
f.write(json.dumps(data))
f.close()
@@ -475,6 +471,8 @@ class ChangeDetectionStore:
return proxy_list if len(proxy_list) else None
def get_preferred_proxy_for_watch(self, uuid):
"""
Returns the preferred proxy by ID key
@@ -506,25 +504,6 @@ class ChangeDetectionStore:
return None
@property
def has_extra_headers_file(self):
filepath = os.path.join(self.datastore_path, 'headers.txt')
return os.path.isfile(filepath)
def get_all_headers(self):
from .model.App import parse_headers_from_text_file
headers = copy(self.data['settings'].get('headers', {}))
filepath = os.path.join(self.datastore_path, 'headers.txt')
try:
if os.path.isfile(filepath):
headers.update(parse_headers_from_text_file(filepath))
except Exception as e:
print(f"ERROR reading headers.txt at {filepath}", str(e))
return headers
# Run all updates
# IMPORTANT - Each update could be run even when they have a new install and the schema is correct
# So therefore - each `update_n` should be very careful about checking if it needs to actually run
@@ -700,13 +679,3 @@ class ChangeDetectionStore:
except:
continue
return
# We don't know when the date_created was in the past until now, so just add an index number for now.
def update_11(self):
i = 0
for uuid, watch in self.data['watching'].items():
if not watch.get('date_created'):
watch['date_created'] = i
i+=1
return
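
The get_all_headers() helper shown in this hunk merges headers parsed from a datastore-level headers.txt over the headers stored in settings; parse_headers_from_text_file itself lives in model/App.py and is not shown here. A minimal sketch of that merge, assuming the text file holds one 'Header-Name: value' pair per line as the header tests elsewhere in this compare suggest:

```python
import os

def parse_headers_from_text_file(filepath):
    """Assumed format: one 'Header-Name: value' pair per line (sketch only)."""
    headers = {}
    with open(filepath) as f:
        for line in f:
            if ':' in line:
                name, value = line.split(':', 1)
                headers[name.strip()] = value.strip()
    return headers

def get_all_headers(settings_headers, datastore_path):
    """Start from the settings headers; entries from headers.txt win on conflict."""
    headers = dict(settings_headers)
    filepath = os.path.join(datastore_path, 'headers.txt')
    if os.path.isfile(filepath):
        headers.update(parse_headers_from_text_file(filepath))
    return headers
```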

View File

@@ -23,7 +23,7 @@
<div class="notifications-wrapper">
<a id="send-test-notification" class="pure-button button-secondary button-xsmall" >Send test notification</a>
{% if emailprefix %}
<a id="add-email-helper" class="pure-button button-secondary button-xsmall" >Add email <img style="height: 1em; display: inline-block" src="{{url_for('static_content', group='images', filename='email.svg')}}" alt="Add an email address"> </a>
<a id="add-email-helper" class="pure-button button-secondary button-xsmall" >Add email</a>
{% endif %}
<a href="{{url_for('notification_logs')}}" class="pure-button button-secondary button-xsmall" >Notification debug logs</a>
</div>
@@ -115,7 +115,7 @@
URLs generated by changedetection.io (such as <code>{{ '{{diff_url}}' }}</code>) require the <code>BASE_URL</code> environment variable set.<br>
Your <code>BASE_URL</code> var is currently "{{settings_application['current_base_url']}}"
<br>
Warning: Contents of <code>{{ '{{diff}}' }}</code>, <code>{{ '{{diff_removed}}' }}</code>, and <code>{{ '{{diff_added}}' }}</code> depend on how the difference algorithm perceives the change. For example, an addition or removal could be perceived as a change in some cases. <a target="_new" href="https://github.com/dgtlmoon/changedetection.io/wiki/Using-the-%7B%7Bdiff%7D%7D,-%7B%7Bdiff_added%7D%7D,-and-%7B%7Bdiff_removed%7D%7D-notification-tokens">More Here</a> <br>
Warning: Contents of <code>{{ '{{diff}}' }}</code>, <code>{{ '{{diff_removed}}' }}</code>, and <code>{{ '{{diff_added}}' }}</code> depend on how the difference algorithm perceives the change. For example, an addition or removal could be perceived as a change in some cases. <a target="_new" href="https://github.com/dgtlmoon/changedetection.io/wiki/Using-the-%7B%7Bdiff%7D%7D,-%7B%7Bdiff_added%7D%7D,-and-%7B%7Bdiff_removal%7D%7D-notification-tokens">More Here</a> </br>
</div>
</div>
</div>

View File

@@ -0,0 +1,7 @@
{% macro pagination(sorted_watches, total_per_page, current_page) %}
{{ sorted_watches|length }}
{% for row in sorted_watches|batch(total_per_page, '&nbsp;') %}
{{ loop.index}}
{% endfor %}
{% endmacro %}
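
The new _pagination.jinja macro leans on Jinja's batch filter to split the sorted watch list into fixed-size pages. The same arithmetic in Python for reference; the function name is illustrative, and the '0 disables paging' behaviour follows the pager_size help text in the settings diff elsewhere in this compare:

```python
def paginate(items, per_page, page):
    """Return the slice of items for a zero-based page number, like |batch."""
    if per_page <= 0:
        return list(items)  # 0 disables pagination, per the pager_size help text
    start = page * per_page
    return list(items[start:start + per_page])

# Example: 3 items per page, second page (page index 1)
assert paginate(list(range(10)), 3, 1) == [3, 4, 5]
```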

View File

@@ -2,35 +2,35 @@
<html lang="en" data-darkmode="{{ get_darkmode_state() }}">
<head>
<meta charset="utf-8" >
<meta name="viewport" content="width=device-width, initial-scale=1.0" >
<meta name="description" content="Self hosted website change detection." >
<meta charset="utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<meta name="description" content="Self hosted website change detection."/>
<title>Change Detection{{extra_title}}</title>
<link rel="alternate" type="application/rss+xml" title="Changedetection.io » Feed{% if active_tag %}- {{active_tag}}{% endif %}" href="{{ url_for('rss', tag=active_tag , token=app_rss_token)}}" >
<link rel="stylesheet" href="{{url_for('static_content', group='styles', filename='pure-min.css')}}" >
<link rel="stylesheet" href="{{url_for('static_content', group='styles', filename='styles.css')}}" >
<link rel="alternate" type="application/rss+xml" title="Changedetection.io » Feed{% if active_tag %}- {{active_tag}}{% endif %}" href="{{ url_for('rss', tag=active_tag , token=app_rss_token)}}"/>
<link rel="stylesheet" href="{{url_for('static_content', group='styles', filename='pure-min.css')}}"/>
<link rel="stylesheet" href="{{url_for('static_content', group='styles', filename='styles.css')}}"/>
{% if extra_stylesheets %}
{% for m in extra_stylesheets %}
<link rel="stylesheet" href="{{ m }}?ver=1000" >
<link rel="stylesheet" href="{{ m }}?ver=1000"/>
{% endfor %}
{% endif %}
<link rel="apple-touch-icon" sizes="180x180" href="{{url_for('static_content', group='favicons', filename='apple-touch-icon.png')}}">
<link rel="icon" type="image/png" sizes="32x32" href="{{url_for('static_content', group='favicons', filename='favicon-32x32.png')}}">
<link rel="icon" type="image/png" sizes="16x16" href="{{url_for('static_content', group='favicons', filename='favicon-16x16.png')}}">
<link rel="manifest" href="{{url_for('static_content', group='favicons', filename='site.webmanifest')}}">
<link rel="mask-icon" href="{{url_for('static_content', group='favicons', filename='safari-pinned-tab.svg')}}" color="#5bbad5">
<link rel="shortcut icon" href="{{url_for('static_content', group='favicons', filename='favicon.ico')}}">
<meta name="msapplication-TileColor" content="#da532c">
<meta name="msapplication-config" content="favicons/browserconfig.xml">
<meta name="theme-color" content="#ffffff">
<link rel="apple-touch-icon" sizes="180x180" href="{{url_for('static_content', group='favicons', filename='apple-touch-icon.png')}}"/>
<link rel="icon" type="image/png" sizes="32x32" href="{{url_for('static_content', group='favicons', filename='favicon-32x32.png')}}"/>
<link rel="icon" type="image/png" sizes="16x16" href="{{url_for('static_content', group='favicons', filename='favicon-16x16.png')}}"/>
<link rel="manifest" href="{{url_for('static_content', group='favicons', filename='site.webmanifest')}}"/>
<link rel="mask-icon" href="{{url_for('static_content', group='favicons', filename='safari-pinned-tab.svg')}}" color="#5bbad5"/>
<link rel="shortcut icon" href="{{url_for('static_content', group='favicons', filename='favicon.ico')}}"/>
<meta name="msapplication-TileColor" content="#da532c"/>
<meta name="msapplication-config" content="favicons/browserconfig.xml"/>
<meta name="theme-color" content="#ffffff"/>
<style>
body::before {
background-image: url({{url_for('static_content', group='images', filename='gradient-border.png') }});
}
</style>
<script src="{{url_for('static_content', group='js', filename='jquery-3.6.0.min.js')}}"></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='jquery-3.6.0.min.js')}}"></script>
</head>
<body>
@@ -82,21 +82,11 @@
<a href="{{url_for('logout')}}" class="pure-menu-link">LOG OUT</a>
</li>
{% endif %}
<li class="pure-menu-item pure-form" id="search-menu-item">
<!-- We use GET here so it offers people a chance to set bookmarks etc -->
<form name="searchForm" action="" method="GET">
<input id="search-q" class="" name="q" placeholder="URL or Title {% if active_tag %}in '{{ active_tag }}'{% endif %}" required="" type="text" value="">
<input name="tag" type="hidden" value="{% if active_tag %}{{active_tag}}{% endif %}">
<button class="toggle-button " id="toggle-search" type="button" title="Search, or Use Alt+S Key" >
{% include "svgs/search-icon.svg" %}
</button>
</form>
</li>
<li class="pure-menu-item">
{% if dark_mode %}
{% set darkClass = 'dark' %}
{% endif %}
<button class="toggle-button {{darkClass}}" id ="toggle-light-mode" type="button" title="Toggle Light/Dark Mode">
<button class="toggle-theme {{darkClass}}" type="button" title="Toggle Light/Dark Mode">
<span class="visually-hidden">Toggle light/dark mode</span>
<span class="icon-light">
{% include "svgs/light-mode-toggle-icon.svg" %}
@@ -116,7 +106,7 @@
</div>
{% if hosted_sticky %}
<div class="sticky-tab" id="hosted-sticky">
<a href="https://changedetection.io/?ref={{guid}}">Let us host your instance!</a>
<a href="https://lemonade.changedetection.io/start?ref={{guid}}">Let us host your instance!</a>
</div>
{% endif %}
{% if left_sticky %}
@@ -147,13 +137,16 @@
<li class="message">
Share this link:
<span id="share-link">{{ session['share-link'] }}</span>
<img style="height: 1em; display: inline-block" src="{{url_for('static_content', group='images', filename='copy.svg')}}" >
<img style="height: 1em; display: inline-block" src="{{url_for('static_content', group='images', filename='copy.svg')}}"/>
</li>
</ul>
{% endif %}
{% block content %}{% endblock %}
</section>
<script src="{{url_for('static_content', group='js', filename='toggle-theme.js')}}" defer></script>
<script
type="text/javascript"
src="{{url_for('static_content', group='js', filename='toggle-theme.js')}}"
defer></script>
</body>
</html>

View File

@@ -6,7 +6,7 @@
action="{{url_for('clear_all_history')}}"
method="POST"
>
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" >
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
<fieldset>
<div class="pure-control-group">
This will remove version history (snapshots) for ALL watches, but keep

View File

@@ -7,7 +7,7 @@
const error_screenshot_url="{{url_for('static_content', group='screenshot', filename=uuid, error_screenshot=1) }}";
{% endif %}
</script>
<script src="{{url_for('static_content', group='js', filename='diff-overview.js')}}" defer></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='diff-overview.js')}}" defer></script>
<div id="settings">
<h1>Differences</h1>
@@ -15,15 +15,15 @@
<fieldset>
<label for="diffWords" class="pure-checkbox">
<input type="radio" name="diff_type" id="diffWords" value="diffWords"> Words</label>
<input type="radio" name="diff_type" id="diffWords" value="diffWords"/> Words</label>
<label for="diffLines" class="pure-checkbox">
<input type="radio" name="diff_type" id="diffLines" value="diffLines" checked=""> Lines</label>
<input type="radio" name="diff_type" id="diffLines" value="diffLines" checked=""/> Lines</label>
<label for="diffChars" class="pure-checkbox">
<input type="radio" name="diff_type" id="diffChars" value="diffChars"> Chars</label>
<input type="radio" name="diff_type" id="diffChars" value="diffChars"/> Chars</label>
<!-- @todo - when mimetype is JSON, select this by default? -->
<label for="diffJson" class="pure-checkbox">
<input type="radio" name="diff_type" id="diffJson" value="diffJson" > JSON</label>
<input type="radio" name="diff_type" id="diffJson" value="diffJson" /> JSON</label>
{% if versions|length >= 1 %}
<label for="diff-version">Compare newest (<span id="current-v-date"></span>) with</label>
@@ -43,7 +43,7 @@
<span>
<!-- https://github.com/kpdecker/jsdiff/issues/389 ? -->
<label for="ignoreWhitespace" class="pure-checkbox" id="label-diff-ignorewhitespace">
<input type="checkbox" id="ignoreWhitespace" name="ignoreWhitespace" > Ignore Whitespace</label>
<input type="checkbox" id="ignoreWhitespace" name="ignoreWhitespace"/> Ignore Whitespace</label>
</span>
</div>
@@ -51,7 +51,7 @@
<a onclick="next_diff();">Jump</a>
</div>
<script src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
<div class="tabs">
<ul>
{% if last_error_text %}<li class="tab" id="error-text-tab"><a href="#error-text">Error Text</a></li> {% endif %}
@@ -72,7 +72,7 @@
<div class="tab-pane-inner" id="error-screenshot">
<div class="snapshot-age error">{{watch_a.snapshot_error_screenshot_ctime|format_seconds_ago}} seconds ago</div>
<img id="error-screenshot-img" style="max-width: 80%" alt="Current error-ing screenshot from most recent request" >
<img id="error-screenshot-img" style="max-width: 80%" alt="Current error-ing screenshot from most recent request"/>
</div>
<div class="tab-pane-inner" id="text">
@@ -105,7 +105,7 @@
{% if is_html_webdriver %}
{% if screenshot %}
<div class="snapshot-age">{{watch_a.snapshot_screenshot_ctime|format_timestamp_timeago}}</div>
<img style="max-width: 80%" id="screenshot-img" alt="Current screenshot from most recent request" >
<img style="max-width: 80%" id="screenshot-img" alt="Current screenshot from most recent request"/>
{% else %}
No screenshot available just yet! Try rechecking the page.
{% endif %}
@@ -117,7 +117,7 @@
<form id="extract-data-form" class="pure-form pure-form-stacked edit-form"
action="{{ url_for('diff_history_page', uuid=uuid) }}#extract"
method="POST">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<p>This tool will extract text data from all of the watch history.</p>
@@ -149,9 +149,9 @@
<script>
const newest_version_timestamp = {{newest_version_timestamp}};
</script>
<script src="{{url_for('static_content', group='js', filename='diff.min.js')}}"></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='diff.min.js')}}"></script>
<script src="{{url_for('static_content', group='js', filename='diff-render.js')}}"></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='diff-render.js')}}"></script>
{% endblock %}

View File

@@ -2,7 +2,7 @@
{% block content %}
{% from '_helpers.jinja' import render_field, render_checkbox_field, render_button %}
{% from '_common_fields.jinja' import render_common_settings_form %}
<script src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
<script>
const notification_base_url="{{url_for('ajax_callback_send_notification_test')}}";
const watch_visual_selector_data_url="{{url_for('static_content', group='visual_selector_data', filename=uuid)}}";
@@ -14,17 +14,15 @@
{% endif %}
const browser_steps_config=JSON.parse('{{ browser_steps_config|tojson }}');
const browser_steps_start_url="{{url_for('browser_steps.browsersteps_start_session', uuid=uuid)}}";
const browser_steps_sync_url="{{url_for('browser_steps.browsersteps_ui_update', uuid=uuid)}}";
</script>
<script src="{{url_for('static_content', group='js', filename='watch-settings.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='limit.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='visual-selector.js')}}" defer></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='watch-settings.js')}}" defer></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='limit.js')}}" defer></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='visual-selector.js')}}" defer></script>
{% if playwright_enabled %}
<script src="{{url_for('static_content', group='js', filename='browser-steps.js')}}" defer></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='browser-steps.js')}}" defer></script>
{% endif %}
<div class="edit-form monospaced-textarea">
@@ -52,7 +50,7 @@
<div class="box-wrap inner">
<form class="pure-form pure-form-stacked"
action="{{ url_for('edit_page', uuid=uuid, next = request.args.get('next'), unpause_on_save = request.args.get('unpause_on_save')) }}" method="POST">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<div class="tab-pane-inner" id="general">
<fieldset>
@@ -152,17 +150,6 @@
{{ render_field(form.headers, rows=5, placeholder="Example
Cookie: foobar
User-Agent: wonderbra 1.0") }}
<div class="pure-form-message-inline">
{% if has_extra_headers_file %}
<strong>Alert! Extra headers file found and will be added to this watch!</strong>
{% else %}
Headers can also be read from a file in your data directory <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Adding-headers-from-an-external-file">Read more here</a>
{% endif %}
<br>
(Not supported by Selenium browser)
</div>
</div>
<div class="pure-control-group" id="request-body">
{{ render_field(form.body, rows=5, placeholder="Example
@@ -176,7 +163,7 @@ User-Agent: wonderbra 1.0") }}
</div>
{% if playwright_enabled %}
<div class="tab-pane-inner" id="browser-steps">
<img class="beta-logo" src="{{url_for('static_content', group='images', filename='beta-logo.png')}}" alt="New beta functionality">
<img class="beta-logo" src="{{url_for('static_content', group='images', filename='beta-logo.png')}}">
<fieldset>
<div class="pure-control-group">
<!--
@@ -199,12 +186,11 @@ User-Agent: wonderbra 1.0") }}
<span class="loader" >
<span id="browsersteps-click-start">
<h2 >Click here to Start</h2>
<svg style="height: 3.5rem;" version="1.1" viewBox="0 0 32 32" xml:space="preserve" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g id="Layer_1"/><g id="play_x5F_alt"><path d="M16,0C7.164,0,0,7.164,0,16s7.164,16,16,16s16-7.164,16-16S24.836,0,16,0z M10,24V8l16.008,8L10,24z" style="fill: var(--color-grey-400);"/></g></svg><br>
Please allow 10-15 seconds for the browser to connect.<br>
Please allow 10-15 seconds for the browser to connect.
</span>
<div class="spinner" style="display: none;"></div>
</span>
<img class="noselect" id="browsersteps-img" src="" style="max-width: 100%; width: 100%;" >
<img class="noselect" id="browsersteps-img" src="" style="max-width: 100%; width: 100%;" />
<canvas class="noselect" id="browsersteps-selector-canvas" style="max-width: 100%; width: 100%;"></canvas>
</div>
</div>
@@ -234,7 +220,7 @@ User-Agent: wonderbra 1.0") }}
<div class="field-group" id="notification-field-group">
{% if has_default_notification_urls %}
<div class="inline-warning">
<img class="inline-warning-icon" src="{{url_for('static_content', group='images', filename='notice.svg')}}" alt="Look out!" title="Lookout!" >
<img class="inline-warning-icon" src="{{url_for('static_content', group='images', filename='notice.svg')}}" alt="Look out!" title="Lookout!"/>
There are <a href="{{ url_for('settings_page')}}#notifications">system-wide notification URLs enabled</a>, this form will override notification settings for this watch only &dash; an empty Notification URL list here will still send notifications.
</div>
{% endif %}
@@ -404,7 +390,7 @@ Unavailable") }}
{% if watch['processor'] == 'text_json_diff' %}
<div class="tab-pane-inner visual-selector-ui" id="visualselector">
<img class="beta-logo" src="{{url_for('static_content', group='images', filename='beta-logo.png')}}" alt="New beta functionality">
<img class="beta-logo" src="{{url_for('static_content', group='images', filename='beta-logo.png')}}">
<fieldset>
<div class="pure-control-group">
@@ -421,7 +407,7 @@ Unavailable") }}
<!-- request the screenshot and get the element offset info ready -->
<!-- use img src ready load to know everything is ready to map out -->
<!-- @todo: maybe something interesting like a field to select 'elements that contain text... and their parents n' -->
<img id="selector-background" >
<img id="selector-background" />
<canvas id="selector-canvas"></canvas>
</div>
<div id="selector-current-xpath" style="overflow-x: hidden"><strong>Currently:</strong>&nbsp;<span class="text">Loading...</span></div>

View File

@@ -1,7 +1,7 @@
{% extends 'base.html' %}
{% block content %}
{% from '_helpers.jinja' import render_field %}
<script src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
<div class="edit-form monospaced-textarea">
<div class="tabs collapsable">
@@ -13,7 +13,7 @@
<div class="box-wrap inner">
<form class="pure-form pure-form-aligned" action="{{url_for('import_page')}}" method="POST">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<div class="tab-pane-inner" id="url-list">
<legend>
Enter one URL per line, and optionally add tags for each URL after a space, separated by commas

View File

@@ -4,13 +4,13 @@
<div class="login-form">
<div class="inner">
<form class="pure-form pure-form-stacked" action="{{url_for('login')}}" method="POST">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<fieldset>
<div class="pure-control-group">
<label for="password">Password</label>
<input type="password" id="password" required="" name="password" value=""
size="15" autofocus />
<input type="hidden" id="email" name="email" value="defaultuser@changedetection.io" >
<input type="hidden" id="email" name="email" value="defaultuser@changedetection.io" />
</div>
<div class="pure-control-group">
<button type="submit" class="pure-button pure-button-primary">Login</button>

View File

@@ -7,9 +7,9 @@
const error_screenshot_url="{{url_for('static_content', group='screenshot', filename=uuid, error_screenshot=1) }}";
{% endif %}
</script>
<script src="{{url_for('static_content', group='js', filename='diff-overview.js')}}" defer></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='diff-overview.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
<div class="tabs">
<ul>
{% if last_error_text %}<li class="tab" id="error-text-tab"><a href="#error-text">Error Text</a></li> {% endif %}
@@ -31,7 +31,7 @@
<div class="tab-pane-inner" id="error-screenshot">
<div class="snapshot-age error">{{watch.snapshot_error_screenshot_ctime|format_seconds_ago}} seconds ago</div>
<img id="error-screenshot-img" style="max-width: 80%" alt="Current erroring screenshot from most recent request" >
<img id="error-screenshot-img" style="max-width: 80%" alt="Current erroring screenshot from most recent request"/>
</div>
<div class="tab-pane-inner" id="text">
@@ -58,7 +58,7 @@
{% if is_html_webdriver %}
{% if screenshot %}
<div class="snapshot-age">{{watch.snapshot_screenshot_ctime|format_timestamp_timeago}}</div>
<img style="max-width: 80%" id="screenshot-img" alt="Current screenshot from most recent request" >
<img style="max-width: 80%" id="screenshot-img" alt="Current screenshot from most recent request"/>
{% else %}
No screenshot available just yet! Try rechecking the page.
{% endif %}

View File

@@ -9,10 +9,10 @@
const email_notification_prefix=JSON.parse('{{emailprefix|tojson}}');
{% endif %}
</script>
<script src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='global-settings.js')}}" defer></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='global-settings.js')}}" defer></script>
<div class="edit-form">
<div class="tabs collapsable">
<ul>
@@ -26,7 +26,7 @@
</div>
<div class="box-wrap inner">
<form class="pure-form pure-form-stacked settings" action="{{url_for('settings_page')}}" method="POST">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" >
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<div class="tab-pane-inner" id="general">
<fieldset>
<div class="pure-control-group">
@@ -70,10 +70,6 @@
<a href="https://github.com/dgtlmoon/changedetection.io/wiki/Configurable-BASE_URL-setting">read more here</a>.
</span>
</div>
<div class="pure-control-group">
{{ render_field(form.application.form.pager_size) }}
<span class="pure-form-message-inline">Number of items per page in the watch overview list, 0 to disable.</span>
</div>
<div class="pure-control-group">
{{ render_checkbox_field(form.application.form.extract_title_as_title) }}

View File

@@ -1 +0,0 @@
<?xml version="1.0" encoding="utf-8"?><svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 122.879 119.799" enable-background="new 0 0 122.879 119.799" xml:space="preserve"><g><path d="M49.988,0h0.016v0.007C63.803,0.011,76.298,5.608,85.34,14.652c9.027,9.031,14.619,21.515,14.628,35.303h0.007v0.033v0.04 h-0.007c-0.005,5.557-0.917,10.905-2.594,15.892c-0.281,0.837-0.575,1.641-0.877,2.409v0.007c-1.446,3.66-3.315,7.12-5.547,10.307 l29.082,26.139l0.018,0.016l0.157,0.146l0.011,0.011c1.642,1.563,2.536,3.656,2.649,5.78c0.11,2.1-0.543,4.248-1.979,5.971 l-0.011,0.016l-0.175,0.203l-0.035,0.035l-0.146,0.16l-0.016,0.021c-1.565,1.642-3.654,2.534-5.78,2.646 c-2.097,0.111-4.247-0.54-5.971-1.978l-0.015-0.011l-0.204-0.175l-0.029-0.024L78.761,90.865c-0.88,0.62-1.778,1.209-2.687,1.765 c-1.233,0.755-2.51,1.466-3.813,2.115c-6.699,3.342-14.269,5.222-22.272,5.222v0.007h-0.016v-0.007 c-13.799-0.004-26.296-5.601-35.338-14.645C5.605,76.291,0.016,63.805,0.007,50.021H0v-0.033v-0.016h0.007 c0.004-13.799,5.601-26.296,14.645-35.338C23.683,5.608,36.167,0.016,49.955,0.007V0H49.988L49.988,0z M50.004,11.21v0.007h-0.016 h-0.033V11.21c-10.686,0.007-20.372,4.35-27.384,11.359C15.56,29.578,11.213,39.274,11.21,49.973h0.007v0.016v0.033H11.21 c0.007,10.686,4.347,20.367,11.359,27.381c7.009,7.012,16.705,11.359,27.403,11.361v-0.007h0.016h0.033v0.007 c10.686-0.007,20.368-4.348,27.382-11.359c7.011-7.009,11.358-16.702,11.36-27.4h-0.006v-0.016v-0.033h0.006 c-0.006-10.686-4.35-20.372-11.358-27.384C70.396,15.56,60.703,11.213,50.004,11.21L50.004,11.21z"/></g></svg>

View File

@@ -1,13 +1,14 @@
{% extends 'base.html' %}
{% block content %}
{% from '_helpers.jinja' import render_simple_field, render_field %}
<script src="{{url_for('static_content', group='js', filename='jquery-3.6.0.min.js')}}"></script>
<script src="{{url_for('static_content', group='js', filename='watch-overview.js')}}" defer></script>
{% from '_pagination.jinja' import pagination %}
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='jquery-3.6.0.min.js')}}"></script>
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='watch-overview.js')}}" defer></script>
<div class="box">
<form class="pure-form" action="{{ url_for('form_quick_watch_add') }}" method="POST" id="new-watch-form">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" >
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<fieldset>
<legend>Add a new change detection watch</legend>
<div id="watch-add-wrapper-zone">
@@ -25,26 +26,20 @@
</div>
</fieldset>
<span style="color:#eee; font-size: 80%;"><img alt="Create a shareable link" style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread-white.svg')}}" > Tip: You can also add 'shared' watches. <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">More info</a></span>
<span style="color:#eee; font-size: 80%;"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread-white.svg')}}" /> Tip: You can also add 'shared' watches. <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Sharing-a-Watch">More info</a></a></span>
</form>
<form class="pure-form" action="{{ url_for('form_watch_list_checkbox_operations') }}" method="POST" id="watch-list-form">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" >
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<div id="checkbox-operations">
<button class="pure-button button-secondary button-xsmall" name="op" value="pause">Pause</button>
<button class="pure-button button-secondary button-xsmall" name="op" value="unpause">UnPause</button>
<button class="pure-button button-secondary button-xsmall" name="op" value="mute">Mute</button>
<button class="pure-button button-secondary button-xsmall" name="op" value="unmute">UnMute</button>
<button class="pure-button button-secondary button-xsmall" name="op" value="recheck">Recheck</button>
<button class="pure-button button-secondary button-xsmall" name="op" value="mark-viewed">Mark viewed</button>
<button class="pure-button button-secondary button-xsmall" name="op" value="notification-default">Use default notification</button>
<button class="pure-button button-secondary button-xsmall" style="background: #dd4242;" name="op" value="clear-history">Clear/reset history</button>
<button class="pure-button button-secondary button-xsmall" style="background: #dd4242;" name="op" value="delete">Delete</button>
<button class="pure-button button-secondary button-xsmall" style="background: #dd4242; font-size: 70%" name="op" value="delete">Delete</button>
</div>
{% if watches|length >= pagination.per_page %}
{{ pagination.info }}
{% endif %}
{% if search_q %}<div id="search-result-info">Searching "<strong><i>{{search_q}}</i></strong>"</div>{% endif %}
<div>
<a href="{{url_for('index')}}" class="pure-button button-tag {{'active' if not active_tag }}">All</a>
{% for tag in tags %}
@@ -54,19 +49,18 @@
{% endfor %}
</div>
{% set sort_order = sort_order or 'asc' %}
{% set sort_attribute = sort_attribute or 'last_changed' %}
{% set sort_order = request.args.get('order', 'asc') == 'asc' %}
{% set sort_attribute = request.args.get('sort', 'last_changed') %}
{% set pagination_page = request.args.get('page', 0) %}
<div id="watch-table-wrapper">
<table class="pure-table pure-table-striped watch-table">
<thead>
<tr>
{% set link_order = "desc" if sort_order == 'asc' else "asc" %}
{% set arrow_span = "" %}
<th><input style="vertical-align: middle" type="checkbox" id="check-all" > <a class="{{ 'active '+link_order if sort_attribute == 'date_created' else 'inactive' }}" href="{{url_for('index', sort='date_created', order=link_order, tag=active_tag)}}"># <span class='arrow {{link_order}}'></span></a></th>
<th><input style="vertical-align: middle" type="checkbox" id="check-all"/> #</th>
<th></th>
{% set link_order = "desc" if sort_order else "asc" %}
{% set arrow_span = "" %}
<th><a class="{{ 'active '+link_order if sort_attribute == 'label' else 'inactive' }}" href="{{url_for('index', sort='label', order=link_order, tag=active_tag)}}">Website <span class='arrow {{link_order}}'></span></a></th>
<th><a class="{{ 'active '+link_order if sort_attribute == 'last_checked' else 'inactive' }}" href="{{url_for('index', sort='last_checked', order=link_order, tag=active_tag)}}">Last Checked <span class='arrow {{link_order}}'></span></a></th>
<th><a class="{{ 'active '+link_order if sort_attribute == 'last_changed' else 'inactive' }}" href="{{url_for('index', sort='last_changed', order=link_order, tag=active_tag)}}">Last Changed <span class='arrow {{link_order}}'></span></a></th>
@@ -74,12 +68,13 @@
</tr>
</thead>
<tbody>
{% if not watches|length %}
<tr>
<td colspan="6">No website watches configured, please add a URL in the box above, or <a href="{{ url_for('import_page')}}" >import a list</a>.</td>
</tr>
{% endif %}
{% for watch in (watches|sort(attribute=sort_attribute, reverse=sort_order == 'asc'))|pagination_slice(skip=pagination.skip) %}
{% set sorted_watches = watches|sort(attribute=sort_attribute, reverse=sort_order) %}
{% for watch in sorted_watches %}
{# WIP for pagination, disabled for now
{% if not ( loop.index >= 3 and loop.index <=4) %}{% continue %}{% endif %} -->
#}
<tr id="{{ watch.uuid }}"
class="{{ loop.cycle('pure-table-odd', 'pure-table-even') }} processor-{{ watch['processor'] }}
{% if watch.last_error is defined and watch.last_error != False %}error{% endif %}
@@ -87,26 +82,26 @@
{% if watch.paused is defined and watch.paused != False %}paused{% endif %}
{% if watch.newest_history_key| int > watch.last_viewed and watch.history_n>=2 %}unviewed{% endif %}
{% if watch.uuid in queued_uuids %}queued{% endif %}">
<td class="inline checkbox-uuid" ><input name="uuids" type="checkbox" value="{{ watch.uuid}} " > <span>{{ loop.index+pagination.skip }}</span></td>
<td class="inline checkbox-uuid" ><input name="uuids" type="checkbox" value="{{ watch.uuid}} "/> <span>{{ loop.index }}</span></td>
<td class="inline watch-controls">
{% if not watch.paused %}
<a class="state-off" href="{{url_for('index', op='pause', uuid=watch.uuid, tag=active_tag)}}"><img src="{{url_for('static_content', group='images', filename='pause.svg')}}" alt="Pause checks" title="Pause checks" class="icon icon-pause" ></a>
<a class="state-off" href="{{url_for('index', op='pause', uuid=watch.uuid, tag=active_tag)}}"><img src="{{url_for('static_content', group='images', filename='pause.svg')}}" alt="Pause checks" title="Pause checks" class="icon icon-pause"/></a>
{% else %}
<a class="state-on" href="{{url_for('index', op='pause', uuid=watch.uuid, tag=active_tag)}}"><img src="{{url_for('static_content', group='images', filename='play.svg')}}" alt="UnPause checks" title="UnPause checks" class="icon icon-unpause" ></a>
<a class="state-on" href="{{url_for('index', op='pause', uuid=watch.uuid, tag=active_tag)}}"><img src="{{url_for('static_content', group='images', filename='play.svg')}}" alt="UnPause checks" title="UnPause checks" class="icon icon-unpause"/></a>
{% endif %}
<a class="link-mute state-{{'on' if watch.notification_muted else 'off'}}" href="{{url_for('index', op='mute', uuid=watch.uuid, tag=active_tag)}}"><img src="{{url_for('static_content', group='images', filename='bell-off.svg')}}" alt="Mute notifications" title="Mute notifications" class="icon icon-mute" ></a>
<a class="link-mute state-{{'on' if watch.notification_muted else 'off'}}" href="{{url_for('index', op='mute', uuid=watch.uuid, tag=active_tag)}}"><img src="{{url_for('static_content', group='images', filename='bell-off.svg')}}" alt="Mute notifications" title="Mute notifications" class="icon icon-mute"/></a>
</td>
<td class="title-col inline">{{watch.title if watch.title is not none and watch.title|length > 0 else watch.url}}
<a class="external" target="_blank" rel="noopener" href="{{ watch.link.replace('source:','') }}"></a>
<a class="link-spread" href="{{url_for('form_share_put_watch', uuid=watch.uuid)}}"><img src="{{url_for('static_content', group='images', filename='spread.svg')}}" class="status-icon icon icon-spread" title="Create a link to share watch config with others" ></a>
<a class="link-spread" href="{{url_for('form_share_put_watch', uuid=watch.uuid)}}"><img class="status-icon" src="{{url_for('static_content', group='images', filename='spread.svg')}}" class="status-icon icon icon-spread" title="Create a link to share watch config with others" /></a>
{% if watch.get_fetch_backend == "html_webdriver"
or ( watch.get_fetch_backend == "system" and system_default_fetcher == 'html_webdriver' )
%}
<img class="status-icon" src="{{url_for('static_content', group='images', filename='Google-Chrome-icon.png')}}" title="Using a chrome browser" >
<img class="status-icon" src="{{url_for('static_content', group='images', filename='Google-Chrome-icon.png')}}" title="Using a chrome browser" />
{% endif %}
{%if watch.is_pdf %}<img class="status-icon" src="{{url_for('static_content', group='images', filename='pdf-icon.svg')}}" title="Converting PDF to text" >{% endif %}
{%if watch.is_pdf %}<img class="status-icon" src="{{url_for('static_content', group='images', filename='pdf-icon.svg')}}" title="Converting PDF to text" />{% endif %}
{% if watch.last_error is defined and watch.last_error != False %}
<div class="fetch-error">{{ watch.last_error }}
@@ -128,7 +123,7 @@
<div class="ldjson-price-track-offer">Embedded price data detected, follow only price data? <a href="{{url_for('price_data_follower.accept', uuid=watch.uuid)}}" class="pure-button button-xsmall">Yes</a> <a href="{{url_for('price_data_follower.reject', uuid=watch.uuid)}}" class="">No</a></div>
{% endif %}
{% if watch['track_ldjson_price_data'] == 'accepted' %}
<span class="tracking-ldjson-price-data" title="Automatically following embedded price information"><img src="{{url_for('static_content', group='images', filename='price-tag-icon.svg')}}" class="status-icon price-follow-tag-icon" > Price</span>
<span class="tracking-ldjson-price-data" title="Automatically following embedded price information"><img src="{{url_for('static_content', group='images', filename='price-tag-icon.svg')}}" class="status-icon price-follow-tag-icon"/> Price</span>
{% endif %}
{% endif %}
@@ -140,7 +135,6 @@
{% else %}
Not yet checked
{% endif %}
</span>
{% endif %}
{% if not active_tag %}
@@ -184,7 +178,10 @@
<a href="{{ url_for('rss', tag=active_tag , token=app_rss_token)}}"><img alt="RSS Feed" id="feed-icon" src="{{url_for('static_content', group='images', filename='Generic_Feed-icon.svg')}}" height="15"></a>
</li>
</ul>
{{ pagination.links }}
{# WIP for pagination, disabled for now
{{ pagination(sorted_watches,3, pagination_page) }}
#}
</div>
</form>
</div>
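
One side of this template reads sort and order from the query string, sorts the watches, and feeds them through a pagination_slice filter whose implementation is not part of this diff. A hedged sketch of what such a filter and the sort handling might look like; the bodies, the per_page default and the numeric sort key are assumptions, only the names come from the template:

```python
def pagination_slice(watches, skip, per_page=50):
    """Hypothetical Jinja filter: return one page of the already-sorted list."""
    return watches[skip:skip + per_page]

def sorted_watches_for_request(watches, args):
    """Apply the sort/order query parameters the template reads from request.args."""
    sort_attribute = args.get('sort', 'last_changed')
    reverse = args.get('order', 'asc') == 'asc'  # mirrors the template's comparison
    # Assumes a numeric attribute such as last_changed; sorting by label would need a string default
    return sorted(watches, key=lambda w: w.get(sort_attribute) or 0, reverse=reverse)
```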

View File

@@ -14,16 +14,13 @@ global app
def cleanup(datastore_path):
# Unlink test output files
files = [
'count.txt',
'endpoint-content.txt'
'headers.txt',
'headers-testtag.txt',
'notification.txt',
'secret.txt',
'url-watches.json',
'output.txt',
]
files = ['output.txt',
'url-watches.json',
'secret.txt',
'notification.txt',
'count.txt',
'endpoint-content.txt'
]
for file in files:
try:
os.unlink("{}/{}".format(datastore_path, file))

View File

@@ -3,7 +3,7 @@
import time
from flask import url_for, escape
from . util import live_server_setup, wait_for_all_checks
from . util import live_server_setup
import pytest
jq_support = True
@@ -64,24 +64,6 @@ and it can also be repeated
with pytest.raises(html_tools.JSONNotFound) as e_info:
html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "jq:.id")
def test_unittest_inline_extract_body():
content = """
<html>
<head></head>
<body>
<pre style="word-wrap: break-word; white-space: pre-wrap;">
{"testKey": 42}
</pre>
</body>
</html>
"""
from .. import html_tools
# See that we can find the second <script> one, which is not broken, and matches our filter
text = html_tools.extract_json_as_string(content, "json:$.testKey")
assert text == '42'
def set_original_ext_response():
data = """
[
@@ -454,37 +436,6 @@ def test_ignore_json_order(client, live_server):
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_correct_header_detect(client, live_server):
# Like in https://github.com/dgtlmoon/changedetection.io/pull/1593
# Specify extra html that JSON is sometimes wrapped in - when using Browserless/Puppeteer etc
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write('<html><body>{"hello" : 123, "world": 123}')
# Add our URL to the import page
# Check weird casing is cleaned up and detected also
test_url = url_for('test_endpoint', content_type="aPPlication/JSon", uppercase_headers=True, _external=True)
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
res = client.get(url_for("index"))
# Fixed in #1593
assert b'No parsable JSON found in this document' not in res.data
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
assert b'&#34;world&#34;:' in res.data
assert res.data.count(b'{') >= 2
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_check_jsonpath_ext_filter(client, live_server):
check_json_ext_filter('json:$[?(@.status==Sold)]', client, live_server)
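
The test_correct_header_detect test removed in this hunk covers two awkward cases: JSON delivered inside a thin HTML wrapper (as Browserless/Puppeteer return it) and Content-Type headers with odd casing such as aPPlication/JSon. A small sketch of the kind of case-insensitive check the fixture implies; this is not the project's actual detection code:

```python
def looks_like_json(content_type_header):
    """Case-insensitive Content-Type check, per the aPPlication/JSon fixture."""
    if not content_type_header:
        return False
    # Drop parameters such as '; charset=utf-8' and normalise the casing
    mime = content_type_header.split(';', 1)[0].strip().lower()
    return mime.endswith('/json') or mime.endswith('+json')

assert looks_like_json('aPPlication/JSon; charset=UTF-8')
```

Using endswith keeps the sketch loose enough to also accept types like application/ld+json without hard-coding a list.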

View File

@@ -1,8 +1,7 @@
import json
import os
import time
from flask import url_for
from . util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks, extract_UUID_from_client
from . util import set_original_response, set_modified_response, live_server_setup
def test_setup(live_server):
live_server_setup(live_server)
@@ -10,12 +9,8 @@ def test_setup(live_server):
# Hard to just add more live server URLs when one test is already running (I think)
# So we add our test here (was in a different file)
def test_headers_in_request(client, live_server):
#live_server_setup(live_server)
# Add our URL to the import page
test_url = url_for('test_headers', _external=True)
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
# Because its no longer calling back to localhost but from browserless, set in test-only.yml
test_url = test_url.replace('localhost', 'changedet')
# Add the test URL twice, we will check
res = client.post(
@@ -34,7 +29,7 @@ def test_headers_in_request(client, live_server):
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
time.sleep(3)
cookie_header = '_ga=GA1.2.1022228332; cookie-preferences=analytics:accepted;'
@@ -44,7 +39,7 @@ def test_headers_in_request(client, live_server):
data={
"url": test_url,
"tag": "",
"fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests',
"fetch_backend": "html_requests",
"headers": "xxx:ooo\ncool:yeah\r\ncookie:"+cookie_header},
follow_redirects=True
)
@@ -52,7 +47,7 @@ def test_headers_in_request(client, live_server):
# Give the thread time to pick up the first version
wait_for_all_checks(client)
time.sleep(5)
# The service should echo back the request headers
res = client.get(
@@ -68,7 +63,7 @@ def test_headers_in_request(client, live_server):
from html import escape
assert escape(cookie_header).encode('utf-8') in res.data
wait_for_all_checks(client)
time.sleep(5)
# Re #137 - Examine the JSON index file, it should have only one set of headers entered
watches_with_headers = 0
@@ -84,9 +79,6 @@ def test_headers_in_request(client, live_server):
def test_body_in_request(client, live_server):
# Add our URL to the import page
test_url = url_for('test_body', _external=True)
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
# Because its no longer calling back to localhost but from browserless, set in test-only.yml
test_url = test_url.replace('localhost', 'cdio')
res = client.post(
url_for("import_page"),
@@ -175,9 +167,6 @@ def test_body_in_request(client, live_server):
def test_method_in_request(client, live_server):
# Add our URL to the import page
test_url = url_for('test_method', _external=True)
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
# Because its no longer calling back to localhost but from browserless, set in test-only.yml
test_url = test_url.replace('localhost', 'cdio')
# Add the test URL twice, we will check
res = client.post(
@@ -245,76 +234,3 @@ def test_method_in_request(client, live_server):
# Should be only one with method set to PATCH
assert watches_with_method == 1
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_headers_textfile_in_request(client, live_server):
#live_server_setup(live_server)
# Add our URL to the import page
test_url = url_for('test_headers', _external=True)
if os.getenv('PLAYWRIGHT_DRIVER_URL'):
# Because its no longer calling back to localhost but from browserless, set in test-only.yml
test_url = test_url.replace('localhost', 'cdio')
print ("TEST URL IS ",test_url)
# Add the test URL twice, we will check
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
time.sleep(1)
# Add some headers to a request
res = client.post(
url_for("edit_page", uuid="first"),
data={
"url": test_url,
"tag": "testtag",
"fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests',
"headers": "xxx:ooo\ncool:yeah\r\n"},
follow_redirects=True
)
assert b"Updated watch." in res.data
wait_for_all_checks(client)
with open('test-datastore/headers-testtag.txt', 'w') as f:
f.write("tag-header: test")
with open('test-datastore/headers.txt', 'w') as f:
f.write("global-header: nice\r\nnext-global-header: nice")
with open('test-datastore/'+extract_UUID_from_client(client)+'/headers.txt', 'w') as f:
f.write("watch-header: nice")
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
wait_for_all_checks(client)
res = client.get(url_for("edit_page", uuid="first"))
assert b"Extra headers file found and will be added to this watch" in res.data
# Not needed anymore
os.unlink('test-datastore/headers.txt')
os.unlink('test-datastore/headers-testtag.txt')
os.unlink('test-datastore/'+extract_UUID_from_client(client)+'/headers.txt')
# The service should echo back the request verb
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
assert b"Global-Header:nice" in res.data
assert b"Next-Global-Header:nice" in res.data
assert b"Xxx:ooo" in res.data
assert b"Watch-Header:nice" in res.data
assert b"Tag-Header:test" in res.data
#unlink headers.txt on start/stop
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
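The removed test above exercises three layers of extra-header files (global, per-tag, per-watch) and expects all of them echoed back in the request. The merge logic itself is not part of this diff; a rough, self-contained sketch of the behaviour the assertions imply might look like the following — the function name and datastore layout are assumptions for illustration only.

# Rough sketch of the header-file merging behaviour the test above expects.
# The function name and datastore layout are assumptions; the real
# implementation is not shown in this diff.
import os

def collect_extra_headers(datastore_path, watch_uuid, tag):
    """Merge global, per-tag and per-watch header files into one dict."""
    candidates = [
        os.path.join(datastore_path, 'headers.txt'),              # global
        os.path.join(datastore_path, f'headers-{tag}.txt'),       # per tag
        os.path.join(datastore_path, watch_uuid, 'headers.txt'),  # per watch
    ]
    headers = {}
    for path in candidates:
        if not os.path.isfile(path):
            continue
        with open(path) as f:
            for line in f:
                if ':' in line:
                    name, value = line.split(':', 1)
                    headers[name.strip()] = value.strip()
    return headers

# e.g. collect_extra_headers('test-datastore', watch_uuid, 'testtag') would return
# {'global-header': 'nice', 'next-global-header': 'nice',
#  'tag-header': 'test', 'watch-header': 'nice'} for the files written above.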

View File

@@ -1,74 +0,0 @@
from flask import url_for
from .util import set_original_response, set_modified_response, live_server_setup
import time
def test_setup(live_server):
live_server_setup(live_server)
def test_basic_search(client, live_server):
#live_server_setup(live_server)
urls = ['https://localhost:12300?first-result=1',
'https://localhost:5000?second-result=1'
]
res = client.post(
url_for("import_page"),
data={"urls": "\r\n".join(urls)},
follow_redirects=True
)
assert b"2 Imported" in res.data
# By URL
res = client.get(url_for("index") + "?q=first-res")
assert urls[0].encode('utf-8') in res.data
assert urls[1].encode('utf-8') not in res.data
# By Title
res = client.post(
url_for("edit_page", uuid="first"),
data={"title": "xxx-title", "url": urls[0], "tag": "", "headers": "", 'fetch_backend': "html_requests"},
follow_redirects=True
)
assert b"Updated watch." in res.data
res = client.get(url_for("index") + "?q=xxx-title")
assert urls[0].encode('utf-8') in res.data
assert urls[1].encode('utf-8') not in res.data
def test_search_in_tag_limit(client, live_server):
#live_server_setup(live_server)
urls = ['https://localhost:12300?first-result=1 tag-one',
'https://localhost:5000?second-result=1 tag-two'
]
res = client.post(
url_for("import_page"),
data={"urls": "\r\n".join(urls)},
follow_redirects=True
)
assert b"2 Imported" in res.data
# By URL
res = client.get(url_for("index") + "?q=first-res")
# Split because of the import tag separation
assert urls[0].split(' ')[0].encode('utf-8') in res.data, urls[0].encode('utf-8')
assert urls[1].split(' ')[0].encode('utf-8') not in res.data, urls[0].encode('utf-8')
# By Title
res = client.post(
url_for("edit_page", uuid="first"),
data={"title": "xxx-title", "url": urls[0].split(' ')[0], "tag": urls[0].split(' ')[1], "headers": "",
'fetch_backend': "html_requests"},
follow_redirects=True
)
assert b"Updated watch." in res.data
res = client.get(url_for("index") + "?q=xxx-title")
assert urls[0].split(' ')[0].encode('utf-8') in res.data, urls[0].encode('utf-8')
assert urls[1].split(' ')[0].encode('utf-8') not in res.data, urls[0].encode('utf-8')
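The deleted file above covered the index `?q=` search box. As a rough illustration only (the real search code is not shown in this diff), the behaviour the test relied on amounts to a case-insensitive substring match over each watch's URL and title:

# Illustrative sketch of the matching the removed search test exercised.
# The data shape and function name here are assumptions for illustration.
def filter_watches(watches, q):
    q = q.lower()
    return {
        uuid: w for uuid, w in watches.items()
        if q in (w.get('url') or '').lower() or q in (w.get('title') or '').lower()
    }

watches = {
    'a': {'url': 'https://localhost:12300?first-result=1', 'title': 'xxx-title'},
    'b': {'url': 'https://localhost:5000?second-result=1', 'title': ''},
}
assert set(filter_watches(watches, 'first-res')) == {'a'}   # matches by URL
assert set(filter_watches(watches, 'xxx-title')) == {'a'}   # matches by title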

View File

@@ -119,26 +119,16 @@ def live_server_setup(live_server):
status_code = request.args.get('status_code')
content = request.args.get('content') or None
# Used to just try to break the header detection
uppercase_headers = request.args.get('uppercase_headers')
try:
if content is not None:
resp = make_response(content, status_code)
if uppercase_headers:
ctype=ctype.upper()
resp.headers['CONTENT-TYPE'] = ctype if ctype else 'text/html'
else:
resp.headers['Content-Type'] = ctype if ctype else 'text/html'
resp.headers['Content-Type'] = ctype if ctype else 'text/html'
return resp
# Tried using a global var here but didn't seem to work, so reading from a file instead.
with open("test-datastore/endpoint-content.txt", "r") as f:
resp = make_response(f.read(), status_code)
if uppercase_headers:
resp.headers['CONTENT-TYPE'] = ctype if ctype else 'text/html'
else:
resp.headers['Content-Type'] = ctype if ctype else 'text/html'
resp.headers['Content-Type'] = ctype if ctype else 'text/html'
return resp
except FileNotFoundError:
return make_response('', status_code)
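Because the removed and kept lines are interleaved above, here is a condensed sketch of the test endpoint as it reads after this change: the uppercase-header branch is gone and Content-Type is always set the same way. The route name, the content-type parameter name, the status-code default, and the app wiring are assumptions (they sit outside the shown hunk); only the body mirrors the diff.

# Condensed sketch of the simplified endpoint after this change. Names outside
# the shown hunk (route, ctype parameter, default status) are assumed.
from flask import Flask, request, make_response

app = Flask(__name__)

@app.route('/test-endpoint')
def test_endpoint():
    ctype = request.args.get('content_type')           # assumed parameter name
    status_code = request.args.get('status_code', 200)  # default added so the sketch runs standalone
    content = request.args.get('content') or None
    try:
        if content is not None:
            resp = make_response(content, status_code)
        else:
            # Reading the body from a file, as in the original helper
            with open("test-datastore/endpoint-content.txt", "r") as f:
                resp = make_response(f.read(), status_code)
        resp.headers['Content-Type'] = ctype if ctype else 'text/html'
        return resp
    except FileNotFoundError:
        return make_response('', status_code)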

View File

@@ -3,9 +3,8 @@ import threading
import queue
import time
from changedetectionio import content_fetcher
from .processors.text_json_diff import FilterNotFoundInResponse
from .processors.restock_diff import UnableToExtractRestockData
from .fetchers import exceptions
# A single update worker
#
@@ -190,6 +189,7 @@ class update_worker(threading.Thread):
processor = self.datastore.data['watching'][uuid].get('processor','text_json_diff')
# @todo some way to switch by name
update_handler = None
if processor == 'restock_diff':
update_handler = restock_diff.perform_site_check(datastore=self.datastore)
else:
@@ -205,7 +205,7 @@ class update_worker(threading.Thread):
except PermissionError as e:
self.app.logger.error("File permission error updating", uuid, str(e))
process_changedetection_results = False
except content_fetcher.ReplyWithContentButNoText as e:
except exceptions.ReplyWithContentButNoText as e:
# Totally fine, it's by choice - just continue on, nothing more to care about
# Page had elements/content but no renderable text
# Backend (not filters) gave zero output
@@ -214,7 +214,7 @@ class update_worker(threading.Thread):
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot)
process_changedetection_results = False
except content_fetcher.Non200ErrorCodeReceived as e:
except exceptions.Non200ErrorCodeReceived as e:
if e.status_code == 403:
err_text = "Error - 403 (Access denied) received"
elif e.status_code == 404:
@@ -238,7 +238,7 @@ class update_worker(threading.Thread):
if not self.datastore.data['watching'].get(uuid):
continue
err_text = "Warning, no filters were found, no change detection ran - Did the page change layout? update your Visual Filter if necessary."
err_text = "Warning, no filters were found, no change detection ran."
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
# Only when enabled, send the notification
@@ -258,13 +258,12 @@ class update_worker(threading.Thread):
process_changedetection_results = False
except content_fetcher.checksumFromPreviousCheckWasTheSame as e:
except exceptions.checksumFromPreviousCheckWasTheSame as e:
# Yes fine, so nothing todo, don't continue to process.
process_changedetection_results = False
changed_detected = False
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': False})
except content_fetcher.BrowserStepsStepTimout as e:
except exceptions.BrowserStepsStepTimout as e:
if not self.datastore.data['watching'].get(uuid):
continue
@@ -289,25 +288,25 @@ class update_worker(threading.Thread):
process_changedetection_results = False
except content_fetcher.EmptyReply as e:
except exceptions.EmptyReply as e:
# Some kind of custom to-str handler in the exception handler that does this?
err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
except content_fetcher.ScreenshotUnavailable as e:
except exceptions.ScreenshotUnavailable as e:
err_text = "Screenshot unavailable, page did not render fully in the expected time - try increasing 'Wait seconds before extracting text'"
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
except content_fetcher.JSActionExceptions as e:
except exceptions.JSActionExceptions as e:
err_text = "Error running JS Actions - Page request - "+e.message
if e.screenshot:
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot, as_error=True)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
except content_fetcher.PageUnloadable as e:
except exceptions.PageUnloadable as e:
err_text = "Page request from server didnt respond correctly"
if e.message:
err_text = "{} - {}".format(err_text, e.message)
@@ -316,13 +315,7 @@ class update_worker(threading.Thread):
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot, as_error=True)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code,
'has_ldjson_price_data': None})
process_changedetection_results = False
except UnableToExtractRestockData as e:
# Usually when fetcher.instock_data returns empty
self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': f"Unable to extract restock data for this page unfortunately. (Got code {e.status_code} from server)"})
'last_check_status': e.status_code})
process_changedetection_results = False
except Exception as e:
self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
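This hunk swaps every `content_fetcher.*` exception for the new `changedetectionio.fetchers.exceptions` module, which is not itself shown in this excerpt. Inferring only from the class names and the attributes update_worker reads off them (`status_code`, `screenshot`, `message`), a plausible minimal shape would be:

# Plausible minimal shape for changedetectionio/fetchers/exceptions.py, inferred
# only from the names and attributes referenced in update_worker.py above.
# The real module may differ; the shared base class is an assumption.

class FetcherException(Exception):
    """Base class carrying the fields update_worker reads off each exception."""
    def __init__(self, status_code=None, url=None, screenshot=None, message=''):
        self.status_code = status_code
        self.url = url
        self.screenshot = screenshot
        self.message = message
        super().__init__(message or self.__class__.__name__)

class ReplyWithContentButNoText(FetcherException): pass
class Non200ErrorCodeReceived(FetcherException): pass
class EmptyReply(FetcherException): pass
class ScreenshotUnavailable(FetcherException): pass
class JSActionExceptions(FetcherException): pass
class PageUnloadable(FetcherException): pass
class BrowserStepsStepTimout(FetcherException): pass
class checksumFromPreviousCheckWasTheSame(FetcherException): pass

A shared base class like this would let update_worker read the same attributes regardless of which fetcher raised, which is consistent with how the handlers above treat the exceptions interchangeably.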

View File

@@ -2,7 +2,6 @@ eventlet>=0.31.0
feedgen~=0.9
flask-compress
flask-login~=0.5
flask-paginate
flask_expects_json~=1.7
flask_restful
flask_wtf