Compare commits

..

3 Commits

Author    SHA1        Message                                                                                                                    Date
dgtlmoon  8578cc3582  Fix tuple                                                                                                                  2022-10-09 18:10:24 +02:00
dgtlmoon  b72d6f8dec  use brotli package                                                                                                         2022-10-09 18:10:14 +02:00
dgtlmoon  5b3f240846  Dont use default Requests user-agent and accept headers in playwright+selenium requests, breaks sites such as united.com.  2022-10-09 17:54:13 +02:00
44 changed files with 371 additions and 750 deletions


@@ -1,31 +0,0 @@
-# Taken from https://github.com/linuxserver/docker-changedetection.io/blob/main/Dockerfile
-# Test that we can still build on Alpine (musl modified libc https://musl.libc.org/)
-# Some packages wont install via pypi because they dont have a wheel available under this architecture.
-FROM ghcr.io/linuxserver/baseimage-alpine:3.16
-ENV PYTHONUNBUFFERED=1
-COPY requirements.txt /requirements.txt
-RUN \
-  apk add --update --no-cache --virtual=build-dependencies \
-    cargo \
-    g++ \
-    gcc \
-    libc-dev \
-    libffi-dev \
-    libxslt-dev \
-    make \
-    openssl-dev \
-    py3-wheel \
-    python3-dev \
-    zlib-dev && \
-  apk add --update --no-cache \
-    libxslt \
-    python3 \
-    py3-pip && \
-  echo "**** pip3 install test of changedetection.io ****" && \
-  pip3 install -U pip wheel setuptools && \
-  pip3 install -U --no-cache-dir --find-links https://wheel-index.linuxserver.io/alpine-3.16/ -r /requirements.txt && \
-  apk del --purge \
-    build-dependencies


@@ -1,66 +0,0 @@
-name: ChangeDetection.io Container Build Test
-# Triggers the workflow on push or pull request events
-# This line doesnt work, even tho it is the documented one
-#on: [push, pull_request]
-on:
-  push:
-    paths:
-      - requirements.txt
-      - Dockerfile
-  pull_request:
-    paths:
-      - requirements.txt
-      - Dockerfile
-# Changes to requirements.txt packages and Dockerfile may or may not always be compatible with arm etc, so worth testing
-# @todo: some kind of path filter for requirements.txt and Dockerfile
-jobs:
-  test-container-build:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - name: Set up Python 3.9
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.9
-      # Just test that the build works, some libraries won't compile on ARM/rPi etc
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v1
-        with:
-          image: tonistiigi/binfmt:latest
-          platforms: all
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v1
-        with:
-          install: true
-          version: latest
-          driver-opts: image=moby/buildkit:master
-      # https://github.com/dgtlmoon/changedetection.io/pull/1067
-      # Check we can still build under alpine/musl
-      - name: Test that the docker containers can build (musl via alpine check)
-        id: docker_build_musl
-        uses: docker/build-push-action@v2
-        with:
-          context: ./
-          file: ./.github/test/Dockerfile-alpine
-          platforms: linux/amd64,linux/arm64
-      - name: Test that the docker containers can build
-        id: docker_build
-        uses: docker/build-push-action@v2
-        # https://github.com/docker/build-push-action#customizing
-        with:
-          context: ./
-          file: ./Dockerfile
-          platforms: linux/arm/v7,linux/arm/v6,linux/amd64,linux/arm64,
-          cache-from: type=local,src=/tmp/.buildx-cache
-          cache-to: type=local,dest=/tmp/.buildx-cache


@@ -1,25 +1,28 @@
-name: ChangeDetection.io App Test
+name: ChangeDetection.io Test
 # Triggers the workflow on push or pull request events
 on: [push, pull_request]
 jobs:
-  test-application:
+  test-build:
     runs-on: ubuntu-latest
     steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.9
        uses: actions/setup-python@v2
        with:
          python-version: 3.9
+     - name: Show env vars
+       run: set
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install flake8 pytest
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
          if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
@@ -36,4 +39,7 @@ jobs:
          # Each test is totally isolated and performs its own cleanup/reset
          cd changedetectionio; ./run_all_tests.sh
+# https://github.com/docker/build-push-action/blob/master/docs/advanced/test-before-push.md ?
+# https://github.com/docker/buildx/issues/59 ? Needs to be one platform?
+# https://github.com/docker/buildx/issues/495#issuecomment-918925854


@@ -6,7 +6,7 @@ Otherwise, it's always best to PR into the `dev` branch.
 Please be sure that all new functionality has a matching test!
-Use `pytest` to validate/test, you can run the existing tests as `pytest tests/test_notification.py` for example
+Use `pytest` to validate/test, you can run the existing tests as `pytest tests/test_notifications.py` for example
 ```
 pip3 install -r requirements-dev


@@ -5,14 +5,13 @@ FROM python:3.8-slim as builder
 ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    g++ \
+    libssl-dev \
+    libffi-dev \
     gcc \
     libc-dev \
-    libffi-dev \
-    libssl-dev \
     libxslt-dev \
-    make \
-    zlib1g-dev
+    zlib1g-dev \
+    g++
 RUN mkdir /install
 WORKDIR /install
@@ -23,7 +22,6 @@ RUN pip install --target=/dependencies -r /requirements.txt
 # Playwright is an alternative to Selenium
 # Excluded this package from requirements.txt to prevent arm/v6 and arm/v7 builds from failing
-# https://github.com/dgtlmoon/changedetection.io/pull/1067 also musl/alpine (not supported)
 RUN pip install --target=/dependencies playwright~=1.26 \
     || echo "WARN: Failed to install Playwright. The application can still run, but the Playwright option will be disabled."
@@ -60,7 +58,6 @@ EXPOSE 5000
 # The actual flask app
 COPY changedetectionio /app/changedetectionio
 # The eventlet server wrapper
 COPY changedetection.py /app/changedetection.py


@@ -2,7 +2,6 @@ recursive-include changedetectionio/api *
 recursive-include changedetectionio/templates *
 recursive-include changedetectionio/static *
 recursive-include changedetectionio/model *
-recursive-include changedetectionio/tests *
 include changedetection.py
 global-exclude *.pyc
 global-exclude node_modules


@@ -1,7 +1,6 @@
 ## Web Site Change Detection, Monitoring and Notification.
-_Live your data-life pro-actively, Detect website changes and perform meaningful actions, trigger notifications via Discord, Email, Slack, Telegram, API calls and many more._
+Live your data-life pro-actively, track website content changes and receive notifications via Discord, Email, Slack, Telegram and 70+ more
 [<img src="https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/screenshot.png" style="max-width:100%;" alt="Self-hosted web page change monitoring" title="Self-hosted web page change monitoring" />](https://lemonade.changedetection.io/start?src=github)
@@ -9,6 +8,8 @@ _Live your data-life pro-actively, Detect website changes and perform meaningful
 ![changedetection.io](https://github.com/dgtlmoon/changedetection.io/actions/workflows/test-only.yml/badge.svg?branch=master)
+Know when important content changes, we support notifications via Discord, Telegram, Home-Assistant, Slack, Email and 70+ more
 [**Don't have time? Let us host it for you! try our $6.99/month subscription - use our proxies and support!**](https://lemonade.changedetection.io/start) , _half the price of other website change monitoring services and comes with unlimited watches & checks!_
 - Chrome browser included.
@@ -120,8 +121,8 @@ See the wiki for more information https://github.com/dgtlmoon/changedetection.io
 ## Filters
 XPath, JSONPath, jq, and CSS support comes baked in! You can be as specific as you need, use XPath exported from various XPath element query creation tools.
 (We support LXML `re:test`, `re:math` and `re:replace`.)
 ## Notifications
@@ -160,11 +161,46 @@ This will re-parse the JSON and apply formatting to the text, making it super ea
 ### JSONPath or jq?
-For more complex parsing, filtering, and modifying of JSON data, jq is recommended due to the built-in operators and functions. Refer to the [documentation](https://stedolan.github.io/jq/manual/) for more specifc information on jq.
+For more complex parsing, filtering, and modifying of JSON data, jq is recommended due to the built-in operators and functions. Refer to the [documentation](https://stedolan.github.io/jq/manual/) for more information on jq.
+One big advantage of `jq` is that you can use logic in your JSON filter, such as filters to only show items that have a value greater than/less than etc. The example below adds the price in dollars to each item in the JSON data, and then filters to only show items that are greater than 10.
-See the wiki https://github.com/dgtlmoon/changedetection.io/wiki/JSON-Selector-Filter-help for more information and examples
+#### Sample input data from API
+```
+{
+  "items": [
+    {
+      "name": "Product A",
+      "priceInCents": 2500
+    },
+    {
+      "name": "Product B",
+      "priceInCents": 500
+    },
+    {
+      "name": "Product C",
+      "priceInCents": 2000
+    }
+  ]
+}
+```
+#### Sample jq
+`jq:.items[] | . + { "priceInDollars": (.priceInCents / 100) } | select(.priceInDollars > 10)`
+#### Sample output data
+```
+{
+  "name": "Product A",
+  "priceInCents": 2500,
+  "priceInDollars": 25
+}
+{
+  "name": "Product C",
+  "priceInCents": 2000,
+  "priceInDollars": 20
+}
+```
 ### Parse JSON embedded in HTML!
@@ -180,9 +216,9 @@ When you enable a `json:` or `jq:` filter, you can even automatically extract an
 `json:$.price` or `jq:.price` would give `23.50`, or you can extract the whole structure
-## Proxy Configuration
+## Proxy configuration
-See the wiki https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration , we also support using [BrightData proxy services where possible]( https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration#brightdata-proxy-support)
+See the wiki https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration
 ## Raspberry Pi support?
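The jq filter in the README sample above can be tried standalone with the Python `jq` bindings, using the same `jq.compile(...).input(...).all()` calls that appear in the `html_tools.py` diff further down. A minimal sketch, assuming `pip install jq` (which needs a compiler on Windows, hence the app's optional import):

```python
import jq  # pip install jq

data = {
    "items": [
        {"name": "Product A", "priceInCents": 2500},
        {"name": "Product B", "priceInCents": 500},
        {"name": "Product C", "priceInCents": 2000},
    ]
}

# Same filter as the README sample, minus the 'jq:' prefix the app strips off
expr = jq.compile('.items[] | . + { "priceInDollars": (.priceInCents / 100) } | select(.priceInDollars > 10)')

for match in expr.input(data).all():
    print(match)  # Product A and Product C survive the > 10 filter; Product B is dropped
```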


@@ -33,7 +33,7 @@ from flask_wtf import CSRFProtect
 from changedetectionio import html_tools
 from changedetectionio.api import api_v1

-__version__ = '0.39.21.1'
+__version__ = '0.39.20'

 datastore = None
@@ -194,8 +194,7 @@ def changedetection_app(config=None, datastore_o=None):
     watch_api.add_resource(api_v1.Watch, '/api/v1/watch/<string:uuid>',
                            resource_class_kwargs={'datastore': datastore, 'update_q': update_q})

-    watch_api.add_resource(api_v1.SystemInfo, '/api/v1/systeminfo',
-                           resource_class_kwargs={'datastore': datastore, 'update_q': update_q})
@@ -599,7 +598,7 @@ def changedetection_app(config=None, datastore_o=None):
             extra_update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)

         # Reset the previous_md5 so we process a new snapshot including stripping ignore text.
-        if form.include_filters.data != datastore.data['watching'][uuid].get('include_filters', []):
+        if form.css_filter.data.strip() != datastore.data['watching'][uuid]['css_filter']:
             if len(datastore.data['watching'][uuid].history):
                 extra_update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
@@ -637,27 +636,20 @@ def changedetection_app(config=None, datastore_o=None):
         # Only works reliably with Playwright
         visualselector_enabled = os.getenv('PLAYWRIGHT_DRIVER_URL', False) and default['fetch_backend'] == 'html_webdriver'

-        # JQ is difficult to install on windows and must be manually added (outside requirements.txt)
-        jq_support = True
-        try:
-            import jq
-        except ModuleNotFoundError:
-            jq_support = False

         output = render_template("edit.html",
+                                 uuid=uuid,
+                                 watch=datastore.data['watching'][uuid],
+                                 form=form,
+                                 has_empty_checktime=using_default_check_time,
+                                 has_default_notification_urls=True if len(datastore.data['settings']['application']['notification_urls']) else False,
+                                 using_global_webdriver_wait=default['webdriver_delay'] is None,
                                  current_base_url=datastore.data['settings']['application']['base_url'],
                                  emailprefix=os.getenv('NOTIFICATION_MAIL_BUTTON_PREFIX', False),
-                                 form=form,
-                                 has_default_notification_urls=True if len(datastore.data['settings']['application']['notification_urls']) else False,
-                                 has_empty_checktime=using_default_check_time,
-                                 jq_support=jq_support,
-                                 playwright_enabled=os.getenv('PLAYWRIGHT_DRIVER_URL', False),
                                  settings_application=datastore.data['settings']['application'],
-                                 using_global_webdriver_wait=default['webdriver_delay'] is None,
-                                 uuid=uuid,
                                  visualselector_data_is_ready=visualselector_data_is_ready,
                                  visualselector_enabled=visualselector_enabled,
-                                 watch=datastore.data['watching'][uuid],
+                                 playwright_enabled=os.getenv('PLAYWRIGHT_DRIVER_URL', False)
                                  )

         return output
@@ -817,10 +809,8 @@ def changedetection_app(config=None, datastore_o=None):
         newest_file = history[dates[-1]]

-        # Read as binary and force decode as UTF-8
-        # Windows may fail decode in python if we just use 'r' mode (chardet decode exception)
         try:
-            with open(newest_file, 'r', encoding='utf-8', errors='ignore') as f:
+            with open(newest_file, 'r') as f:
                 newest_version_file_contents = f.read()
         except Exception as e:
             newest_version_file_contents = "Unable to read {}.\n".format(newest_file)
@@ -833,7 +823,7 @@ def changedetection_app(config=None, datastore_o=None):
             previous_file = history[dates[-2]]
             try:
-                with open(previous_file, 'r', encoding='utf-8', errors='ignore') as f:
+                with open(previous_file, 'r') as f:
                     previous_version_file_contents = f.read()
             except Exception as e:
                 previous_version_file_contents = "Unable to read {}.\n".format(previous_file)
@@ -910,7 +900,7 @@ def changedetection_app(config=None, datastore_o=None):
         timestamp = list(watch.history.keys())[-1]
         filename = watch.history[timestamp]
         try:
-            with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
+            with open(filename, 'r') as f:
                 tmp = f.readlines()

                 # Get what needs to be highlighted
@@ -985,6 +975,9 @@ def changedetection_app(config=None, datastore_o=None):
         # create a ZipFile object
         backupname = "changedetection-backup-{}.zip".format(int(time.time()))

+        # We only care about UUIDS from the current index file
+        uuids = list(datastore.data['watching'].keys())
         backup_filepath = os.path.join(datastore_o.datastore_path, backupname)

         with zipfile.ZipFile(backup_filepath, "w",
@@ -1000,12 +993,12 @@ def changedetection_app(config=None, datastore_o=None):
             # Add the flask app secret
             zipObj.write(os.path.join(datastore_o.datastore_path, "secret.txt"), arcname="secret.txt")

-            # Add any data in the watch data directory.
-            for uuid, w in datastore.data['watching'].items():
-                for f in Path(w.watch_data_dir).glob('*'):
-                    zipObj.write(f,
-                                 # Use the full path to access the file, but make the file 'relative' in the Zip.
-                                 arcname=os.path.join(f.parts[-2], f.parts[-1]),
+            # Add any snapshot data we find, use the full path to access the file, but make the file 'relative' in the Zip.
+            for txt_file_path in Path(datastore_o.datastore_path).rglob('*.txt'):
+                parent_p = txt_file_path.parent
+                if parent_p.name in uuids:
+                    zipObj.write(txt_file_path,
+                                 arcname=str(txt_file_path).replace(datastore_o.datastore_path, ''),
                                  compress_type=zipfile.ZIP_DEFLATED,
                                  compresslevel=8)
@@ -1307,8 +1300,8 @@ def changedetection_app(config=None, datastore_o=None):
     threading.Thread(target=notification_runner).start()

-    # Check for new release version, but not when running in test/build or pytest
-    if not os.getenv("GITHUB_REF", False) and not config.get('disable_checkver') == True:
+    # Check for new release version, but not when running in test/build
+    if not os.getenv("GITHUB_REF", False):
         threading.Thread(target=check_for_new_version).start()

     return app
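The `encoding='utf-8', errors='ignore'` side of the file-read hunks above is what keeps the /diff/ and /preview/ views from crashing when a snapshot contains bytes the platform codec cannot decode. A minimal sketch of the difference; the snapshot path here is hypothetical:

```python
snapshot = "datastore/some-watch-uuid/snapshot.txt"  # hypothetical path for illustration

# Plain open(snapshot, 'r') uses the platform default encoding (often cp1252 on
# Windows), so byte sequences that are invalid in that codec raise UnicodeDecodeError.
# Forcing UTF-8 and ignoring undecodable bytes always returns *something* readable:
with open(snapshot, 'r', encoding='utf-8', errors='ignore') as f:
    contents = f.read()
```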


@@ -122,37 +122,3 @@ class CreateWatch(Resource):
             return {'status': "OK"}, 200

         return list, 200

-class SystemInfo(Resource):
-    def __init__(self, **kwargs):
-        # datastore is a black box dependency
-        self.datastore = kwargs['datastore']
-        self.update_q = kwargs['update_q']
-
-    @auth.check_token
-    def get(self):
-        import time
-        overdue_watches = []
-
-        # Check all watches and report which have not been checked but should have been
-        for uuid, watch in self.datastore.data.get('watching', {}).items():
-            # see if now - last_checked is greater than the time that should have been
-            # this is not super accurate (maybe they just edited it) but better than nothing
-            t = watch.threshold_seconds()
-            if not t:
-                # Use the system wide default
-                t = self.datastore.threshold_seconds
-
-            time_since_check = time.time() - watch.get('last_checked')
-
-            # Allow 5 minutes of grace time before we decide it's overdue
-            if time_since_check - (5 * 60) > t:
-                overdue_watches.append(uuid)
-
-        return {
-            'queue_size': self.update_q.qsize(),
-            'overdue_watches': overdue_watches,
-            'uptime': round(time.time() - self.datastore.start_time, 2),
-            'watch_count': len(self.datastore.data.get('watching', {}))
-        }, 200
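The `SystemInfo` resource removed above is mounted at `/api/v1/systeminfo` (see the `add_resource` call in the `__init__.py` diff earlier). A hedged usage sketch; the local URL/port and the `x-api-key` header name are assumptions based on the app's token-auth API, not something confirmed by this diff:

```python
import requests

# Assumes a local instance on the default port 5000 and token auth via 'x-api-key'
resp = requests.get(
    "http://localhost:5000/api/v1/systeminfo",
    headers={"x-api-key": "YOUR_API_KEY"},
)
print(resp.json())
# e.g. {'queue_size': 0, 'overdue_watches': [], 'uptime': 38.42, 'watch_count': 12}
```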


@@ -102,14 +102,6 @@ def main():
         has_password=datastore.data['settings']['application']['password'] != False
     )

-    # Monitored websites will not receive a Referer header when a user clicks on an outgoing link.
-    # @Note: Incompatible with password login (and maybe other features) for now, submit a PR!
-    @app.after_request
-    def hide_referrer(response):
-        if os.getenv("HIDE_REFERER", False):
-            response.headers["Referrer-Policy"] = "no-referrer"
-        return response
-
     # Proxy sub-directory support
     # Set environment var USE_X_SETTINGS=1 on this script
     # And then in your proxy_pass settings


@@ -164,16 +164,16 @@ class Fetcher():
            }

-           // inject the current one set in the include_filters, which may be a CSS rule
+           // inject the current one set in the css_filter, which may be a CSS rule
            // used for displaying the current one in VisualSelector, where its not one we generated.
-           if (include_filters.length) {
+           if (css_filter.length) {
               q=false;
               try {
                   // is it xpath?
-                  if (include_filters.startsWith('/') || include_filters.startsWith('xpath:')) {
-                      q=document.evaluate(include_filters.replace('xpath:',''), document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
+                  if (css_filter.startsWith('/') || css_filter.startsWith('xpath:')) {
+                      q=document.evaluate(css_filter.replace('xpath:',''), document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                   } else {
-                      q=document.querySelector(include_filters);
+                      q=document.querySelector(css_filter);
                   }
               } catch (e) {
                   // Maybe catch DOMException and alert?
@@ -186,7 +186,7 @@ class Fetcher():
            if (bbox && bbox['width'] >0 && bbox['height']>0) {
               size_pos.push({
-                  xpath: include_filters,
+                  xpath: css_filter,
                  width: bbox['width'],
                  height: bbox['height'],
                  left: bbox['left'],
@@ -220,7 +220,7 @@ class Fetcher():
             request_body,
             request_method,
             ignore_status_codes=False,
-            current_include_filters=None):
+            current_css_filter=None):
         # Should set self.error, self.status_code and self.content
         pass
@@ -310,7 +310,7 @@ class base_html_playwright(Fetcher):
             request_body,
             request_method,
             ignore_status_codes=False,
-            current_include_filters=None):
+            current_css_filter=None):
         from playwright.sync_api import sync_playwright
         import playwright._impl._api_types
@@ -413,10 +413,10 @@ class base_html_playwright(Fetcher):
            self.status_code = response.status
            self.headers = response.all_headers()

-           if current_include_filters is not None:
-               page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
+           if current_css_filter is not None:
+               page.evaluate("var css_filter={}".format(json.dumps(current_css_filter)))
            else:
-               page.evaluate("var include_filters=''")
+               page.evaluate("var css_filter=''")

            self.xpath_data = page.evaluate("async () => {" + self.xpath_element_js + "}")
@@ -497,7 +497,7 @@ class base_html_webdriver(Fetcher):
             request_body,
             request_method,
             ignore_status_codes=False,
-            current_include_filters=None):
+            current_css_filter=None):
         from selenium import webdriver
         from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
@@ -573,7 +573,7 @@ class html_requests(Fetcher):
             request_body,
             request_method,
             ignore_status_codes=False,
-            current_include_filters=None):
+            current_css_filter=None):
         # Make requests use a more modern looking user-agent
         if not 'User-Agent' in request_headers:
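Both sides of the rename above inject the Python-side filter value into the page through `json.dumps()`, which produces a JS-safe literal so selectors containing quotes survive the round trip. A minimal standalone Playwright sketch of that idea; the URL and selector list are placeholders, not values from this diff:

```python
import json
from playwright.sync_api import sync_playwright

include_filters = ["#price", "xpath://div[@class='inner']"]  # hypothetical watch filters

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto("https://example.com")
    # json.dumps() escapes quotes/backslashes, so the value lands in the page intact
    page.evaluate("var include_filters={}".format(json.dumps(include_filters)))
    print(page.evaluate("include_filters.length"))  # 2
    browser.close()
```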


@@ -10,11 +10,6 @@ from changedetectionio import content_fetcher, html_tools
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

-class FilterNotFoundInResponse(ValueError):
-    def __init__(self, msg):
-        ValueError.__init__(self, msg)
-
 # Some common stuff here that can be moved to a base class
 # (set_proxy_from_list)
 class perform_site_check():
@@ -38,20 +33,18 @@ class perform_site_check():
         return regex

     def run(self, uuid):
-        from copy import deepcopy
         changed_detected = False
         screenshot = False  # as bytes
         stripped_text_from_html = ""

-        # DeepCopy so we can be sure we don't accidently change anything by reference
-        watch = deepcopy(self.datastore.data['watching'].get(uuid))
+        watch = self.datastore.data['watching'].get(uuid)
         if not watch:
             return

         # Protect against file:// access
-        if re.search(r'^file', watch.get('url', ''), re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False):
+        if re.search(r'^file', watch['url'], re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False):
             raise Exception(
                 "file:// type access is denied for security reasons."
             )
@@ -59,10 +52,10 @@ class perform_site_check():
         # Unset any existing notification error
         update_obj = {'last_notification_error': False, 'last_error': False}

-        extra_headers = watch.get('headers', [])
+        extra_headers = self.datastore.data['watching'][uuid].get('headers')

         # Tweak the base config with the per-watch ones
-        request_headers = deepcopy(self.datastore.data['settings']['headers'])
+        request_headers = self.datastore.data['settings']['headers'].copy()
         request_headers.update(extra_headers)

         # https://github.com/psf/requests/issues/4525
@@ -72,9 +65,7 @@ class perform_site_check():
             request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '')

         timeout = self.datastore.data['settings']['requests'].get('timeout')
-        url = watch.link
+        url = watch.get('url')
         request_body = self.datastore.data['watching'][uuid].get('body')
         request_method = self.datastore.data['watching'][uuid].get('method')
         ignore_status_codes = self.datastore.data['watching'][uuid].get('ignore_status_codes', False)
@@ -86,7 +77,7 @@ class perform_site_check():
             is_source = True

         # Pluggable content fetcher
-        prefer_backend = watch.get('fetch_backend')
+        prefer_backend = watch['fetch_backend']
         if hasattr(content_fetcher, prefer_backend):
             klass = getattr(content_fetcher, prefer_backend)
         else:
@@ -97,21 +88,21 @@ class perform_site_check():
         proxy_url = None
         if proxy_id:
             proxy_url = self.datastore.proxy_list.get(proxy_id).get('url')
-            print("UUID {} Using proxy {}".format(uuid, proxy_url))
+            print ("UUID {} Using proxy {}".format(uuid, proxy_url))

         fetcher = klass(proxy_override=proxy_url)

         # Configurable per-watch or global extra delay before extracting text (for webDriver types)
         system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None)
         if watch['webdriver_delay'] is not None:
-            fetcher.render_extract_delay = watch.get('webdriver_delay')
+            fetcher.render_extract_delay = watch['webdriver_delay']
         elif system_webdriver_delay is not None:
             fetcher.render_extract_delay = system_webdriver_delay

-        if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip():
-            fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code')
+        if watch['webdriver_js_execute_code'] is not None and watch['webdriver_js_execute_code'].strip():
+            fetcher.webdriver_js_execute_code = watch['webdriver_js_execute_code']

-        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'))
+        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch['css_filter'])
         fetcher.quit()

         self.screenshot = fetcher.screenshot
@@ -135,30 +126,28 @@ class perform_site_check():
         is_html = False
         is_json = False

-        include_filters_rule = watch.get('include_filters', [])
-        # include_filters_rule = watch['include_filters']
+        css_filter_rule = watch['css_filter']
         subtractive_selectors = watch.get(
             "subtractive_selectors", []
         ) + self.datastore.data["settings"]["application"].get(
             "global_subtractive_selectors", []
         )

-        has_filter_rule = include_filters_rule and len("".join(include_filters_rule).strip())
+        has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
         has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())

         if is_json and not has_filter_rule:
-            include_filters_rule.append("json:$")
+            css_filter_rule = "json:$"
             has_filter_rule = True

         if has_filter_rule:
             json_filter_prefixes = ['json:', 'jq:']
-            for filter in include_filters_rule:
-                if any(prefix in filter for prefix in json_filter_prefixes):
-                    stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
-                    is_html = False
+            if any(prefix in css_filter_rule for prefix in json_filter_prefixes):
+                stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, json_filter=css_filter_rule)
+                is_html = False

         if is_html or is_source:
-            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
             fetcher.content = html_tools.workarounds_for_obfuscations(fetcher.content)
             html_content = fetcher.content
@@ -170,36 +159,33 @@ class perform_site_check():
             else:
                 # Then we assume HTML
                 if has_filter_rule:
-                    html_content = ""
-                    for filter_rule in include_filters_rule:
-                        # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
-                        if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
-                            html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
-                                                                    html_content=fetcher.content,
-                                                                    append_pretty_line_formatting=not is_source)
-                        else:
-                            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
-                            html_content += html_tools.include_filters(include_filters=filter_rule,
-                                                                       html_content=fetcher.content,
-                                                                       append_pretty_line_formatting=not is_source)
-
-                    if not html_content.strip():
-                        raise FilterNotFoundInResponse(include_filters_rule)
+                    # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
+                    if css_filter_rule[0] == '/' or css_filter_rule.startswith('xpath:'):
+                        html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule.replace('xpath:', ''),
+                                                               html_content=fetcher.content)
+                    else:
+                        # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
+                        html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)

                 if has_subtractive_selectors:
                     html_content = html_tools.element_removal(subtractive_selectors, html_content)

-                if is_source:
-                    stripped_text_from_html = html_content
-                else:
+                if not is_source:
                     # extract text
-                    do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
                     stripped_text_from_html = \
                         html_tools.html_to_text(
                             html_content,
-                            render_anchor_tag_content=do_anchor
+                            render_anchor_tag_content=self.datastore.data["settings"][
+                                "application"].get(
+                                "render_anchor_tag_content", False)
                         )
+                elif is_source:
+                    stripped_text_from_html = html_content
+
+                # Re #340 - return the content before the 'ignore text' was applied
+                text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')

         # Re #340 - return the content before the 'ignore text' was applied
         text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
@@ -232,7 +218,7 @@ class perform_site_check():
             for l in result:
                 if type(l) is tuple:
-                    # @todo - some formatter option default (between groups)
+                    #@todo - some formatter option default (between groups)
                     regex_matched_output += list(l) + [b'\n']
                 else:
                     # @todo - some formatter option default (between each ungrouped result)
@@ -246,6 +232,7 @@ class perform_site_check():
             stripped_text_from_html = b''.join(regex_matched_output)
             text_content_before_ignored_filter = stripped_text_from_html

+        # Re #133 - if we should strip whitespaces from triggering the change detected comparison
         if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
             fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
@@ -255,30 +242,29 @@ class perform_site_check():
         ############ Blocking rules, after checksum  #################
         blocked = False

-        trigger_text = watch.get('trigger_text', [])
-        if len(trigger_text):
+        if len(watch['trigger_text']):
             # Assume blocked
             blocked = True
             # Filter and trigger works the same, so reuse it
             # It should return the line numbers that match
             result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
-                                                  wordlist=trigger_text,
+                                                  wordlist=watch['trigger_text'],
                                                   mode="line numbers")
             # Unblock if the trigger was found
             if result:
                 blocked = False

-        text_should_not_be_present = watch.get('text_should_not_be_present', [])
-        if len(text_should_not_be_present):
+        if len(watch['text_should_not_be_present']):
             # If anything matched, then we should block a change from happening
             result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
-                                                  wordlist=text_should_not_be_present,
+                                                  wordlist=watch['text_should_not_be_present'],
                                                   mode="line numbers")
             if result:
                 blocked = True

         # The main thing that all this at the moment comes down to :)
-        if watch.get('previous_md5') != fetched_md5:
+        if watch['previous_md5'] != fetched_md5:
             changed_detected = True

             # Looks like something changed, but did it match all the rules?
@@ -287,7 +273,7 @@ class perform_site_check():
         # Extract title as title
         if is_html:
-            if self.datastore.data['settings']['application'].get('extract_title_as_title') or watch['extract_title_as_title']:
+            if self.datastore.data['settings']['application']['extract_title_as_title'] or watch['extract_title_as_title']:
                 if not watch['title'] or not len(watch['title']):
                     update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)
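The left-hand side above iterates a *list* of filters and concatenates whatever each one matched, instead of applying a single `css_filter` string. A rough standalone sketch of that loop using BeautifulSoup; the HTML and filter values are hypothetical, and the real code raises its own `FilterNotFoundInResponse` where this sketch raises `ValueError`:

```python
from bs4 import BeautifulSoup

html = "<div id='price'>10.99</div><div id='stock'>In stock</div><div id='ad'>...</div>"
include_filters = ["#price", "#stock"]  # hypothetical watch configuration

html_blocks = ""
for rule in include_filters:
    soup = BeautifulSoup(html, "html.parser")
    # Each rule's matches are appended, so several CSS rules feed one text snapshot
    for element in soup.select(rule):
        html_blocks += str(element)

if not html_blocks.strip():
    raise ValueError("None of the filters matched anything in the response")

print(html_blocks)  # <div id="price">10.99</div><div id="stock">In stock</div>
```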


@@ -303,16 +303,12 @@ class ValidateCSSJSONXPATHInput(object):
         # Re #265 - maybe in the future fetch the page and offer a
         # warning/notice that its possible the rule doesnt yet match anything?
+        if 'jq:' in line:
             if not self.allow_json:
                 raise ValidationError("jq not permitted in this field!")
-            if 'jq:' in line:
-                try:
-                    import jq
-                except ModuleNotFoundError:
-                    # `jq` requires full compilation in windows and so isn't generally available
-                    raise ValidationError("jq not support not found")
+            import jq

             input = line.replace('jq:', '')
             try:
@@ -349,7 +345,7 @@ class watchForm(commonSettingsForm):
     time_between_check = FormField(TimeBetweenCheckForm)

-    include_filters = StringListField('CSS/JSONPath/JQ/XPath Filters', [ValidateCSSJSONXPATHInput()], default='')
+    css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()], default='')
     subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])


@@ -1,36 +1,33 @@
+import json
+from typing import List
 from bs4 import BeautifulSoup
+from jsonpath_ng.ext import parse
+import jq
+import re
 from inscriptis import get_text
 from inscriptis.model.config import ParserConfig
-from jsonpath_ng.ext import parse
-from typing import List
-import json
-import re

-# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
-TEXT_FILTER_LIST_LINE_SUFFIX = "<br/>"
+class FilterNotFoundInResponse(ValueError):
+    def __init__(self, msg):
+        ValueError.__init__(self, msg)

 class JSONNotFound(ValueError):
     def __init__(self, msg):
         ValueError.__init__(self, msg)

 # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
-def include_filters(include_filters, html_content, append_pretty_line_formatting=False):
+def css_filter(css_filter, html_content):
     soup = BeautifulSoup(html_content, "html.parser")
     html_block = ""
-    r = soup.select(include_filters, separator="")
+    r = soup.select(css_filter, separator="")
+    if len(html_content) > 0 and len(r) == 0:
+        raise FilterNotFoundInResponse(css_filter)
+    for item in r:
+        html_block += str(item)

-    for element in r:
-        # When there's more than 1 match, then add the suffix to separate each line
-        # And where the matched result doesn't include something that will cause Inscriptis to add a newline
-        # (This way each 'match' reliably has a new-line in the diff)
-        # Divs are converted to 4 whitespaces by inscriptis
-        if append_pretty_line_formatting and len(html_block) and not element.name in (['br', 'hr', 'div', 'p']):
-            html_block += TEXT_FILTER_LIST_LINE_SUFFIX
-        html_block += str(element)
-
-    return html_block
+    return html_block + "\n"

 def subtractive_css_selector(css_selector, html_content):
     soup = BeautifulSoup(html_content, "html.parser")
@@ -46,29 +43,25 @@ def element_removal(selectors: List[str], html_content):

 # Return str Utf-8 of matched rules
-def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False):
+def xpath_filter(xpath_filter, html_content):
     from lxml import etree, html

     tree = html.fromstring(bytes(html_content, encoding='utf-8'))
     html_block = ""

     r = tree.xpath(xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'})
+    if len(html_content) > 0 and len(r) == 0:
+        raise FilterNotFoundInResponse(xpath_filter)

     #@note: //title/text() wont work where <title>CDATA..
     for element in r:
-        # When there's more than 1 match, then add the suffix to separate each line
-        # And where the matched result doesn't include something that will cause Inscriptis to add a newline
-        # (This way each 'match' reliably has a new-line in the diff)
-        # Divs are converted to 4 whitespaces by inscriptis
-        if append_pretty_line_formatting and len(html_block) and (not hasattr( element, 'tag' ) or not element.tag in (['br', 'hr', 'div', 'p'])):
-            html_block += TEXT_FILTER_LIST_LINE_SUFFIX
-
         if type(element) == etree._ElementStringResult:
-            html_block += str(element)
+            html_block += str(element) + "<br/>"
         elif type(element) == etree._ElementUnicodeResult:
-            html_block += str(element)
+            html_block += str(element) + "<br/>"
         else:
-            html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
+            html_block += etree.tostring(element, pretty_print=True).decode('utf-8') + "<br/>"

     return html_block
@@ -92,18 +85,9 @@ def _parse_json(json_data, json_filter):
         jsonpath_expression = parse(json_filter.replace('json:', ''))
         match = jsonpath_expression.find(json_data)
         return _get_stripped_text_from_json_match(match)

     if 'jq:' in json_filter:
-        try:
-            import jq
-        except ModuleNotFoundError:
-            # `jq` requires full compilation in windows and so isn't generally available
-            raise Exception("jq not support not found")
-
         jq_expression = jq.compile(json_filter.replace('jq:', ''))
         match = jq_expression.input(json_data).all()
         return _get_stripped_text_from_json_match(match)

 def _get_stripped_text_from_json_match(match):
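The `TEXT_FILTER_LIST_LINE_SUFFIX = "<br/>"` logic in the diff above exists so that each filter match lands on its own line once Inscriptis renders the extracted HTML to text; without it, adjacent inline matches merge and diffs get noisier. A small sketch of why:

```python
from inscriptis import get_text

matches = ["<span>10.99</span>", "<span>In stock</span>"]

# Spans are inline elements, so Inscriptis renders them as one run of text
print(get_text("".join(matches)))       # "10.99In stock"  - one merged line

# A <br/> between matches forces a line break per match
print(get_text("<br/>".join(matches)))  # "10.99" and "In stock" on separate lines
```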


@@ -103,12 +103,12 @@ class import_distill_io_json(Importer):
                 pass
             except IndexError:
                 pass
-            extras['include_filters'] = []
             try:
+                extras['css_filter'] = d_config['selections'][0]['frames'][0]['includes'][0]['expr']
                 if d_config['selections'][0]['frames'][0]['includes'][0]['type'] == 'xpath':
-                    extras['include_filters'].append('xpath:' + d_config['selections'][0]['frames'][0]['includes'][0]['expr'])
-                else:
-                    extras['include_filters'].append(d_config['selections'][0]['frames'][0]['includes'][0]['expr'])
+                    extras['css_filter'] = 'xpath:' + extras['css_filter']
             except KeyError:
                 pass
             except IndexError:


@@ -1,8 +1,6 @@
from distutils.util import strtobool
import logging
import os import os
import time import uuid as uuid_builder
import uuid from distutils.util import strtobool
minimum_seconds_recheck_time = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 60)) minimum_seconds_recheck_time = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 60))
mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7} mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7}
@@ -16,43 +14,42 @@ class model(dict):
__newest_history_key = None __newest_history_key = None
__history_n=0 __history_n=0
__base_config = { __base_config = {
#'history': {}, # Dict of timestamp and output stripped filename (removed) 'url': None,
#'newest_history_key': 0, (removed, taken from history.txt index) 'tag': None,
'body': None,
'check_unique_lines': False, # On change-detected, compare against all history if its something new
'check_count': 0,
'consecutive_filter_failures': 0, # Every time the CSS/xPath filter cannot be located, reset when all is fine.
'extract_text': [], # Extract text by regex after filters
'extract_title_as_title': False,
'fetch_backend': None,
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
'headers': {}, # Extra headers to send
'ignore_text': [], # List of text to ignore when calculating the comparison checksum
'include_filters': [],
'last_checked': 0, 'last_checked': 0,
'last_error': False, 'paused': False,
'last_viewed': 0, # history key value of the last viewed via the [diff] link 'last_viewed': 0, # history key value of the last viewed via the [diff] link
#'newest_history_key': 0,
'title': None,
'previous_md5': False,
'uuid': str(uuid_builder.uuid4()),
'headers': {}, # Extra headers to send
'body': None,
'method': 'GET', 'method': 'GET',
# Custom notification content #'history': {}, # Dict of timestamp and output stripped filename
'ignore_text': [], # List of text to ignore when calculating the comparison checksum
# Custom notification content
'notification_urls': [], # List of URLs to add to the notification Queue (Usually AppRise)
'notification_title': None,
'notification_body': None, 'notification_body': None,
'notification_format': default_notification_format_for_watch, 'notification_format': default_notification_format_for_watch,
'notification_muted': False, 'notification_muted': False,
'notification_title': None, 'css_filter': '',
'notification_urls': [], # List of URLs to add to the notification Queue (Usually AppRise) 'last_error': False,
'paused': False, 'extract_text': [], # Extract text by regex after filters
'previous_md5': False,
'proxy': None, # Preferred proxy connection
'subtractive_selectors': [], 'subtractive_selectors': [],
'tag': None, 'trigger_text': [], # List of text or regex to wait for until a change is detected
'text_should_not_be_present': [], # Text that should not present 'text_should_not_be_present': [], # Text that should not present
'fetch_backend': None,
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
'consecutive_filter_failures': 0, # Every time the CSS/xPath filter cannot be located, reset when all is fine.
'extract_title_as_title': False,
'check_unique_lines': False, # On change-detected, compare against all history if its something new
'proxy': None, # Preferred proxy connection
# Re #110, so then if this is set to None, we know to use the default value instead # Re #110, so then if this is set to None, we know to use the default value instead
# Requires setting to None on submit if it's the same as the default # Requires setting to None on submit if it's the same as the default
# Should be all None by default, so we use the system default in this case. # Should be all None by default, so we use the system default in this case.
'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None}, 'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
'title': None,
'trigger_text': [], # List of text or regex to wait for until a change is detected
'url': None,
'uuid': str(uuid.uuid4()),
'webdriver_delay': None, 'webdriver_delay': None,
'webdriver_js_execute_code': None, # Run before change-detection 'webdriver_js_execute_code': None, # Run before change-detection
} }
@@ -63,7 +60,7 @@ class model(dict):
self.update(self.__base_config) self.update(self.__base_config)
self.__datastore_path = kw['datastore_path'] self.__datastore_path = kw['datastore_path']
self['uuid'] = str(uuid.uuid4()) self['uuid'] = str(uuid_builder.uuid4())
del kw['datastore_path'] del kw['datastore_path']
@@ -85,19 +82,10 @@ class model(dict):
return False return False
def ensure_data_dir_exists(self): def ensure_data_dir_exists(self):
if not os.path.isdir(self.watch_data_dir): target_path = os.path.join(self.__datastore_path, self['uuid'])
print ("> Creating data dir {}".format(self.watch_data_dir)) if not os.path.isdir(target_path):
os.mkdir(self.watch_data_dir) print ("> Creating data dir {}".format(target_path))
os.mkdir(target_path)
@property
def link(self):
url = self.get('url', '')
if '{%' in url or '{{' in url:
from jinja2 import Environment
# Jinja2 available in URLs along with https://pypi.org/project/jinja2-time/
jinja2_env = Environment(extensions=['jinja2_time.TimeExtension'])
return str(jinja2_env.from_string(url).render())
return url
@property @property
def label(self): def label(self):
@@ -121,40 +109,16 @@ class model(dict):
@property @property
def history(self): def history(self):
"""History index is just a text file as a list
{watch-uuid}/history.txt
contains a list like
{epoch-time},{filename}\n
We read in this list as the history information
"""
tmp_history = {} tmp_history = {}
import logging
import time
# Read the history file as a dict # Read the history file as a dict
fname = os.path.join(self.watch_data_dir, "history.txt") fname = os.path.join(self.__datastore_path, self.get('uuid'), "history.txt")
if os.path.isfile(fname): if os.path.isfile(fname):
logging.debug("Reading history index " + str(time.time())) logging.debug("Reading history index " + str(time.time()))
with open(fname, "r") as f: with open(fname, "r") as f:
for i in f.readlines(): tmp_history = dict(i.strip().split(',', 2) for i in f.readlines())
if ',' in i:
k, v = i.strip().split(',', 2)
# The index history could contain a relative path, so we need to make the fullpath
# so that python can read it
if not '/' in v and not '\'' in v:
v = os.path.join(self.watch_data_dir, v)
else:
# It's possible that they moved the datadir on older versions
# So the snapshot exists but is in a different path
snapshot_fname = v.split('/')[-1]
proposed_new_path = os.path.join(self.watch_data_dir, snapshot_fname)
if not os.path.exists(v) and os.path.exists(proposed_new_path):
v = proposed_new_path
tmp_history[k] = v
if len(tmp_history): if len(tmp_history):
self.__newest_history_key = list(tmp_history.keys())[-1] self.__newest_history_key = list(tmp_history.keys())[-1]
@@ -165,7 +129,7 @@ class model(dict):
@property @property
def has_history(self): def has_history(self):
fname = os.path.join(self.watch_data_dir, "history.txt") fname = os.path.join(self.__datastore_path, self.get('uuid'), "history.txt")
return os.path.isfile(fname) return os.path.isfile(fname)
# Returns the newest key, but if theres only 1 record, then it's counted as not being new, so return 0. # Returns the newest key, but if theres only 1 record, then it's counted as not being new, so return 0.
@@ -184,33 +148,31 @@ class model(dict):
# Save some text file to the appropriate path and bump the history # Save some text file to the appropriate path and bump the history
# result_obj from fetch_site_status.run() # result_obj from fetch_site_status.run()
def save_history_text(self, contents, timestamp): def save_history_text(self, contents, timestamp):
import uuid
import logging
output_path = "{}/{}".format(self.__datastore_path, self['uuid'])
self.ensure_data_dir_exists() self.ensure_data_dir_exists()
# Small hack so that we sleep just enough to allow 1 second between history snapshots snapshot_fname = "{}/{}.stripped.txt".format(output_path, uuid.uuid4())
# this is because history.txt indexes/keys snapshots by epoch seconds and we dont want dupe keys logging.debug("Saving history text {}".format(snapshot_fname))
if self.__newest_history_key and int(timestamp) == int(self.__newest_history_key):
time.sleep(timestamp - self.__newest_history_key)
snapshot_fname = "{}.txt".format(str(uuid.uuid4())) with open(snapshot_fname, 'wb') as f:
# in /diff/ and /preview/ we are going to assume for now that it's UTF-8 when reading
# most sites are utf-8 and some are even broken utf-8
with open(os.path.join(self.watch_data_dir, snapshot_fname), 'wb') as f:
f.write(contents) f.write(contents)
f.close() f.close()
# Append to index # Append to index
# @todo check last char was \n # @todo check last char was \n
index_fname = os.path.join(self.watch_data_dir, "history.txt") index_fname = "{}/history.txt".format(output_path)
with open(index_fname, 'a') as f: with open(index_fname, 'a') as f:
f.write("{},{}\n".format(timestamp, snapshot_fname)) f.write("{},{}\n".format(timestamp, snapshot_fname))
f.close() f.close()
self.__newest_history_key = timestamp self.__newest_history_key = timestamp
self.__history_n += 1 self.__history_n+=1
# @todo bump static cache of the last timestamp so we dont need to examine the file to set a proper ''viewed'' status #@todo bump static cache of the last timestamp so we dont need to examine the file to set a proper ''viewed'' status
return snapshot_fname return snapshot_fname
@property @property
@@ -243,14 +205,14 @@ class model(dict):
return not local_lines.issubset(existing_history) return not local_lines.issubset(existing_history)
def get_screenshot(self): def get_screenshot(self):
fname = os.path.join(self.watch_data_dir, "last-screenshot.png") fname = os.path.join(self.__datastore_path, self['uuid'], "last-screenshot.png")
if os.path.isfile(fname): if os.path.isfile(fname):
return fname return fname
return False return False
def __get_file_ctime(self, filename): def __get_file_ctime(self, filename):
fname = os.path.join(self.watch_data_dir, filename) fname = os.path.join(self.__datastore_path, self['uuid'], filename)
if os.path.isfile(fname): if os.path.isfile(fname):
return int(os.path.getmtime(fname)) return int(os.path.getmtime(fname))
return False return False
@@ -275,14 +237,9 @@ class model(dict):
def snapshot_error_screenshot_ctime(self): def snapshot_error_screenshot_ctime(self):
return self.__get_file_ctime('last-error-screenshot.png') return self.__get_file_ctime('last-error-screenshot.png')
@property
def watch_data_dir(self):
# The base dir of the watch data
return os.path.join(self.__datastore_path, self['uuid'])
def get_error_text(self): def get_error_text(self):
"""Return the text saved from a previous request that resulted in a non-200 error""" """Return the text saved from a previous request that resulted in a non-200 error"""
fname = os.path.join(self.watch_data_dir, "last-error.txt") fname = os.path.join(self.__datastore_path, self['uuid'], "last-error.txt")
if os.path.isfile(fname): if os.path.isfile(fname):
with open(fname, 'r') as f: with open(fname, 'r') as f:
return f.read() return f.read()
@@ -290,7 +247,7 @@ class model(dict):
def get_error_snapshot(self): def get_error_snapshot(self):
"""Return path to the screenshot that resulted in a non-200 error""" """Return path to the screenshot that resulted in a non-200 error"""
fname = os.path.join(self.watch_data_dir, "last-error-screenshot.png") fname = os.path.join(self.__datastore_path, self['uuid'], "last-error-screenshot.png")
if os.path.isfile(fname): if os.path.isfile(fname):
return fname return fname
return False return False
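
The new watch_data_dir property above centralizes every path that was previously built ad hoc from self.__datastore_path plus the UUID, and save_history_text() now writes uuid4-named snapshots indexed in history.txt as "timestamp,filename" lines. A minimal sketch of reading that index back, including the same fallback for snapshots recorded under an older datastore path (illustrative helper name, not the project's actual API):

    import os

    def load_history_index(watch_data_dir):
        # history.txt maps epoch seconds -> snapshot filename, one CSV line each
        history = {}
        index_fname = os.path.join(watch_data_dir, "history.txt")
        if not os.path.isfile(index_fname):
            return history
        with open(index_fname) as f:
            for line in f:
                line = line.strip()
                if not line or ',' not in line:
                    continue
                timestamp, snapshot_fname = line.split(',', 1)
                # Newer entries are bare filenames relative to the watch's data
                # dir; older entries may carry a full (possibly moved) path.
                if '/' not in snapshot_fname:
                    snapshot_fname = os.path.join(watch_data_dir, snapshot_fname)
                history[timestamp] = snapshot_fname
        return history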

View File

@@ -9,8 +9,6 @@
# exit when any command fails # exit when any command fails
set -e set -e
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
find tests/test_*py -type f|while read test_name find tests/test_*py -type f|while read test_name
do do
echo "TEST RUNNING $test_name" echo "TEST RUNNING $test_name"
@@ -24,6 +22,7 @@ echo "RUNNING WITH BASE_URL SET"
export BASE_URL="https://really-unique-domain.io" export BASE_URL="https://really-unique-domain.io"
pytest tests/test_notification.py pytest tests/test_notification.py
# Now for the selenium and playwright/browserless fetchers # Now for the selenium and playwright/browserless fetchers
# Note - this is not UI functional tests - just checking that each one can fetch the content # Note - this is not UI functional tests - just checking that each one can fetch the content
@@ -39,9 +38,7 @@ docker kill $$-test_selenium
echo "TESTING WEBDRIVER FETCH > PLAYWRIGHT/BROWSERLESS..." echo "TESTING WEBDRIVER FETCH > PLAYWRIGHT/BROWSERLESS..."
# Not all platforms support playwright (not ARM/rPI), so it's not packaged in requirements.txt # Not all platforms support playwright (not ARM/rPI), so it's not packaged in requirements.txt
PLAYWRIGHT_VERSION=$(grep -i -E "RUN pip install.+" "$SCRIPT_DIR/../Dockerfile" | grep --only-matching -i -E "playwright[=><~+]+[0-9\.]+") pip3 install playwright~=1.24
echo "using $PLAYWRIGHT_VERSION"
pip3 install "$PLAYWRIGHT_VERSION"
docker run -d --name $$-test_browserless -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm -p 3000:3000 --shm-size="2g" browserless/chrome:1.53-chrome-stable docker run -d --name $$-test_browserless -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm -p 3000:3000 --shm-size="2g" browserless/chrome:1.53-chrome-stable
# takes a while to spin up # takes a while to spin up
sleep 5 sleep 5
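
The PLAYWRIGHT_VERSION extraction above keeps the test environment pinned to whatever the Dockerfile installs, instead of a hardcoded playwright~=1.24. The same parse expressed in Python, as a sketch assuming a "RUN pip install ... playwright~=x.y" style line exists in the Dockerfile:

    import re

    def playwright_pin_from_dockerfile(path="Dockerfile"):
        # Mirrors: grep -i -E "RUN pip install.+" | grep -oiE "playwright[=><~+]+[0-9\.]+"
        with open(path) as f:
            for line in f:
                if re.search(r"RUN pip install", line, re.I):
                    match = re.search(r"playwright[=><~+]+[0-9.]+", line, re.I)
                    if match:
                        return match.group(0)  # e.g. "playwright~=1.27"
        return None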

View File

@@ -50,7 +50,7 @@ $(document).ready(function() {
state_clicked=false; state_clicked=false;
ctx.clearRect(0, 0, c.width, c.height); ctx.clearRect(0, 0, c.width, c.height);
xctx.clearRect(0, 0, c.width, c.height); xctx.clearRect(0, 0, c.width, c.height);
$("#include_filters").val(''); $("#css_filter").val('');
}); });
@@ -68,7 +68,7 @@ $(document).ready(function() {
xctx = c.getContext("2d"); xctx = c.getContext("2d");
// redline highlight context // redline highlight context
ctx = c.getContext("2d"); ctx = c.getContext("2d");
current_default_xpath =$("#include_filters").val(); current_default_xpath =$("#css_filter").val();
fetch_data(); fetch_data();
$('#selector-canvas').off("mousemove mousedown"); $('#selector-canvas').off("mousemove mousedown");
// screenshot_url defined in the edit.html template // screenshot_url defined in the edit.html template
@@ -205,9 +205,9 @@ $(document).ready(function() {
var sel = selector_data['size_pos'][current_selected_i]; var sel = selector_data['size_pos'][current_selected_i];
if (sel[0] == '/') { if (sel[0] == '/') {
// @todo - not sure just checking / is right // @todo - not sure just checking / is right
$("#include_filters").val('xpath:'+sel.xpath); $("#css_filter").val('xpath:'+sel.xpath);
} else { } else {
$("#include_filters").val(sel.xpath); $("#css_filter").val(sel.xpath);
} }
xctx.fillStyle = 'rgba(205,205,205,0.95)'; xctx.fillStyle = 'rgba(205,205,205,0.95)';
xctx.strokeStyle = 'rgba(225,0,0,0.9)'; xctx.strokeStyle = 'rgba(225,0,0,0.9)';

View File

@@ -156,7 +156,7 @@ body:after, body:before {
.fetch-error { .fetch-error {
padding-top: 1em; padding-top: 1em;
font-size: 80%; font-size: 60%;
max-width: 400px; max-width: 400px;
display: block; display: block;
} }
@@ -803,4 +803,4 @@ ul {
padding: 0.5rem; padding: 0.5rem;
border-radius: 5px; border-radius: 5px;
color: #ff3300; color: #ff3300;
} }

View File

@@ -27,18 +27,17 @@ class ChangeDetectionStore:
# For when we edit, we should write to disk # For when we edit, we should write to disk
needs_write_urgent = False needs_write_urgent = False
__version_check = True
def __init__(self, datastore_path="/datastore", include_default_watches=True, version_tag="0.0.0"): def __init__(self, datastore_path="/datastore", include_default_watches=True, version_tag="0.0.0"):
# Should only be active for docker # Should only be active for docker
# logging.basicConfig(filename='/dev/stdout', level=logging.INFO) # logging.basicConfig(filename='/dev/stdout', level=logging.INFO)
self.__data = App.model() self.needs_write = False
self.datastore_path = datastore_path self.datastore_path = datastore_path
self.json_store_path = "{}/url-watches.json".format(self.datastore_path) self.json_store_path = "{}/url-watches.json".format(self.datastore_path)
self.needs_write = False
self.proxy_list = None self.proxy_list = None
self.start_time = time.time()
self.stop_thread = False self.stop_thread = False
self.__data = App.model()
# Base definition for all watchers # Base definition for all watchers
# deepcopy part of #569 - not sure why it's needed exactly # deepcopy part of #569 - not sure why it's needed exactly
self.generic_definition = deepcopy(Watch.model(datastore_path = datastore_path, default={})) self.generic_definition = deepcopy(Watch.model(datastore_path = datastore_path, default={}))
@@ -82,13 +81,8 @@ class ChangeDetectionStore:
except (FileNotFoundError, json.decoder.JSONDecodeError): except (FileNotFoundError, json.decoder.JSONDecodeError):
if include_default_watches: if include_default_watches:
print("Creating JSON store at", self.datastore_path) print("Creating JSON store at", self.datastore_path)
self.add_watch(url='https://news.ycombinator.com/', self.add_watch(url='https://news.ycombinator.com/', tag='Tech news')
tag='Tech news', self.add_watch(url='https://changedetection.io/CHANGELOG.txt', tag='changedetection.io')
extras={'fetch_backend': 'html_requests'})
self.add_watch(url='https://changedetection.io/CHANGELOG.txt',
tag='changedetection.io',
extras={'fetch_backend': 'html_requests'})
self.__data['version_tag'] = version_tag self.__data['version_tag'] = version_tag
@@ -272,7 +266,7 @@ class ChangeDetectionStore:
extras = {} extras = {}
# should always be str # should always be str
if tag is None or not tag: if tag is None or not tag:
tag = '' tag=''
# In case these are copied across, assume it's a reference and deepcopy() # In case these are copied across, assume it's a reference and deepcopy()
apply_extras = deepcopy(extras) apply_extras = deepcopy(extras)
@@ -287,31 +281,17 @@ class ChangeDetectionStore:
res = r.json() res = r.json()
# List of permissible attributes we accept from the wild internet # List of permissible attributes we accept from the wild internet
for k in [ for k in ['url', 'tag',
'body', 'paused', 'title',
'css_filter', 'previous_md5', 'headers',
'extract_text', 'body', 'method',
'extract_title_as_title', 'ignore_text', 'css_filter',
'headers', 'subtractive_selectors', 'trigger_text',
'ignore_text', 'extract_title_as_title', 'extract_text',
'include_filters', 'text_should_not_be_present',
'method', 'webdriver_js_execute_code']:
'paused',
'previous_md5',
'subtractive_selectors',
'tag',
'text_should_not_be_present',
'title',
'trigger_text',
'webdriver_js_execute_code',
'url',
]:
if res.get(k): if res.get(k):
if k != 'css_filter': apply_extras[k] = res[k]
apply_extras[k] = res[k]
else:
# We renamed the field and made it a list
apply_extras['include_filters'] = [res['css_filter']]
except Exception as e: except Exception as e:
logging.error("Error fetching metadata for shared watch link", url, str(e)) logging.error("Error fetching metadata for shared watch link", url, str(e))
@@ -334,13 +314,12 @@ class ChangeDetectionStore:
del apply_extras[k] del apply_extras[k]
new_watch.update(apply_extras) new_watch.update(apply_extras)
self.__data['watching'][new_uuid] = new_watch self.__data['watching'][new_uuid]=new_watch
self.__data['watching'][new_uuid].ensure_data_dir_exists() self.__data['watching'][new_uuid].ensure_data_dir_exists()
if write_to_disk_now: if write_to_disk_now:
self.sync_to_json() self.sync_to_json()
return new_uuid return new_uuid
def visualselector_data_is_ready(self, watch_uuid): def visualselector_data_is_ready(self, watch_uuid):
@@ -604,14 +583,3 @@ class ChangeDetectionStore:
for v in ['User-Agent', 'Accept', 'Accept-Encoding', 'Accept-Language']: for v in ['User-Agent', 'Accept', 'Accept-Encoding', 'Accept-Language']:
if self.data['settings']['headers'].get(v): if self.data['settings']['headers'].get(v):
del self.data['settings']['headers'][v] del self.data['settings']['headers'][v]
# Convert filters to a list of filters css_filter -> include_filters
def update_8(self):
for uuid, watch in self.data['watching'].items():
try:
existing_filter = watch.get('css_filter', '')
if existing_filter:
watch['include_filters'] = [existing_filter]
except:
continue
return
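
update_8() is the datastore migration behind the whole rename in this changeset: the legacy single-string css_filter becomes the list-valued include_filters, and the shared-watch import above applies the same conversion for links created by older versions. The conversion in isolation, with illustrative data:

    def migrate_css_filter(watch):
        # Legacy single CSS/XPath string -> new list of include filters
        existing_filter = watch.get('css_filter', '')
        if existing_filter:
            watch['include_filters'] = [existing_filter]
        return watch

    watch = {'url': 'https://example.com', 'css_filter': '#price'}
    assert migrate_css_filter(watch)['include_filters'] == ['#price']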

View File

@@ -40,8 +40,7 @@
<fieldset> <fieldset>
<div class="pure-control-group"> <div class="pure-control-group">
{{ render_field(form.url, placeholder="https://...", required=true, class="m-d") }} {{ render_field(form.url, placeholder="https://...", required=true, class="m-d") }}
<span class="pure-form-message-inline">Some sites use JavaScript to create the content, for this you should <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Fetching-pages-with-WebDriver">use the Chrome/WebDriver Fetcher</a></span><br/> <span class="pure-form-message-inline">Some sites use JavaScript to create the content, for this you should <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Fetching-pages-with-WebDriver">use the Chrome/WebDriver Fetcher</a></span>
<span class="pure-form-message-inline">You can use variables in the URL, perfect for inserting the current date and other logic, <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Handling-variables-in-the-watched-URL">help and examples here</a></span><br/>
</div> </div>
<div class="pure-control-group"> <div class="pure-control-group">
{{ render_field(form.title, class="m-d") }} {{ render_field(form.title, class="m-d") }}
@@ -174,27 +173,21 @@ User-Agent: wonderbra 1.0") }}
</div> </div>
</fieldset> </fieldset>
<div class="pure-control-group"> <div class="pure-control-group">
{% set field = render_field(form.include_filters, {% set field = render_field(form.css_filter,
rows=5, placeholder=".class-name or #some-id, or other CSS selector rule.",
placeholder="#example
xpath://body/div/span[contains(@class, 'example-class')]",
class="m-d") class="m-d")
%} %}
{{ field }} {{ field }}
{% if '/text()' in field %} {% if '/text()' in field %}
<span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the &lt;element&gt; contains &lt;![CDATA[]]&gt;</strong></span><br/> <span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the &lt;element&gt; contains &lt;![CDATA[]]&gt;</strong></span><br/>
{% endif %} {% endif %}
<span class="pure-form-message-inline">One rule per line, <i>any</i> rules that matches will be used.<br/> <span class="pure-form-message-inline">
<ul> <ul>
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li> <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
<li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a> (if installed). <li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a>.
<ul> <ul>
<li>JSONPath: Prefix with <code>json:</code>, use <code>json:$</code> to force re-formatting if required, <a href="https://jsonpath.com/" target="new">test your JSONPath here</a>.</li> <li>JSONPath: Prefix with <code>json:</code>, use <code>json:$</code> to force re-formatting if required, <a href="https://jsonpath.com/" target="new">test your JSONPath here</a>.</li>
{% if jq_support %}
<li>jq: Prefix with <code>jq:</code> and <a href="https://jqplay.org/" target="new">test your jq here</a>. Using <a href="https://stedolan.github.io/jq/" target="new">jq</a> allows for complex filtering and processing of JSON data with built-in functions, regex, filtering, and more. See examples and documentation <a href="https://stedolan.github.io/jq/manual/" target="new">here</a>.</li> <li>jq: Prefix with <code>jq:</code> and <a href="https://jqplay.org/" target="new">test your jq here</a>. Using <a href="https://stedolan.github.io/jq/" target="new">jq</a> allows for complex filtering and processing of JSON data with built-in functions, regex, filtering, and more. See examples and documentation <a href="https://stedolan.github.io/jq/manual/" target="new">here</a>.</li>
{% else %}
<li>jq support not installed</li>
{% endif %}
</ul> </ul>
</li> </li>
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash, <li>XPath - Limit text to this XPath rule, simply start with a forward-slash,
@@ -205,7 +198,7 @@ xpath://body/div/span[contains(@class, 'example-class')]",
</ul> </ul>
</li> </li>
</ul> </ul>
Please be sure that you thoroughly understand how to write CSS, JSONPath, XPath{% if jq_support %}, or jq selector{%endif%} rules before filing an issue on GitHub! <a Please be sure that you thoroughly understand how to write CSS, JSONPath, XPath, or jq selector rules before filing an issue on GitHub! <a
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/> href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
</span> </span>
</div> </div>
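
The template now branches on a jq_support flag, so installations without the optional jq package see "jq support not installed" rather than advice they cannot follow. The flag can be derived with the same guarded import the updated tests use (sketch):

    # Optional-dependency probe, as used in the test suite further below:
    jq_support = True
    try:
        import jq  # not installable on every platform (e.g. Windows)
    except ModuleNotFoundError:
        jq_support = False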

View File

@@ -87,7 +87,7 @@
<a class="state-{{'on' if watch.notification_muted}}" href="{{url_for('index', op='mute', uuid=watch.uuid, tag=active_tag)}}"><img src="{{url_for('static_content', group='images', filename='bell-off.svg')}}" alt="Mute notifications" title="Mute notifications"/></a> <a class="state-{{'on' if watch.notification_muted}}" href="{{url_for('index', op='mute', uuid=watch.uuid, tag=active_tag)}}"><img src="{{url_for('static_content', group='images', filename='bell-off.svg')}}" alt="Mute notifications" title="Mute notifications"/></a>
</td> </td>
<td class="title-col inline">{{watch.title if watch.title is not none and watch.title|length > 0 else watch.url}} <td class="title-col inline">{{watch.title if watch.title is not none and watch.title|length > 0 else watch.url}}
<a class="external" target="_blank" rel="noopener" href="{{ watch.link.replace('source:','') }}"></a> <a class="external" target="_blank" rel="noopener" href="{{ watch.url.replace('source:','') }}"></a>
<a href="{{url_for('form_share_put_watch', uuid=watch.uuid)}}"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread.svg')}}" /></a> <a href="{{url_for('form_share_put_watch', uuid=watch.uuid)}}"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread.svg')}}" /></a>
{%if watch.fetch_backend == "html_webdriver" %}<img style="height: 1em; display:inline-block;" src="{{url_for('static_content', group='images', filename='Google-Chrome-icon.png')}}" />{% endif %} {%if watch.fetch_backend == "html_webdriver" %}<img style="height: 1em; display:inline-block;" src="{{url_for('static_content', group='images', filename='Google-Chrome-icon.png')}}" />{% endif %}

View File

@@ -41,7 +41,7 @@ def app(request):
cleanup(datastore_path) cleanup(datastore_path)
app_config = {'datastore_path': datastore_path, 'disable_checkver' : True} app_config = {'datastore_path': datastore_path}
cleanup(app_config['datastore_path']) cleanup(app_config['datastore_path'])
datastore = store.ChangeDetectionStore(datastore_path=app_config['datastore_path'], include_default_watches=False) datastore = store.ChangeDetectionStore(datastore_path=app_config['datastore_path'], include_default_watches=False)
app = changedetection_app(app_config, datastore) app = changedetection_app(app_config, datastore)

View File

@@ -24,7 +24,7 @@ def test_preferred_proxy(client, live_server):
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={ data={
"include_filters": "", "css_filter": "",
"fetch_backend": "html_requests", "fetch_backend": "html_requests",
"headers": "", "headers": "",
"proxy": "proxy-two", "proxy": "proxy-two",

View File

@@ -147,16 +147,6 @@ def test_api_simple(client, live_server):
# @todo how to handle None/default global values? # @todo how to handle None/default global values?
assert watch['history_n'] == 2, "Found replacement history section, which is in its own API" assert watch['history_n'] == 2, "Found replacement history section, which is in its own API"
# basic systeminfo check
res = client.get(
url_for("systeminfo"),
headers={'x-api-key': api_key},
)
info = json.loads(res.data)
assert info.get('watch_count') == 1
assert info.get('uptime') > 0.5
# Finally delete the watch # Finally delete the watch
res = client.delete( res = client.delete(
url_for("watch", uuid=watch_uuid), url_for("watch", uuid=watch_uuid),

View File

@@ -23,7 +23,7 @@ def test_basic_auth(client, live_server):
# Check form validation # Check form validation
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"include_filters": "", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, data={"css_filter": "", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
follow_redirects=True follow_redirects=True
) )
assert b"Updated watch." in res.data assert b"Updated watch." in res.data

View File

@@ -3,7 +3,7 @@
import time import time
from flask import url_for from flask import url_for
from urllib.request import urlopen from urllib.request import urlopen
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks from .util import set_original_response, set_modified_response, live_server_setup
sleep_time_for_fetch_thread = 3 sleep_time_for_fetch_thread = 3
@@ -36,7 +36,7 @@ def test_check_basic_change_detection_functionality(client, live_server):
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up # Give the thread time to pick it up
wait_for_all_checks(client) time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'unviewed' class) # It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index")) res = client.get(url_for("index"))
@@ -69,7 +69,7 @@ def test_check_basic_change_detection_functionality(client, live_server):
res = client.get(url_for("form_watch_checknow"), follow_redirects=True) res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
assert b'1 watches are queued for rechecking.' in res.data assert b'1 watches are queued for rechecking.' in res.data
wait_for_all_checks(client) time.sleep(sleep_time_for_fetch_thread)
# Now something should be ready, indicated by having a 'unviewed' class # Now something should be ready, indicated by having a 'unviewed' class
res = client.get(url_for("index")) res = client.get(url_for("index"))
@@ -98,14 +98,14 @@ def test_check_basic_change_detection_functionality(client, live_server):
assert b'which has this one new line' in res.data assert b'which has this one new line' in res.data
assert b'Which is across multiple lines' not in res.data assert b'Which is across multiple lines' not in res.data
wait_for_all_checks(client) time.sleep(2)
# Do this a few times to ensure we don't accidentally set the status # Do this a few times to ensure we don't accidentally set the status
for n in range(2): for n in range(2):
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up # Give the thread time to pick it up
wait_for_all_checks(client) time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'unviewed' class) # It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index")) res = client.get(url_for("index"))
@@ -125,7 +125,7 @@ def test_check_basic_change_detection_functionality(client, live_server):
) )
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client) time.sleep(sleep_time_for_fetch_thread)
res = client.get(url_for("index")) res = client.get(url_for("index"))
assert b'unviewed' in res.data assert b'unviewed' in res.data

View File

@@ -1,31 +1,18 @@
#!/usr/bin/python3 #!/usr/bin/python3
from .util import set_original_response, set_modified_response, live_server_setup import time
from flask import url_for from flask import url_for
from urllib.request import urlopen from urllib.request import urlopen
from zipfile import ZipFile from . util import set_original_response, set_modified_response, live_server_setup
import re
import time
def test_backup(client, live_server): def test_backup(client, live_server):
live_server_setup(live_server)
set_original_response() live_server_setup(live_server)
# Give the endpoint time to spin up # Give the endpoint time to spin up
time.sleep(1) time.sleep(1)
# Add our URL to the import page
res = client.post(
url_for("import_page"),
data={"urls": url_for('test_endpoint', _external=True)},
follow_redirects=True
)
assert b"1 Imported" in res.data
time.sleep(3)
res = client.get( res = client.get(
url_for("get_backup"), url_for("get_backup"),
follow_redirects=True follow_redirects=True
@@ -33,19 +20,6 @@ def test_backup(client, live_server):
# Should get the right zip content type # Should get the right zip content type
assert res.content_type == "application/zip" assert res.content_type == "application/zip"
# Should be PK/ZIP stream # Should be PK/ZIP stream
assert res.data.count(b'PK') >= 2 assert res.data.count(b'PK') >= 2
# ZipFile from buffer seems non-obvious, just save it instead
with open("download.zip", 'wb') as f:
f.write(res.data)
zip = ZipFile('download.zip')
l = zip.namelist()
uuid4hex = re.compile('^[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}.*txt', re.I)
newlist = list(filter(uuid4hex.match, l)) # Read Note below
# Should be two txt files in the archive (history and the snapshot)
assert len(newlist) == 2
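
The rebuilt backup test imports a watch before downloading the zip, then verifies the archive really contains the uuid4-named text snapshots rather than only checking for a PK/ZIP header. A quick illustration of what that uuid4hex pattern accepts (filename below is illustrative):

    import re

    uuid4hex = re.compile(
        '^[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}.*txt',
        re.I)

    # A uuid4-named snapshot matches; unrelated names do not:
    assert uuid4hex.match('5c9c4eb6-f2a7-4a6f-8e5e-9a1c2b3d4e5f.txt')
    assert not uuid4hex.match('url-watches.json')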

View File

@@ -46,23 +46,22 @@ def set_modified_response():
# Test that the CSS extraction works as expected; important here is the correct placement of newlines (\n) # Test that the CSS extraction works as expected; important here is the correct placement of newlines (\n)
def test_include_filters_output(): def test_css_filter_output():
from changedetectionio import fetch_site_status
from inscriptis import get_text from inscriptis import get_text
# Check text with sub-parts renders correctly # Check text with sub-parts renders correctly
content = """<html> <body><div id="thingthing" > Some really <b>bold</b> text </div> </body> </html>""" content = """<html> <body><div id="thingthing" > Some really <b>bold</b> text </div> </body> </html>"""
html_blob = include_filters(include_filters="#thingthing", html_content=content) html_blob = css_filter(css_filter="#thingthing", html_content=content)
text = get_text(html_blob) text = get_text(html_blob)
assert text == " Some really bold text" assert text == " Some really bold text"
content = """<html> <body> content = """<html> <body>
<p>foo bar blah</p> <p>foo bar blah</p>
<DIV class="parts">Block A</DiV> <div class="parts">Block B</DIV></body> <div class="parts">Block A</div> <div class="parts">Block B</div></body>
</html> </html>
""" """
html_blob = css_filter(css_filter=".parts", html_content=content)
# in xPath this would be //*[@class='parts']
html_blob = include_filters(include_filters=".parts", html_content=content)
text = get_text(html_blob) text = get_text(html_blob)
# Divs are converted to 4 whitespaces by inscriptis # Divs are converted to 4 whitespaces by inscriptis
@@ -70,10 +69,10 @@ def test_include_filters_output():
# Tests the whole stack works with the CSS Filter # Tests the whole stack works with the CSS Filter
def test_check_markup_include_filters_restriction(client, live_server): def test_check_markup_css_filter_restriction(client, live_server):
sleep_time_for_fetch_thread = 3 sleep_time_for_fetch_thread = 3
include_filters = "#sametext" css_filter = "#sametext"
set_original_response() set_original_response()
@@ -99,7 +98,7 @@ def test_check_markup_include_filters_restriction(client, live_server):
# Add our URL to the import page # Add our URL to the import page
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"include_filters": include_filters, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, data={"css_filter": css_filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
follow_redirects=True follow_redirects=True
) )
assert b"Updated watch." in res.data assert b"Updated watch." in res.data
@@ -108,7 +107,7 @@ def test_check_markup_include_filters_restriction(client, live_server):
res = client.get( res = client.get(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
) )
assert bytes(include_filters.encode('utf-8')) in res.data assert bytes(css_filter.encode('utf-8')) in res.data
# Trigger a check # Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
@@ -127,58 +126,3 @@ def test_check_markup_include_filters_restriction(client, live_server):
# Because it should be looking at only that 'sametext' id # Because it should be looking at only that 'sametext' id
res = client.get(url_for("index")) res = client.get(url_for("index"))
assert b'unviewed' in res.data assert b'unviewed' in res.data
# Tests the whole stack works with the CSS Filter
def test_check_multiple_filters(client, live_server):
sleep_time_for_fetch_thread = 3
include_filters = "#blob-a\r\nxpath://*[contains(@id,'blob-b')]"
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write("""<html><body>
<div id="blob-a">Blob A</div>
<div id="blob-b">Blob B</div>
<div id="blob-c">Blob C</div>
</body>
</html>
""")
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
time.sleep(1)
# Goto the edit page, add our ignore text
# Add our URL to the import page
res = client.post(
url_for("edit_page", uuid="first"),
data={"include_filters": include_filters,
"url": test_url,
"tag": "",
"headers": "",
'fetch_backend': "html_requests"},
follow_redirects=True
)
assert b"Updated watch." in res.data
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
# Only the two blobs should be here
assert b"Blob A" in res.data # CSS was ok
assert b"Blob B" in res.data # xPath was ok
assert b"Blob C" not in res.data # Should not be included

View File

@@ -88,7 +88,7 @@ def test_check_filter_multiline(client, live_server):
# Add our URL to the import page # Add our URL to the import page
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"include_filters": '', data={"css_filter": '',
'extract_text': '/something.+?6 billion.+?lines/si', 'extract_text': '/something.+?6 billion.+?lines/si',
"url": test_url, "url": test_url,
"tag": "", "tag": "",
@@ -116,7 +116,7 @@ def test_check_filter_multiline(client, live_server):
def test_check_filter_and_regex_extract(client, live_server): def test_check_filter_and_regex_extract(client, live_server):
sleep_time_for_fetch_thread = 3 sleep_time_for_fetch_thread = 3
include_filters = ".changetext" css_filter = ".changetext"
set_original_response() set_original_response()
@@ -143,7 +143,7 @@ def test_check_filter_and_regex_extract(client, live_server):
# Add our URL to the import page # Add our URL to the import page
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"include_filters": include_filters, data={"css_filter": css_filter,
'extract_text': '\d+ online\r\n\d+ guests\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i', 'extract_text': '\d+ online\r\n\d+ guests\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i',
"url": test_url, "url": test_url,
"tag": "", "tag": "",

View File

@@ -92,7 +92,7 @@ def test_filter_doesnt_exist_then_exists_should_get_notification(client, live_se
"tag": "my tag", "tag": "my tag",
"title": "my title", "title": "my title",
"headers": "", "headers": "",
"include_filters": '.ticket-available', "css_filter": '.ticket-available',
"fetch_backend": "html_requests"}) "fetch_backend": "html_requests"})
res = client.post( res = client.post(

View File

@@ -76,7 +76,7 @@ def run_filter_test(client, content_filter):
"title": "my title", "title": "my title",
"headers": "", "headers": "",
"filter_failure_notification_send": 'y', "filter_failure_notification_send": 'y',
"include_filters": content_filter, "css_filter": content_filter,
"fetch_backend": "html_requests"}) "fetch_backend": "html_requests"})
res = client.post( res = client.post(
@@ -95,7 +95,7 @@ def run_filter_test(client, content_filter):
time.sleep(3) time.sleep(3)
# We should see something in the frontend # We should see something in the frontend
assert b'Warning, no filters were found' in res.data assert b'Warning, filter' in res.data
# Now it should exist and contain our "filter not found" alert # Now it should exist and contain our "filter not found" alert
assert os.path.isfile("test-datastore/notification.txt") assert os.path.isfile("test-datastore/notification.txt")
@@ -131,7 +131,7 @@ def run_filter_test(client, content_filter):
def test_setup(live_server): def test_setup(live_server):
live_server_setup(live_server) live_server_setup(live_server)
def test_check_include_filters_failure_notification(client, live_server): def test_check_css_filter_failure_notification(client, live_server):
set_original_response() set_original_response()
time.sleep(1) time.sleep(1)
run_filter_test(client, '#nope-doesnt-exist') run_filter_test(client, '#nope-doesnt-exist')

View File

@@ -1,33 +0,0 @@
#!/usr/bin/python3
import time
from flask import url_for
from .util import live_server_setup
# Test that Jinja2 templating in the watch URL (e.g. the current date) is rendered before fetching
def test_jinja2_in_url_query(client, live_server):
live_server_setup(live_server)
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page
test_url = url_for('test_return_query', _external=True)
# because url_for() will URL-encode the var, but we don't here
full_url = "{}?{}".format(test_url,
"date={% now 'Europe/Berlin', '%Y' %}.{% now 'Europe/Berlin', '%m' %}.{% now 'Europe/Berlin', '%d' %}", )
res = client.post(
url_for("form_quick_watch_add"),
data={"url": full_url, "tag": "test"},
follow_redirects=True
)
assert b"Watch added" in res.data
time.sleep(3)
# The {% now %} tags should have been rendered into the fetched query string
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
assert b'date=2' in res.data
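
This new test covers the jinja2/jinja2-time URL templating that requirements.txt (further below) now pulls in: {% now %} tags in a watch URL are rendered before the fetch. The rendering step on its own, assuming the jinja2_time.TimeExtension provided by the jinja2-time package:

    from jinja2 import Environment

    env = Environment(extensions=['jinja2_time.TimeExtension'])
    url = "https://example.com/?date={% now 'Europe/Berlin', '%Y.%m.%d' %}"
    # Renders to e.g. https://example.com/?date=2022.10.09
    print(env.from_string(url).render())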

View File

@@ -5,12 +5,7 @@ import time
from flask import url_for, escape from flask import url_for, escape
from . util import live_server_setup from . util import live_server_setup
import pytest import pytest
jq_support = True
try:
import jq
except ModuleNotFoundError:
jq_support = False
def test_setup(live_server): def test_setup(live_server):
live_server_setup(live_server) live_server_setup(live_server)
@@ -45,24 +40,22 @@ and it can also be repeated
assert text == "23.5" assert text == "23.5"
# also check for jq # also check for jq
if jq_support: text = html_tools.extract_json_as_string(content, "jq:.offers.price")
text = html_tools.extract_json_as_string(content, "jq:.offers.price") assert text == "23.5"
assert text == "23.5"
text = html_tools.extract_json_as_string('{"id":5}', "jq:.id")
assert text == "5"
text = html_tools.extract_json_as_string('{"id":5}', "json:$.id") text = html_tools.extract_json_as_string('{"id":5}', "json:$.id")
assert text == "5" assert text == "5"
text = html_tools.extract_json_as_string('{"id":5}', "jq:.id")
assert text == "5"
# When nothing at all is found, it should throw JSONNOTFound # When nothing at all is found, it should throw JSONNOTFound
# Which is caught and shown to the user in the watch-overview table # Which is caught and shown to the user in the watch-overview table
with pytest.raises(html_tools.JSONNotFound) as e_info: with pytest.raises(html_tools.JSONNotFound) as e_info:
html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "json:$.id") html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "json:$.id")
if jq_support: with pytest.raises(html_tools.JSONNotFound) as e_info:
with pytest.raises(html_tools.JSONNotFound) as e_info: html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "jq:.id")
html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "jq:.id")
def set_original_ext_response(): def set_original_ext_response():
data = """ data = """
@@ -132,7 +125,7 @@ def set_original_response():
return None return None
def set_json_response_with_html(): def set_response_with_html():
test_return_data = """ test_return_data = """
{ {
"test": [ "test": [
@@ -176,7 +169,7 @@ def set_modified_response():
def test_check_json_without_filter(client, live_server): def test_check_json_without_filter(client, live_server):
# Request a JSON document from a application/json source containing HTML # Request a JSON document from a application/json source containing HTML
# and be sure it doesn't get chewed up by inscriptis # and be sure it doesn't get chewed up by inscriptis
set_json_response_with_html() set_response_with_html()
# Give the endpoint time to spin up # Give the endpoint time to spin up
time.sleep(1) time.sleep(1)
@@ -189,6 +182,9 @@ def test_check_json_without_filter(client, live_server):
follow_redirects=True follow_redirects=True
) )
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up # Give the thread time to pick it up
time.sleep(3) time.sleep(3)
@@ -197,7 +193,6 @@ def test_check_json_without_filter(client, live_server):
follow_redirects=True follow_redirects=True
) )
# Should still see '"html": "<b>"'
assert b'&#34;&lt;b&gt;' in res.data assert b'&#34;&lt;b&gt;' in res.data
assert res.data.count(b'{\n') >= 2 assert res.data.count(b'{\n') >= 2
@@ -219,6 +214,9 @@ def check_json_filter(json_filter, client, live_server):
) )
assert b"1 Imported" in res.data assert b"1 Imported" in res.data
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up # Give the thread time to pick it up
time.sleep(3) time.sleep(3)
@@ -226,7 +224,7 @@ def check_json_filter(json_filter, client, live_server):
# Add our URL to the import page # Add our URL to the import page
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"include_filters": json_filter, data={"css_filter": json_filter,
"url": test_url, "url": test_url,
"tag": "", "tag": "",
"headers": "", "headers": "",
@@ -242,6 +240,9 @@ def check_json_filter(json_filter, client, live_server):
) )
assert bytes(escape(json_filter).encode('utf-8')) in res.data assert bytes(escape(json_filter).encode('utf-8')) in res.data
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up # Give the thread time to pick it up
time.sleep(3) time.sleep(3)
# Make a change # Make a change
@@ -270,8 +271,7 @@ def test_check_jsonpath_filter(client, live_server):
check_json_filter('json:boss.name', client, live_server) check_json_filter('json:boss.name', client, live_server)
def test_check_jq_filter(client, live_server): def test_check_jq_filter(client, live_server):
if jq_support: check_json_filter('jq:.boss.name', client, live_server)
check_json_filter('jq:.boss.name', client, live_server)
def check_json_filter_bool_val(json_filter, client, live_server): def check_json_filter_bool_val(json_filter, client, live_server):
set_original_response() set_original_response()
@@ -293,7 +293,7 @@ def check_json_filter_bool_val(json_filter, client, live_server):
# Add our URL to the import page # Add our URL to the import page
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"include_filters": json_filter, data={"css_filter": json_filter,
"url": test_url, "url": test_url,
"tag": "", "tag": "",
"headers": "", "headers": "",
@@ -303,6 +303,11 @@ def check_json_filter_bool_val(json_filter, client, live_server):
) )
assert b"Updated watch." in res.data assert b"Updated watch." in res.data
time.sleep(3)
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up # Give the thread time to pick it up
time.sleep(3) time.sleep(3)
# Make a change # Make a change
@@ -324,8 +329,7 @@ def test_check_jsonpath_filter_bool_val(client, live_server):
check_json_filter_bool_val("json:$['available']", client, live_server) check_json_filter_bool_val("json:$['available']", client, live_server)
def test_check_jq_filter_bool_val(client, live_server): def test_check_jq_filter_bool_val(client, live_server):
if jq_support: check_json_filter_bool_val("jq:.available", client, live_server)
check_json_filter_bool_val("jq:.available", client, live_server)
# Re #265 - Extended JSON selector test # Re #265 - Extended JSON selector test
# Stuff to consider here # Stuff to consider here
@@ -347,6 +351,9 @@ def check_json_ext_filter(json_filter, client, live_server):
) )
assert b"1 Imported" in res.data assert b"1 Imported" in res.data
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up # Give the thread time to pick it up
time.sleep(3) time.sleep(3)
@@ -354,7 +361,7 @@ def check_json_ext_filter(json_filter, client, live_server):
# Add our URL to the import page # Add our URL to the import page
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"include_filters": json_filter, data={"css_filter": json_filter,
"url": test_url, "url": test_url,
"tag": "", "tag": "",
"headers": "", "headers": "",
@@ -370,6 +377,9 @@ def check_json_ext_filter(json_filter, client, live_server):
) )
assert bytes(escape(json_filter).encode('utf-8')) in res.data assert bytes(escape(json_filter).encode('utf-8')) in res.data
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up # Give the thread time to pick it up
time.sleep(3) time.sleep(3)
# Make a change # Make a change
@@ -398,5 +408,4 @@ def test_check_jsonpath_ext_filter(client, live_server):
check_json_ext_filter('json:$[?(@.status==Sold)]', client, live_server) check_json_ext_filter('json:$[?(@.status==Sold)]', client, live_server)
def test_check_jq_ext_filter(client, live_server): def test_check_jq_ext_filter(client, live_server):
if jq_support: check_json_ext_filter('jq:.[] | select(.status | contains("Sold"))', client, live_server)
check_json_ext_filter('jq:.[] | select(.status | contains("Sold"))', client, live_server)
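
Every jq assertion is now wrapped in "if jq_support:" because the jq bindings only build on some platforms. For reference, the raw jq calls that extract_json_as_string wraps look roughly like this (standard jq-py usage, shown here as a sketch):

    import json
    import jq  # optional dependency, guarded by jq_support in the tests

    data = json.loads('{"offers": {"price": 23.5}}')
    # ".offers.price" is the same program the test feeds through "jq:..."
    assert jq.compile('.offers.price').input(data).first() == 23.5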

View File

@@ -14,7 +14,7 @@ def test_share_watch(client, live_server):
live_server_setup(live_server) live_server_setup(live_server)
test_url = url_for('test_endpoint', _external=True) test_url = url_for('test_endpoint', _external=True)
include_filters = ".nice-filter" css_filter = ".nice-filter"
# Add our URL to the import page # Add our URL to the import page
res = client.post( res = client.post(
@@ -29,7 +29,7 @@ def test_share_watch(client, live_server):
# Add our URL to the import page # Add our URL to the import page
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"include_filters": include_filters, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, data={"css_filter": css_filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
follow_redirects=True follow_redirects=True
) )
assert b"Updated watch." in res.data assert b"Updated watch." in res.data
@@ -37,7 +37,7 @@ def test_share_watch(client, live_server):
res = client.get( res = client.get(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
) )
assert bytes(include_filters.encode('utf-8')) in res.data assert bytes(css_filter.encode('utf-8')) in res.data
# click share the link # click share the link
res = client.get( res = client.get(
@@ -73,8 +73,4 @@ def test_share_watch(client, live_server):
res = client.get( res = client.get(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
) )
assert bytes(include_filters.encode('utf-8')) in res.data assert bytes(css_filter.encode('utf-8')) in res.data
# Check it saved the URL
res = client.get(url_for("index"))
assert bytes(test_url.encode('utf-8')) in res.data

View File

@@ -57,9 +57,10 @@ def test_check_basic_change_detection_functionality_source(client, live_server):
# `subtractive_selectors` should still work in `source:` type requests
def test_check_ignore_elements(client, live_server): def test_check_ignore_elements(client, live_server):
set_original_response() set_original_response()
time.sleep(2) time.sleep(2)
test_url = 'source:'+url_for('test_endpoint', _external=True) test_url = 'source:'+url_for('test_endpoint', _external=True)
# Add our URL to the import page # Add our URL to the import page
@@ -76,9 +77,9 @@ def test_check_ignore_elements(client, live_server):
##################### #####################
# We want <span> and <p> ONLY, but ignore span with .foobar-detection # We want <span> and <p> ONLY, but ignore span with .foobar-detection
client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"include_filters": 'span,p', "url": test_url, "tag": "", "subtractive_selectors": ".foobar-detection", 'fetch_backend': "html_requests"}, data={"css_filter": 'span,p', "url": test_url, "tag": "", "subtractive_selectors": ".foobar-detection", 'fetch_backend': "html_requests"},
follow_redirects=True follow_redirects=True
) )
@@ -88,6 +89,7 @@ def test_check_ignore_elements(client, live_server):
url_for("preview_page", uuid="first"), url_for("preview_page", uuid="first"),
follow_redirects=True follow_redirects=True
) )
assert b'foobar-detection' not in res.data assert b'foobar-detection' not in res.data
assert b'&lt;br' not in res.data assert b'&lt;br' not in res.data
assert b'&lt;p' in res.data assert b'&lt;p' in res.data

View File

@@ -49,7 +49,7 @@ def test_trigger_regex_functionality_with_filter(client, live_server):
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"trigger_text": "/cool.stuff/", data={"trigger_text": "/cool.stuff/",
"url": test_url, "url": test_url,
"include_filters": '#in-here', "css_filter": '#in-here',
"fetch_backend": "html_requests"}, "fetch_backend": "html_requests"},
follow_redirects=True follow_redirects=True
) )

View File

@@ -22,7 +22,7 @@ def test_check_watch_field_storage(client, live_server):
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={ "notification_urls": "json://127.0.0.1:30000\r\njson://128.0.0.1\r\n", data={ "notification_urls": "json://127.0.0.1:30000\r\njson://128.0.0.1\r\n",
"time_between_check-minutes": 126, "time_between_check-minutes": 126,
"include_filters" : ".fooclass", "css_filter" : ".fooclass",
"title" : "My title", "title" : "My title",
"ignore_text" : "ignore this", "ignore_text" : "ignore this",
"url": test_url, "url": test_url,

View File

@@ -89,7 +89,7 @@ def test_check_xpath_filter_utf8(client, live_server):
time.sleep(1) time.sleep(1)
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"include_filters": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
follow_redirects=True follow_redirects=True
) )
assert b"Updated watch." in res.data assert b"Updated watch." in res.data
@@ -143,7 +143,7 @@ def test_check_xpath_text_function_utf8(client, live_server):
time.sleep(1) time.sleep(1)
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"include_filters": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
follow_redirects=True follow_redirects=True
) )
assert b"Updated watch." in res.data assert b"Updated watch." in res.data
@@ -182,6 +182,9 @@ def test_check_markup_xpath_filter_restriction(client, live_server):
) )
assert b"1 Imported" in res.data assert b"1 Imported" in res.data
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up # Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread) time.sleep(sleep_time_for_fetch_thread)
@@ -189,7 +192,7 @@ def test_check_markup_xpath_filter_restriction(client, live_server):
# Add our URL to the import page # Add our URL to the import page
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"include_filters": xpath_filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, data={"css_filter": xpath_filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
follow_redirects=True follow_redirects=True
) )
assert b"Updated watch." in res.data assert b"Updated watch." in res.data
@@ -227,11 +230,10 @@ def test_xpath_validation(client, live_server):
follow_redirects=True follow_redirects=True
) )
assert b"1 Imported" in res.data assert b"1 Imported" in res.data
time.sleep(2)
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"include_filters": "/something horrible", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, data={"css_filter": "/something horrible", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
follow_redirects=True follow_redirects=True
) )
assert b"is not a valid XPath expression" in res.data assert b"is not a valid XPath expression" in res.data
@@ -240,7 +242,7 @@ def test_xpath_validation(client, live_server):
# actually only really used by the distll.io importer, but could be handy too # actually only really used by the distll.io importer, but could be handy too
def test_check_with_prefix_include_filters(client, live_server): def test_check_with_prefix_css_filter(client, live_server):
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data assert b'Deleted' in res.data
@@ -261,7 +263,7 @@ def test_check_with_prefix_include_filters(client, live_server):
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"include_filters": "xpath://*[contains(@class, 'sametext')]", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, data={"css_filter": "xpath://*[contains(@class, 'sametext')]", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
follow_redirects=True follow_redirects=True
) )
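
The "is not a valid XPath expression" validation can be reproduced with lxml alone, which is why an input like "/something horrible" fails before any fetch happens (sketch):

    from lxml import etree

    try:
        etree.XPath('/something horrible')  # compiled eagerly by lxml
    except etree.XPathSyntaxError as e:
        print('is not a valid XPath expression:', e)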

View File

@@ -86,7 +86,6 @@ def extract_UUID_from_client(client):
def wait_for_all_checks(client): def wait_for_all_checks(client):
# Loop waiting until done.. # Loop waiting until done..
attempt=0 attempt=0
time.sleep(0.1)
while attempt < 60: while attempt < 60:
time.sleep(1) time.sleep(1)
res = client.get(url_for("index")) res = client.get(url_for("index"))
@@ -160,10 +159,5 @@ def live_server_setup(live_server):
ret = " ".join([auth.username, auth.password, auth.type]) ret = " ".join([auth.username, auth.password, auth.type])
return ret return ret
# Just return some GET var
@live_server.app.route('/test-return-query', methods=['GET'])
def test_return_query():
return request.query_string
live_server.start() live_server.start()
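
wait_for_all_checks() replaces many of the fixed time.sleep() pauses in the tests; it polls the index until the check queue drains rather than guessing a duration. Its shape, reconstructed from the hunk above (a sketch, assuming the index page shows a 'Checking now' marker while a watch is being fetched):

    import time
    from flask import url_for

    def wait_for_all_checks(client):
        # Poll for up to ~60 seconds instead of sleeping a fixed interval
        attempt = 0
        while attempt < 60:
            time.sleep(1)
            res = client.get(url_for("index"))
            if b'Checking now' not in res.data:  # assumed busy marker
                break
            attempt += 1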

View File

@@ -13,9 +13,9 @@ def test_visual_selector_content_ready(client, live_server):
live_server_setup(live_server) live_server_setup(live_server)
time.sleep(1) time.sleep(1)
# Add our URL to the import page, because the docker container (playwright/selenium) won't be able to connect to our usual test URL # Add our URL to the import page, maybe better to use something we control?
test_url = "https://changedetection.io/ci-test/test-runjs.html" # We use an external URL because the docker container is too difficult to set up to connect back to the pytest socket
test_url = 'https://news.ycombinator.com'
res = client.post( res = client.post(
url_for("form_quick_watch_add"), url_for("form_quick_watch_add"),
data={"url": test_url, "tag": '', 'edit_and_watch_submit_button': 'Edit > Watch'}, data={"url": test_url, "tag": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
@@ -25,27 +25,13 @@ def test_visual_selector_content_ready(client, live_server):
res = client.post( res = client.post(
url_for("edit_page", uuid="first", unpause_on_save=1), url_for("edit_page", uuid="first", unpause_on_save=1),
data={ data={"css_filter": ".does-not-exist", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_webdriver"},
"url": test_url,
"tag": "",
"headers": "",
'fetch_backend': "html_webdriver",
'webdriver_js_execute_code': 'document.querySelector("button[name=test-button]").click();'
},
follow_redirects=True follow_redirects=True
) )
assert b"unpaused" in res.data assert b"unpaused" in res.data
time.sleep(1) time.sleep(1)
wait_for_all_checks(client) wait_for_all_checks(client)
uuid = extract_UUID_from_client(client) uuid = extract_UUID_from_client(client)
# Check the JS execute code before extract worked
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
assert b'I smell JavaScript' in res.data
assert os.path.isfile(os.path.join('test-datastore', uuid, 'last-screenshot.png')), "last-screenshot.png should exist" assert os.path.isfile(os.path.join('test-datastore', uuid, 'last-screenshot.png')), "last-screenshot.png should exist"
assert os.path.isfile(os.path.join('test-datastore', uuid, 'elements.json')), "xpath elements.json data should exist" assert os.path.isfile(os.path.join('test-datastore', uuid, 'elements.json')), "xpath elements.json data should exist"

View File

@@ -4,7 +4,7 @@ import queue
import time import time
from changedetectionio import content_fetcher from changedetectionio import content_fetcher
from changedetectionio.fetch_site_status import FilterNotFoundInResponse from changedetectionio.html_tools import FilterNotFoundInResponse
# A single update worker # A single update worker
# #
@@ -91,8 +91,8 @@ class update_worker(threading.Thread):
return return
n_object = {'notification_title': 'Changedetection.io - Alert - CSS/xPath filter was not present in the page', n_object = {'notification_title': 'Changedetection.io - Alert - CSS/xPath filter was not present in the page',
'notification_body': "Your configured CSS/xPath filters of '{}' for {{watch_url}} did not appear on the page after {} attempts, did the page change layout?\n\nLink: {{base_url}}/edit/{{watch_uuid}}\n\nThanks - Your omniscient changedetection.io installation :)\n".format( 'notification_body': "Your configured CSS/xPath filter of '{}' for {{watch_url}} did not appear on the page after {} attempts, did the page change layout?\n\nLink: {{base_url}}/edit/{{watch_uuid}}\n\nThanks - Your omniscient changedetection.io installation :)\n".format(
", ".join(watch['include_filters']), watch['css_filter'],
threshold), threshold),
'notification_format': 'text'} 'notification_format': 'text'}
@@ -189,7 +189,7 @@ class update_worker(threading.Thread):
if not self.datastore.data['watching'].get(uuid): if not self.datastore.data['watching'].get(uuid):
continue continue
err_text = "Warning, no filters were found, no change detection ran." err_text = "Warning, filter '{}' not found".format(str(e))
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
# So that we get a trigger when the content is added again # So that we get a trigger when the content is added again
'previous_md5': ''}) 'previous_md5': ''})
@@ -282,19 +282,16 @@ class update_worker(threading.Thread):
self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e)) self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)}) self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
if self.datastore.data['watching'].get(uuid):
# Always record that we at least tried
count = self.datastore.data['watching'][uuid].get('check_count', 0) + 1
self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
'last_checked': round(time.time()),
'check_count': count
})
# Always save the screenshot if it's available # Always record that we at least tried
if update_handler.screenshot: self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=update_handler.screenshot) 'last_checked': round(time.time())})
if update_handler.xpath_data:
self.datastore.save_xpath_data(watch_uuid=uuid, data=update_handler.xpath_data) # Always save the screenshot if it's available
if update_handler.screenshot:
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=update_handler.screenshot)
if update_handler.xpath_data:
self.datastore.save_xpath_data(watch_uuid=uuid, data=update_handler.xpath_data)
self.current_uuid = None # Done self.current_uuid = None # Done
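
The reordered error path above wraps the bookkeeping in "if self.datastore.data['watching'].get(uuid)", so a watch deleted mid-check no longer raises inside the worker thread, and it introduces a per-watch check_count. The guarded update as a standalone sketch (record_check_attempt is an illustrative name; datastore.update_watch is the store API used throughout this diff):

    import time

    def record_check_attempt(datastore, uuid, started_at):
        # Skip silently if the watch was deleted while its check was running
        watch = datastore.data['watching'].get(uuid)
        if not watch:
            return
        datastore.update_watch(uuid=uuid, update_obj={
            'fetch_time': round(time.time() - started_at, 3),
            'last_checked': round(time.time()),
            'check_count': watch.get('check_count', 0) + 1,
        })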

View File

@@ -45,9 +45,6 @@ services:
# Respect proxy_pass type settings, `proxy_set_header Host "localhost";` and `proxy_set_header X-Forwarded-Prefix /app;` # Respect proxy_pass type settings, `proxy_set_header Host "localhost";` and `proxy_set_header X-Forwarded-Prefix /app;`
# More here https://github.com/dgtlmoon/changedetection.io/wiki/Running-changedetection.io-behind-a-reverse-proxy-sub-directory # More here https://github.com/dgtlmoon/changedetection.io/wiki/Running-changedetection.io-behind-a-reverse-proxy-sub-directory
# - USE_X_SETTINGS=1 # - USE_X_SETTINGS=1
#
# Hides the `Referer` header so that monitored websites can't see the changedetection.io hostname.
# - HIDE_REFERER=true
# Comment out ports: when using behind a reverse proxy , enable networks: etc. # Comment out ports: when using behind a reverse proxy , enable networks: etc.
ports: ports:

View File

@@ -1,36 +1,35 @@
flask~=2.0 flask~= 2.0
flask_wtf flask_wtf
eventlet>=0.31.0 eventlet>=0.31.0
validators validators
timeago~=1.0 timeago ~=1.0
inscriptis~=2.2 inscriptis ~= 2.2
feedgen~=0.9 feedgen ~= 0.9
flask-login~=0.5 flask-login ~= 0.5
flask_restful flask_restful
pytz pytz
# Set these versions together to avoid a RequestsDependencyWarning # Set these versions together to avoid a RequestsDependencyWarning
# >= 2.26 also adds Brotli support if brotli is installed # >= 2.26 also adds Brotli support if brotli is installed
brotli~=1.0 brotli ~= 1.0
requests[socks] ~=2.28 requests[socks] ~= 2.28
urllib3>1.26 urllib3 > 1.26
chardet>2.3.0 chardet > 2.3.0
wtforms~=3.0 wtforms ~= 3.0
jsonpath-ng~=1.5.3 jsonpath-ng ~= 1.5.3
jq ~= 1.3.0
# jq not available on Windows so must be installed manually
# Notification library # Notification library
apprise~=1.1.0 apprise ~= 1.1.0
# apprise mqtt https://github.com/dgtlmoon/changedetection.io/issues/315 # apprise mqtt https://github.com/dgtlmoon/changedetection.io/issues/315
paho-mqtt paho-mqtt
# Pinned version of cryptography otherwise # Pinned version of cryptography otherwise
# ERROR: Could not build wheels for cryptography which use PEP 517 and cannot be installed directly # ERROR: Could not build wheels for cryptography which use PEP 517 and cannot be installed directly
cryptography~=3.4 cryptography ~= 3.4
# Used for CSS filtering # Used for CSS filtering
bs4 bs4
@@ -39,20 +38,11 @@ bs4
lxml lxml
# 3.141 was missing socksVersion, 3.150 was not in pypi, so we try 4.1.0 # 3.141 was missing socksVersion, 3.150 was not in pypi, so we try 4.1.0
selenium~=4.1.0 selenium ~= 4.1.0
# https://stackoverflow.com/questions/71652965/importerror-cannot-import-name-safe-str-cmp-from-werkzeug-security/71653849#71653849 # https://stackoverflow.com/questions/71652965/importerror-cannot-import-name-safe-str-cmp-from-werkzeug-security/71653849#71653849
# ImportError: cannot import name 'safe_str_cmp' from 'werkzeug.security' # ImportError: cannot import name 'safe_str_cmp' from 'werkzeug.security'
# need to revisit flask login versions # need to revisit flask login versions
werkzeug~=2.0.0 werkzeug ~= 2.0.0
# Templating, so far just in the URLs but in the future can be for the notifications also
jinja2~=3.1
jinja2-time
# https://peps.python.org/pep-0508/#environment-markers
# https://github.com/dgtlmoon/changedetection.io/pull/1009
jq~=1.3 ;python_version >= "3.8" and sys_platform == "linux"
# playwright is installed at Dockerfile build time because it's not available on all platforms # playwright is installed at Dockerfile build time because it's not available on all platforms
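
The jq pin now carries a PEP 508 environment marker, so pip resolves it only on Linux with Python 3.8+, which sidesteps the manual-install caveat in the removed "jq not available on Windows" note. Markers like this one can be checked programmatically with the packaging library (sketch):

    from packaging.markers import Marker

    marker = Marker('python_version >= "3.8" and sys_platform == "linux"')
    # True on a Linux interpreter running Python 3.8 or newer
    print(marker.evaluate())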