Compare commits

..

32 Commits

Author SHA1 Message Date
dgtlmoon
acb3fa0841 Merge branch 'master' into add-system-info-api 2022-10-23 18:24:12 +02:00
dgtlmoon
614431ff07 Basic system info/system state API 2022-10-23 18:21:36 +02:00
dgtlmoon
492bbce6b6 Build - Fix syntax in container build test (#1050) 2022-10-23 16:02:13 +02:00
dgtlmoon
0394a56be5 Building - Test container build on PR 2022-10-23 15:54:19 +02:00
Entepotenz
7839551d6b Testing - Use same version of playwright while running tests as in production builds (#1047) 2022-10-23 11:26:32 +02:00
Entepotenz
9c5588c791 update path for validation in the CONTRIBUTING.md (#1046) 2022-10-23 11:25:29 +02:00
dgtlmoon
5a43a350de History index safety check - Be sure that only valid history index lines are read (#1042) 2022-10-19 22:41:13 +02:00
Michael McMillan
3c31f023ce Option to Hide the Referer header from monitored websites. (#996) 2022-10-18 09:16:22 +02:00
dgtlmoon
4cbcc59461 0.39.20.4 2022-10-17 18:36:47 +02:00
dgtlmoon
4be0260381 Better cross platform file handling in diff and preview (#1034) 2022-10-17 18:36:22 +02:00
dgtlmoon
957a3c1c16 0.39.20.3 2022-10-17 17:43:35 +02:00
dgtlmoon
85897e0bf9 Windows - diff file handling improvements (#1031) 2022-10-17 17:40:28 +02:00
dgtlmoon
63095f70ea Also include tests in pip build 2022-10-17 17:13:15 +02:00
dgtlmoon
8d5b0b5576 Update README.md 2022-10-12 10:51:39 +02:00
dgtlmoon
1b077abd93 0.39.20.2 2022-10-12 09:53:59 +02:00
dgtlmoon
32ea1a8721 Windows - JQ - Make library optional so it doesnt break Windows pip installs (#1009) 2022-10-12 09:53:16 +02:00
dgtlmoon
fff32cef0d Adding test - Test the 'execute JS before changedetection' (#1006) 2022-10-11 14:40:36 +02:00
dgtlmoon
8fb146f3e4 0.39.20.1 2022-10-09 23:05:35 +02:00
dgtlmoon
770b0faa45 Code - check containers build when Dockerfile or requirements.txt changes (#1005) 2022-10-09 22:58:01 +02:00
dgtlmoon
f6faa90340 Adding make to Dockerfile build as required by jq for ARM devices 2022-10-09 22:29:18 +02:00
dgtlmoon
669fd3ae0b Dont use default Requests user-agent and accept headers in playwright+selenium requests, breaks sites such as united.com. (#1004) 2022-10-09 18:25:36 +02:00
dgtlmoon
17d37fb626 0.39.20 2022-10-09 16:13:32 +02:00
Yusef Ouda
dfa7fc3a81 Adds support for jq JSON path querying engine (#1001) 2022-10-09 16:12:45 +02:00
dgtlmoon
cd467df97a Adding link to BrightData Proxy info (#1003) 2022-10-09 15:51:57 +02:00
dgtlmoon
71bc2fed82 Remove quotationspage default watch 2022-10-09 14:06:07 +02:00
Hmmbob
738fcfe01c Notification library: Bump apprise to 1.1.0 (signal, opsgenie, pagerduty, bark and mailto fixes, adds support for BulkSMS and SMSEagle) (#1002) 2022-10-09 11:42:51 +02:00
dgtlmoon
3ebb2ab9ba Selenium fetcher - screenshot should be taken after 'wait' time, not before #873 2022-09-25 11:05:07 +02:00
dgtlmoon
ac98bc9144 Upgrade Playwright to 1.26 2022-09-24 23:51:26 +02:00
dgtlmoon
3705ce6681 Render Extract Configurable Delay Seconds should also apply after executing any JS #958 2022-09-24 23:48:03 +02:00
dgtlmoon
f7ea99412f Re #958 - remove change screensize, should be in 1280x720 default, was causing "Unable to retrieve content because the page is navigating and changing the content." on some sites 2022-09-19 14:02:32 +02:00
dgtlmoon
d4715e2bc8 Tidy up proxies.json logic, adding tests (#955) 2022-09-19 13:14:35 +02:00
dgtlmoon
8567a83c47 Update README.md - Include BrightData suggestion 2022-09-16 13:21:01 +02:00
26 changed files with 371 additions and 94 deletions

View File

@@ -0,0 +1,55 @@
name: ChangeDetection.io Container Build Test

# Triggers the workflow on push or pull request events, but only when a file
# that feeds the container build (requirements.txt, Dockerfile) is touched.
# This line doesn't work, even though it is the documented one
# on: [push, pull_request]
on:
  push:
    paths:
      - requirements.txt
      - Dockerfile

  pull_request:
    paths:
      - requirements.txt
      - Dockerfile

# Changes to requirements.txt packages and Dockerfile may or may not always be
# compatible with arm etc, so worth testing
jobs:
  test-container-build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2

      - name: Set up Python 3.9
        uses: actions/setup-python@v2
        with:
          # Quoted so the version is read as a string, never as a float
          python-version: '3.9'

      # Just test that the build works, some libraries won't compile on ARM/rPi etc
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v1
        with:
          image: tonistiigi/binfmt:latest
          platforms: all

      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v1
        with:
          install: true
          version: latest
          driver-opts: image=moby/buildkit:master

      - name: Test that the docker containers can build
        id: docker_build
        uses: docker/build-push-action@v2
        # https://github.com/docker/build-push-action#customizing
        with:
          context: ./
          file: ./Dockerfile
          # Comma-separated target list, no trailing separator
          platforms: linux/arm/v7,linux/arm/v6,linux/amd64,linux/arm64
          cache-from: type=local,src=/tmp/.buildx-cache
          cache-to: type=local,dest=/tmp/.buildx-cache

View File

@@ -1,28 +1,25 @@
name: ChangeDetection.io Test
name: ChangeDetection.io App Test
# Triggers the workflow on push or pull request events
on: [push, pull_request]
jobs:
test-build:
test-application:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.9
uses: actions/setup-python@v2
with:
python-version: 3.9
- name: Show env vars
run: set
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
@@ -39,7 +36,4 @@ jobs:
# Each test is totally isolated and performs its own cleanup/reset
cd changedetectionio; ./run_all_tests.sh
# https://github.com/docker/build-push-action/blob/master/docs/advanced/test-before-push.md ?
# https://github.com/docker/buildx/issues/59 ? Needs to be one platform?
# https://github.com/docker/buildx/issues/495#issuecomment-918925854

View File

@@ -6,7 +6,7 @@ Otherwise, it's always best to PR into the `dev` branch.
Please be sure that all new functionality has a matching test!
Use `pytest` to validate/test, you can run the existing tests as `pytest tests/test_notifications.py` for example
Use `pytest` to validate/test, you can run the existing tests as `pytest tests/test_notification.py` for example
```
pip3 install -r requirements-dev

View File

@@ -5,13 +5,14 @@ FROM python:3.8-slim as builder
ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1
RUN apt-get update && apt-get install -y --no-install-recommends \
libssl-dev \
libffi-dev \
g++ \
gcc \
libc-dev \
libffi-dev \
libssl-dev \
libxslt-dev \
zlib1g-dev \
g++
make \
zlib1g-dev
RUN mkdir /install
WORKDIR /install
@@ -22,9 +23,14 @@ RUN pip install --target=/dependencies -r /requirements.txt
# Playwright is an alternative to Selenium
# Excluded this package from requirements.txt to prevent arm/v6 and arm/v7 builds from failing
RUN pip install --target=/dependencies playwright~=1.25 \
RUN pip install --target=/dependencies playwright~=1.26 \
|| echo "WARN: Failed to install Playwright. The application can still run, but the Playwright option will be disabled."
RUN pip install --target=/dependencies jq~=1.3 \
|| echo "WARN: Failed to install JQ. The application can still run, but the Jq: filter option will be disabled."
# Final image stage
FROM python:3.8-slim
@@ -58,6 +64,7 @@ EXPOSE 5000
# The actual flask app
COPY changedetectionio /app/changedetectionio
# The eventlet server wrapper
COPY changedetection.py /app/changedetection.py

View File

@@ -2,6 +2,7 @@ recursive-include changedetectionio/api *
recursive-include changedetectionio/templates *
recursive-include changedetectionio/static *
recursive-include changedetectionio/model *
recursive-include changedetectionio/tests *
include changedetection.py
global-exclude *.pyc
global-exclude node_modules

View File

@@ -33,7 +33,7 @@ _Need an actual Chrome runner with Javascript support? We support fetching via W
#### Key Features
- Lots of trigger filters, such as "Trigger on text", "Remove text by selector", "Ignore text", "Extract text", also using regular-expressions!
- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JsonPath rules
- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JSONPath or jq
- Switch between fast non-JS and Chrome JS based "fetchers"
- Easily specify how often a site should be checked
- Execute JS before extracting text (Good for logging in, see examples in the UI!)

View File

@@ -47,13 +47,15 @@ _Need an actual Chrome runner with Javascript support? We support fetching via W
#### Key Features
- Lots of trigger filters, such as "Trigger on text", "Remove text by selector", "Ignore text", "Extract text", also using regular-expressions!
- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JsonPath rules
- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JSONPath or jq
- Switch between fast non-JS and Chrome JS based "fetchers"
- Easily specify how often a site should be checked
- Execute JS before extracting text (Good for logging in, see examples in the UI!)
- Override Request Headers, Specify `POST` or `GET` and other methods
- Use the "Visual Selector" to help target specific elements
- Configurable [proxy per watch](https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration)
We [recommend and use Bright Data](https://brightdata.grsm.io/n0r16zf7eivq) global proxy services, Bright Data will match any first deposit up to $100 using our signup link.
## Screenshots
@@ -119,8 +121,8 @@ See the wiki for more information https://github.com/dgtlmoon/changedetection.io
## Filters
XPath, JSONPath and CSS support comes baked in! You can be as specific as you need, use XPath exported from various XPath element query creation tools.
XPath, JSONPath, jq, and CSS support comes baked in! You can be as specific as you need, use XPath exported from various XPath element query creation tools.
(We support LXML `re:test`, `re:match` and `re:replace`.)
## Notifications
@@ -149,7 +151,7 @@ Now you can also customise your notification content!
## JSON API Monitoring
Detect changes and monitor data in JSON API's by using the built-in JSONPath selectors as a filter / selector.
Detect changes and monitor data in JSON APIs by using either JSONPath or jq to filter, parse, and restructure JSON as needed.
![image](https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/json-filter-field-example.png)
@@ -157,9 +159,20 @@ This will re-parse the JSON and apply formatting to the text, making it super ea
![image](https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/json-diff-example.png)
### JSONPath or jq?
For more complex parsing, filtering, and modifying of JSON data, jq is recommended due to the built-in operators and functions. Refer to the [documentation](https://stedolan.github.io/jq/manual/) for more specific information on jq.
One big advantage of `jq` is that you can use logic in your JSON filter, such as filters to only show items that have a value greater than/less than etc.
See the wiki https://github.com/dgtlmoon/changedetection.io/wiki/JSON-Selector-Filter-help for more information and examples
Note: `jq` library must be added separately (`pip3 install jq`)
### Parse JSON embedded in HTML!
When you enable a `json:` filter, you can even automatically extract and parse embedded JSON inside a HTML page! Amazingly handy for sites that build content based on JSON, such as many e-commerce websites.
When you enable a `json:` or `jq:` filter, you can even automatically extract and parse embedded JSON inside a HTML page! Amazingly handy for sites that build content based on JSON, such as many e-commerce websites.
```
<html>
@@ -169,7 +182,7 @@ When you enable a `json:` filter, you can even automatically extract and parse e
</script>
```
`json:$.price` would give `23.50`, or you can extract the whole structure
`json:$.price` or `jq:.price` would give `23.50`, or you can extract the whole structure
## Proxy configuration

View File

@@ -33,7 +33,7 @@ from flask_wtf import CSRFProtect
from changedetectionio import html_tools
from changedetectionio.api import api_v1
__version__ = '0.39.19.1'
__version__ = '0.39.20.4'
datastore = None
@@ -194,6 +194,9 @@ def changedetection_app(config=None, datastore_o=None):
watch_api.add_resource(api_v1.Watch, '/api/v1/watch/<string:uuid>',
resource_class_kwargs={'datastore': datastore, 'update_q': update_q})
watch_api.add_resource(api_v1.SystemInfo, '/api/v1/systeminfo',
resource_class_kwargs={'datastore': datastore, 'update_q': update_q})
@@ -636,20 +639,27 @@ def changedetection_app(config=None, datastore_o=None):
# Only works reliably with Playwright
visualselector_enabled = os.getenv('PLAYWRIGHT_DRIVER_URL', False) and default['fetch_backend'] == 'html_webdriver'
# JQ is difficult to install on windows and must be manually added (outside requirements.txt)
jq_support = True
try:
import jq
except ModuleNotFoundError:
jq_support = False
output = render_template("edit.html",
uuid=uuid,
watch=datastore.data['watching'][uuid],
form=form,
has_empty_checktime=using_default_check_time,
has_default_notification_urls=True if len(datastore.data['settings']['application']['notification_urls']) else False,
using_global_webdriver_wait=default['webdriver_delay'] is None,
current_base_url=datastore.data['settings']['application']['base_url'],
emailprefix=os.getenv('NOTIFICATION_MAIL_BUTTON_PREFIX', False),
form=form,
has_default_notification_urls=True if len(datastore.data['settings']['application']['notification_urls']) else False,
has_empty_checktime=using_default_check_time,
jq_support=jq_support,
playwright_enabled=os.getenv('PLAYWRIGHT_DRIVER_URL', False),
settings_application=datastore.data['settings']['application'],
using_global_webdriver_wait=default['webdriver_delay'] is None,
uuid=uuid,
visualselector_data_is_ready=visualselector_data_is_ready,
visualselector_enabled=visualselector_enabled,
playwright_enabled=os.getenv('PLAYWRIGHT_DRIVER_URL', False)
watch=datastore.data['watching'][uuid],
)
return output
@@ -809,8 +819,10 @@ def changedetection_app(config=None, datastore_o=None):
newest_file = history[dates[-1]]
# Read as binary and force decode as UTF-8
# Windows may fail decode in python if we just use 'r' mode (chardet decode exception)
try:
with open(newest_file, 'r') as f:
with open(newest_file, 'r', encoding='utf-8', errors='ignore') as f:
newest_version_file_contents = f.read()
except Exception as e:
newest_version_file_contents = "Unable to read {}.\n".format(newest_file)
@@ -823,7 +835,7 @@ def changedetection_app(config=None, datastore_o=None):
previous_file = history[dates[-2]]
try:
with open(previous_file, 'r') as f:
with open(previous_file, 'r', encoding='utf-8', errors='ignore') as f:
previous_version_file_contents = f.read()
except Exception as e:
previous_version_file_contents = "Unable to read {}.\n".format(previous_file)
@@ -900,7 +912,7 @@ def changedetection_app(config=None, datastore_o=None):
timestamp = list(watch.history.keys())[-1]
filename = watch.history[timestamp]
try:
with open(filename, 'r') as f:
with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
tmp = f.readlines()
# Get what needs to be highlighted

View File

@@ -122,3 +122,33 @@ class CreateWatch(Resource):
return {'status': "OK"}, 200
return list, 200
class SystemInfo(Resource):
    """Read-only API resource reporting basic system state.

    The GET handler returns the update-queue size, the UUIDs of watches that
    look overdue for a check, the uptime in seconds and the number of watches.
    """

    def __init__(self, **kwargs):
        # datastore is a black box dependency
        self.datastore = kwargs['datastore']
        self.update_q = kwargs['update_q']

    @auth.check_token
    def get(self):
        """Return a ``(payload, 200)`` tuple describing the current system state."""
        import time

        # Read the clock once so every watch is compared against the same instant
        now = time.time()
        watching = self.datastore.data.get('watching', {})

        # Check all watches and report which have not been checked but should have been
        overdue_watches = []
        for uuid, watch in watching.items():
            # see if now - last_checked is greater than the time that should have been
            # this is not super accurate (maybe they just edited it) but better than nothing
            t = watch.threshold_seconds()
            if not t:
                # Nothing usable on the watch itself, fall back to the datastore-wide value
                t = self.datastore.threshold_seconds

            # A watch without a recorded check yields None here; count it as
            # "checked at time 0" instead of failing on `float - None`
            last_checked = watch.get('last_checked') or 0
            if now - last_checked > t:
                overdue_watches.append(uuid)

        return {
            'queue_size': self.update_q.qsize(),
            'overdue_watches': overdue_watches,
            'uptime': round(now - self.datastore.start_time, 2),
            'watch_count': len(watching)
        }, 200

View File

@@ -102,6 +102,14 @@ def main():
has_password=datastore.data['settings']['application']['password'] != False
)
# Monitored websites will not receive a Referer header
# when a user clicks on an outgoing link.
@app.after_request
def hide_referrer(response):
    """Add ``Referrer-Policy: no-referrer`` to the response when HIDE_REFERER is enabled.

    Environment values are always strings, so the usual "off" spellings are
    treated as disabled explicitly; any other non-empty value enables the header.
    """
    setting = os.getenv("HIDE_REFERER", "").strip().lower()
    if setting not in ("", "0", "false", "no", "off", "n", "f"):
        response.headers["Referrer-Policy"] = "no-referrer"

    return response
# Proxy sub-directory support
# Set environment var USE_X_SETTINGS=1 on this script
# And then in your proxy_pass settings

View File

@@ -316,6 +316,7 @@ class base_html_playwright(Fetcher):
import playwright._impl._api_types
from playwright._impl._api_types import Error, TimeoutError
response = None
with sync_playwright() as p:
browser_type = getattr(p, self.browser_type)
@@ -373,8 +374,11 @@ class base_html_playwright(Fetcher):
print("response object was none")
raise EmptyReply(url=url, status_code=None)
# Bug 2(?) Set the viewport size AFTER loading the page
page.set_viewport_size({"width": 1280, "height": 1024})
# Removed browser-set-size, seemed to be needed to make screenshots work reliably in older playwright versions
# Was causing exceptions like 'waiting for page but content is changing' etc
# https://www.browserstack.com/docs/automate/playwright/change-browser-window-size 1280x720 should be the default
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
time.sleep(extra_wait)
@@ -398,6 +402,13 @@ class base_html_playwright(Fetcher):
raise JSActionExceptions(status_code=response.status, screenshot=error_screenshot, message=str(e), url=url)
else:
# JS eval was run, now we also wait some time if possible to let the page settle
if self.render_extract_delay:
page.wait_for_timeout(self.render_extract_delay * 1000)
page.wait_for_timeout(500)
self.content = page.content()
self.status_code = response.status
self.headers = response.all_headers()
@@ -514,8 +525,6 @@ class base_html_webdriver(Fetcher):
# Selenium doesn't automatically wait for actions as good as Playwright, so wait again
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
self.screenshot = self.driver.get_screenshot_as_png()
# @todo - how to check this? is it possible?
self.status_code = 200
# @todo somehow we should try to get this working for WebDriver
@@ -526,6 +535,8 @@ class base_html_webdriver(Fetcher):
self.content = self.driver.page_source
self.headers = {}
self.screenshot = self.driver.get_screenshot_as_png()
# Does the connection to the webdriver work? run a test connection.
def is_ready(self):
from selenium import webdriver
@@ -564,6 +575,11 @@ class html_requests(Fetcher):
ignore_status_codes=False,
current_css_filter=None):
# Make requests use a more modern looking user-agent
if not 'User-Agent' in request_headers:
request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT",
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36')
proxies = {}
# Allows override the proxy on a per-request basis

View File

@@ -141,8 +141,9 @@ class perform_site_check():
has_filter_rule = True
if has_filter_rule:
if 'json:' in css_filter_rule:
stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
json_filter_prefixes = ['json:', 'jq:']
if any(prefix in css_filter_rule for prefix in json_filter_prefixes):
stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, json_filter=css_filter_rule)
is_html = False
if is_html or is_source:

View File

@@ -303,6 +303,25 @@ class ValidateCSSJSONXPATHInput(object):
# Re #265 - maybe in the future fetch the page and offer a
# warning/notice that its possible the rule doesnt yet match anything?
if not self.allow_json:
raise ValidationError("jq not permitted in this field!")
if 'jq:' in line:
try:
import jq
except ModuleNotFoundError:
# `jq` requires full compilation in windows and so isn't generally available
raise ValidationError("jq not support not found")
input = line.replace('jq:', '')
try:
jq.compile(input)
except (ValueError) as e:
message = field.gettext('\'%s\' is not a valid jq expression. (%s)')
raise ValidationError(message % (input, str(e)))
except:
raise ValidationError("A system-error occurred when validating your jq expression")
class quickWatchForm(Form):

View File

@@ -1,11 +1,11 @@
import json
from typing import List
from bs4 import BeautifulSoup
from jsonpath_ng.ext import parse
import re
from inscriptis import get_text
from inscriptis.model.config import ParserConfig
from jsonpath_ng.ext import parse
from typing import List
import json
import re
class FilterNotFoundInResponse(ValueError):
def __init__(self, msg):
@@ -79,19 +79,35 @@ def extract_element(find='title', html_content=''):
return element_text
#
def _parse_json(json_data, jsonpath_filter):
s=[]
jsonpath_expression = parse(jsonpath_filter.replace('json:', ''))
match = jsonpath_expression.find(json_data)
def _parse_json(json_data, json_filter):
if 'json:' in json_filter:
jsonpath_expression = parse(json_filter.replace('json:', ''))
match = jsonpath_expression.find(json_data)
return _get_stripped_text_from_json_match(match)
if 'jq:' in json_filter:
try:
import jq
except ModuleNotFoundError:
# `jq` requires full compilation in windows and so isn't generally available
raise Exception("jq not support not found")
jq_expression = jq.compile(json_filter.replace('jq:', ''))
match = jq_expression.input(json_data).all()
return _get_stripped_text_from_json_match(match)
def _get_stripped_text_from_json_match(match):
s = []
# More than one result, we will return it as a JSON list.
if len(match) > 1:
for i in match:
s.append(i.value)
s.append(i.value if hasattr(i, 'value') else i)
# Single value, use just the value, as it could be later used in a token in notifications.
if len(match) == 1:
s = match[0].value
s = match[0].value if hasattr(match[0], 'value') else match[0]
# Re #257 - Better handling where it does not exist, in the case the original 's' value was False..
if not match:
@@ -103,16 +119,16 @@ def _parse_json(json_data, jsonpath_filter):
return stripped_text_from_html
def extract_json_as_string(content, jsonpath_filter):
def extract_json_as_string(content, json_filter):
stripped_text_from_html = False
# Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
try:
stripped_text_from_html = _parse_json(json.loads(content), jsonpath_filter)
stripped_text_from_html = _parse_json(json.loads(content), json_filter)
except json.JSONDecodeError:
# Foreach <script json></script> blob.. just return the first that matches jsonpath_filter
# Foreach <script json></script> blob.. just return the first that matches json_filter
s = []
soup = BeautifulSoup(content, 'html.parser')
bs_result = soup.findAll('script')
@@ -131,7 +147,7 @@ def extract_json_as_string(content, jsonpath_filter):
# Just skip it
continue
else:
stripped_text_from_html = _parse_json(json_data, jsonpath_filter)
stripped_text_from_html = _parse_json(json_data, json_filter)
if stripped_text_from_html:
break

View File

@@ -13,10 +13,6 @@ class model(dict):
'watching': {},
'settings': {
'headers': {
'User-Agent': getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate', # No support for brolti in python requests yet.
'Accept-Language': 'en-GB,en-US;q=0.9,en;'
},
'requests': {
'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")), # Default 45 seconds

View File

@@ -118,7 +118,10 @@ class model(dict):
if os.path.isfile(fname):
logging.debug("Reading history index " + str(time.time()))
with open(fname, "r") as f:
tmp_history = dict(i.strip().split(',', 2) for i in f.readlines())
for i in f.readlines():
if ',' in i:
k, v = i.strip().split(',', 2)
tmp_history[k] = v
if len(tmp_history):
self.__newest_history_key = list(tmp_history.keys())[-1]
@@ -151,28 +154,30 @@ class model(dict):
import uuid
import logging
output_path = "{}/{}".format(self.__datastore_path, self['uuid'])
output_path = os.path.join(self.__datastore_path, self['uuid'])
self.ensure_data_dir_exists()
snapshot_fname = os.path.join(output_path, str(uuid.uuid4()))
snapshot_fname = "{}/{}.stripped.txt".format(output_path, uuid.uuid4())
logging.debug("Saving history text {}".format(snapshot_fname))
# in /diff/ and /preview/ we are going to assume for now that it's UTF-8 when reading
# most sites are utf-8 and some are even broken utf-8
with open(snapshot_fname, 'wb') as f:
f.write(contents)
f.close()
# Append to index
# @todo check last char was \n
index_fname = "{}/history.txt".format(output_path)
index_fname = os.path.join(output_path, "history.txt")
with open(index_fname, 'a') as f:
f.write("{},{}\n".format(timestamp, snapshot_fname))
f.close()
self.__newest_history_key = timestamp
self.__history_n+=1
self.__history_n += 1
#@todo bump static cache of the last timestamp so we dont need to examine the file to set a proper ''viewed'' status
# @todo bump static cache of the last timestamp so we dont need to examine the file to set a proper ''viewed'' status
return snapshot_fname
@property

View File

@@ -9,6 +9,8 @@
# exit when any command fails
set -e
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
find tests/test_*py -type f|while read test_name
do
echo "TEST RUNNING $test_name"
@@ -23,6 +25,13 @@ export BASE_URL="https://really-unique-domain.io"
pytest tests/test_notification.py
## JQ + JSON: filter test
# jq is not available on windows and we should just test it when the package is installed
# this will re-test with jq support
pip3 install jq~=1.3
pytest tests/test_jsonpath_jq_selector.py
# Now for the selenium and playwright/browserless fetchers
# Note - this is not UI functional tests - just checking that each one can fetch the content
@@ -38,7 +47,9 @@ docker kill $$-test_selenium
echo "TESTING WEBDRIVER FETCH > PLAYWRIGHT/BROWSERLESS..."
# Not all platforms support playwright (not ARM/rPI), so it's not packaged in requirements.txt
pip3 install playwright~=1.24
PLAYWRIGHT_VERSION=$(grep -i -E "RUN pip install.+" "$SCRIPT_DIR/../Dockerfile" | grep --only-matching -i -E "playwright[=><~+]+[0-9\.]+")
echo "using $PLAYWRIGHT_VERSION"
pip3 install "$PLAYWRIGHT_VERSION"
docker run -d --name $$-test_browserless -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm -p 3000:3000 --shm-size="2g" browserless/chrome:1.53-chrome-stable
# takes a while to spin up
sleep 5

View File

@@ -30,14 +30,14 @@ class ChangeDetectionStore:
def __init__(self, datastore_path="/datastore", include_default_watches=True, version_tag="0.0.0"):
# Should only be active for docker
# logging.basicConfig(filename='/dev/stdout', level=logging.INFO)
self.needs_write = False
self.__data = App.model()
self.datastore_path = datastore_path
self.json_store_path = "{}/url-watches.json".format(self.datastore_path)
self.needs_write = False
self.proxy_list = None
self.start_time = time.time()
self.stop_thread = False
self.__data = App.model()
# Base definition for all watchers
# deepcopy part of #569 - not sure why its needed exactly
self.generic_definition = deepcopy(Watch.model(datastore_path = datastore_path, default={}))
@@ -81,8 +81,6 @@ class ChangeDetectionStore:
except (FileNotFoundError, json.decoder.JSONDecodeError):
if include_default_watches:
print("Creating JSON store at", self.datastore_path)
self.add_watch(url='http://www.quotationspage.com/random.php', tag='test')
self.add_watch(url='https://news.ycombinator.com/', tag='Tech news')
self.add_watch(url='https://changedetection.io/CHANGELOG.txt', tag='changedetection.io')
@@ -577,3 +575,11 @@ class ChangeDetectionStore:
continue
return
# We incorrectly used common header overrides that should only apply to Requests
# These are now handled in content_fetcher::html_requests and shouldnt be passed to Playwright/Selenium
def update_7(self):
    """Drop the formerly hard-coded default headers from the global settings.

    Mutates ``self.data['settings']['headers']`` in place; every other header
    is left untouched.
    """
    headers = self.data['settings']['headers']

    # These were hard-coded in early versions
    for name in ('User-Agent', 'Accept', 'Accept-Encoding', 'Accept-Language'):
        # pop() with a default removes the key when present (even if its value
        # is empty) and is a no-op when it is absent
        headers.pop(name, None)

View File

@@ -77,6 +77,7 @@
<span class="pure-form-message-inline">
<p>Use the <strong>Basic</strong> method (default) where your watched site doesn't need Javascript to render.</p>
<p>The <strong>Chrome/Javascript</strong> method requires a network connection to a running WebDriver+Chrome server, set by the ENV var 'WEBDRIVER_URL'. </p>
Tip: <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration#brightdata-proxy-support">Connect using BrightData Proxies, find out more here.</a>
</span>
</div>
{% if form.proxy %}
@@ -183,8 +184,16 @@ User-Agent: wonderbra 1.0") }}
<span class="pure-form-message-inline">
<ul>
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
<li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required, <a
href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
<li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a> (if installed).
<ul>
<li>JSONPath: Prefix with <code>json:</code>, use <code>json:$</code> to force re-formatting if required, <a href="https://jsonpath.com/" target="new">test your JSONPath here</a>.</li>
{% if jq_support %}
<li>jq: Prefix with <code>jq:</code> and <a href="https://jqplay.org/" target="new">test your jq here</a>. Using <a href="https://stedolan.github.io/jq/" target="new">jq</a> allows for complex filtering and processing of JSON data with built-in functions, regex, filtering, and more. See examples and documentation <a href="https://stedolan.github.io/jq/manual/" target="new">here</a>.</li>
{% else %}
<li>jq support not installed</li>
{% endif %}
</ul>
</li>
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash,
<ul>
<li>Example: <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
@@ -193,7 +202,7 @@ User-Agent: wonderbra 1.0") }}
</ul>
</li>
</ul>
Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
Please be sure that you thoroughly understand how to write CSS, JSONPath, XPath{% if jq_support %}, or jq selector{%endif%} rules before filing an issue on GitHub! <a
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
</span>
</div>

View File

@@ -99,6 +99,8 @@
<p>Use the <strong>Basic</strong> method (default) where your watched sites don't need Javascript to render.</p>
<p>The <strong>Chrome/Javascript</strong> method requires a network connection to a running WebDriver+Chrome server, set by the ENV var 'WEBDRIVER_URL'. </p>
</span>
<br/>
Tip: <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration#brightdata-proxy-support">Connect using BrightData Proxies, find out more here.</a>
</div>
<fieldset class="pure-group" id="webdriver-override-options">
<div class="pure-form-message-inline">

View File

@@ -147,6 +147,16 @@ def test_api_simple(client, live_server):
# @todo how to handle None/default global values?
assert watch['history_n'] == 2, "Found replacement history section, which is in its own API"
# basic systeminfo check
res = client.get(
url_for("systeminfo"),
headers={'x-api-key': api_key},
)
info = json.loads(res.data)
assert info.get('watch_count') == 1
assert info.get('uptime') > 0.5
# Finally delete the watch
res = client.delete(
url_for("watch", uuid=watch_uuid),

View File

@@ -2,10 +2,15 @@
# coding=utf-8
import time
from flask import url_for
from flask import url_for, escape
from . util import live_server_setup
import pytest
jq_support = True
try:
import jq
except ModuleNotFoundError:
jq_support = False
def test_setup(live_server):
live_server_setup(live_server)
@@ -36,16 +41,28 @@ and it can also be repeated
from .. import html_tools
# See that we can find the second <script> one, which is not broken, and matches our filter
text = html_tools.extract_json_as_string(content, "$.offers.price")
text = html_tools.extract_json_as_string(content, "json:$.offers.price")
assert text == "23.5"
text = html_tools.extract_json_as_string('{"id":5}', "$.id")
# also check for jq
if jq_support:
text = html_tools.extract_json_as_string(content, "jq:.offers.price")
assert text == "23.5"
text = html_tools.extract_json_as_string('{"id":5}', "jq:.id")
assert text == "5"
text = html_tools.extract_json_as_string('{"id":5}', "json:$.id")
assert text == "5"
# When nothing at all is found, it should throw JSONNotFound
# Which is caught and shown to the user in the watch-overview table
with pytest.raises(html_tools.JSONNotFound) as e_info:
html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "$.id")
html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "json:$.id")
if jq_support:
with pytest.raises(html_tools.JSONNotFound) as e_info:
html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "jq:.id")
def set_original_ext_response():
data = """
@@ -66,6 +83,7 @@ def set_original_ext_response():
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(data)
return None
def set_modified_ext_response():
data = """
@@ -86,6 +104,7 @@ def set_modified_ext_response():
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(data)
return None
def set_original_response():
test_return_data = """
@@ -184,10 +203,10 @@ def test_check_json_without_filter(client, live_server):
assert b'&#34;&lt;b&gt;' in res.data
assert res.data.count(b'{\n') >= 2
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_check_json_filter(client, live_server):
json_filter = 'json:boss.name'
def check_json_filter(json_filter, client, live_server):
set_original_response()
# Give the endpoint time to spin up
@@ -226,7 +245,7 @@ def test_check_json_filter(client, live_server):
res = client.get(
url_for("edit_page", uuid="first"),
)
assert bytes(json_filter.encode('utf-8')) in res.data
assert bytes(escape(json_filter).encode('utf-8')) in res.data
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
@@ -252,10 +271,17 @@ def test_check_json_filter(client, live_server):
# And #462 - check we see the proper utf-8 string there
assert "Örnsköldsvik".encode('utf-8') in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_check_json_filter_bool_val(client, live_server):
json_filter = "json:$['available']"
def test_check_jsonpath_filter(client, live_server):
    """Run the shared JSON filter check with a JSONPath (`json:` prefixed) rule."""
    jsonpath_rule = 'json:boss.name'
    check_json_filter(jsonpath_rule, client, live_server)
def test_check_jq_filter(client, live_server):
    """Run the shared JSON filter check with a jq (`jq:` prefixed) rule.

    The optional `jq` module may be missing (the module-level `jq_support`
    flag is False in that case). Report the test as skipped rather than
    letting it pass without exercising anything.
    """
    if not jq_support:
        pytest.skip("jq module is not installed")
    check_json_filter('jq:.boss.name', client, live_server)
def check_json_filter_bool_val(json_filter, client, live_server):
set_original_response()
# Give the endpoint time to spin up
@@ -304,14 +330,22 @@ def test_check_json_filter_bool_val(client, live_server):
# But the change should be there, though it's hard to test that the change was detected because it will show old and new versions
assert b'false' in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_check_jsonpath_filter_bool_val(client, live_server):
    """Run the shared boolean-value JSON filter check with a JSONPath rule."""
    jsonpath_rule = "json:$['available']"
    check_json_filter_bool_val(jsonpath_rule, client, live_server)
def test_check_jq_filter_bool_val(client, live_server):
    """Run the shared boolean-value JSON filter check with a jq rule.

    Skipped (instead of silently passing) when the optional `jq` module is
    not installed, i.e. when the module-level `jq_support` flag is False.
    """
    if not jq_support:
        pytest.skip("jq module is not installed")
    check_json_filter_bool_val("jq:.available", client, live_server)
# Re #265 - Extended JSON selector test
# Stuff to consider here
# - Selector should be allowed to return empty when it doesn't match (people might wait for some condition)
# - The 'diff' tab could show the old and new content
# - Form should let us enter a selector that doesn't (yet) match anything
def test_check_json_ext_filter(client, live_server):
json_filter = 'json:$[?(@.status==Sold)]'
def check_json_ext_filter(json_filter, client, live_server):
set_original_ext_response()
# Give the endpoint time to spin up
@@ -350,7 +384,7 @@ def test_check_json_ext_filter(client, live_server):
res = client.get(
url_for("edit_page", uuid="first"),
)
assert bytes(json_filter.encode('utf-8')) in res.data
assert bytes(escape(json_filter).encode('utf-8')) in res.data
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
@@ -376,3 +410,12 @@ def test_check_json_ext_filter(client, live_server):
assert b'ForSale' not in res.data
assert b'Sold' in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_check_jsonpath_ext_filter(client, live_server):
    """Run the shared extended JSON selector check with a JSONPath rule."""
    jsonpath_rule = 'json:$[?(@.status==Sold)]'
    check_json_ext_filter(jsonpath_rule, client, live_server)
def test_check_jq_ext_filter(client, live_server):
    """Run the shared extended JSON selector check with a jq rule.

    Skipped (instead of silently passing) when the optional `jq` module is
    not installed, i.e. when the module-level `jq_support` flag is False.
    """
    if not jq_support:
        pytest.skip("jq module is not installed")
    check_json_ext_filter('jq:.[] | select(.status | contains("Sold"))', client, live_server)

View File

@@ -13,9 +13,9 @@ def test_visual_selector_content_ready(client, live_server):
live_server_setup(live_server)
time.sleep(1)
# Add our URL to the import page, maybe better to use something we control?
# We use an external URL because the docker container is too difficult to setup to connect back to the pytest socket
test_url = 'https://news.ycombinator.com'
# Add our URL to the import page, because the docker container (playwright/selenium) won't be able to connect to our usual test url
test_url = "https://changedetection.io/ci-test/test-runjs.html"
res = client.post(
url_for("form_quick_watch_add"),
data={"url": test_url, "tag": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
@@ -25,13 +25,27 @@ def test_visual_selector_content_ready(client, live_server):
res = client.post(
url_for("edit_page", uuid="first", unpause_on_save=1),
data={"css_filter": ".does-not-exist", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_webdriver"},
data={
"url": test_url,
"tag": "",
"headers": "",
'fetch_backend': "html_webdriver",
'webdriver_js_execute_code': 'document.querySelector("button[name=test-button]").click();'
},
follow_redirects=True
)
assert b"unpaused" in res.data
time.sleep(1)
wait_for_all_checks(client)
uuid = extract_UUID_from_client(client)
# Check the JS execute code before extract worked
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
assert b'I smell JavaScript' in res.data
assert os.path.isfile(os.path.join('test-datastore', uuid, 'last-screenshot.png')), "last-screenshot.png should exist"
assert os.path.isfile(os.path.join('test-datastore', uuid, 'elements.json')), "xpath elements.json data should exist"

View File

@@ -45,6 +45,9 @@ services:
# Respect proxy_pass type settings, `proxy_set_header Host "localhost";` and `proxy_set_header X-Forwarded-Prefix /app;`
# More here https://github.com/dgtlmoon/changedetection.io/wiki/Running-changedetection.io-behind-a-reverse-proxy-sub-directory
# - USE_X_SETTINGS=1
#
# Hides the `Referer` header so that monitored websites can't see the changedetection.io hostname.
# - HIDE_REFERER=true
# Comment out ports: when using behind a reverse proxy, enable networks: etc.
ports:

BIN
docs/proxy-example.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

View File

@@ -1,8 +1,8 @@
flask~= 2.0
flask ~= 2.0
flask_wtf
eventlet>=0.31.0
eventlet >= 0.31.0
validators
timeago ~=1.0
timeago ~= 1.0
inscriptis ~= 2.2
feedgen ~= 0.9
flask-login ~= 0.5
@@ -10,15 +10,20 @@ flask_restful
pytz
# Set these versions together to avoid a RequestsDependencyWarning
requests[socks] ~= 2.26
# >= 2.26 also adds Brotli support if brotli is installed
brotli ~= 1.0
requests[socks] ~= 2.28
urllib3 > 1.26
chardet > 2.3.0
wtforms ~= 3.0
jsonpath-ng ~= 1.5.3
# jq not available on Windows so must be installed manually
# Notification library
apprise ~= 1.0.0
apprise ~= 1.1.0
# apprise mqtt https://github.com/dgtlmoon/changedetection.io/issues/315
paho-mqtt
@@ -42,3 +47,4 @@ selenium ~= 4.1.0
werkzeug ~= 2.0.0
# playwright is installed at Dockerfile build time because it's not available on all platforms