fix curl request

Merge branch 'master' into ipv6
add ipv6 checks
2025-10-31 06:37:41 +00:00 · 2022-12-26 16:45:52 +01:00 · 2022-12-26 16:34:59 +01:00 · 2022-12-26 16:34:05 +01:00 · 2022-12-26 16:33:11 +01:00 · 2022-12-26 14:17:40 +01:00
48 changed files with 1225 additions and 476 deletions
--- a/.github/workflows/containers.yml
+++ b/.github/workflows/containers.yml
@@ -50,7 +50,6 @@ jobs:
          python -m pip install --upgrade pip
          pip install flake8 pytest
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-          if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi

      - name: Create release metadata
        run: |
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -19,12 +19,6 @@ jobs:
        with:
          python-version: 3.9

-#      - name: Install dependencies
-#        run: |
-#          python -m pip install --upgrade pip
-#          pip install flake8 pytest
-#          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-#          if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi

      - name: Test that pip builds without error
        run: |
--- a/.github/workflows/test-only.yml
+++ b/.github/workflows/test-only.yml
@@ -8,32 +8,70 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
-      - name: Set up Python 3.9
+
+      # Mainly just for lint/flake8
+      - name: Set up Python 3.10
        uses: actions/setup-python@v2
        with:
-          python-version: 3.9
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install flake8 pytest
-          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-          if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi
+          python-version: '3.10'

      - name: Lint with flake8
        run: |
+          pip3 install flake8
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics

-      - name: Unit tests
+      - name: Spin up ancillary testable services
        run: |
-          python3 -m unittest changedetectionio.tests.unit.test_notification_diff
+          
+          docker network create changedet-network

-      - name: Test with pytest
+          # Selenium+browserless
+          docker run --network changedet-network -d --hostname selenium  -p 4444:4444 --rm --shm-size="2g"  selenium/standalone-chrome-debug:3.141.59
+          docker run --network changedet-network -d --hostname browserless -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm  -p 3000:3000  --shm-size="2g"  browserless/chrome:1.53-chrome-stable
+
+      - name: Build changedetection.io container for testing
+        run: |         
+          # Build a changedetection.io container and start testing inside
+          docker build . -t test-changedetectionio
+
+      - name: Test built container with pytest
        run: |
-          # Each test is totally isolated and performs its own cleanup/reset
-          cd changedetectionio; ./run_all_tests.sh
+          
+          # Unit tests
+          docker run test-changedetectionio  bash -c 'python3 -m unittest changedetectionio.tests.unit.test_notification_diff'
+          
+          # All tests
+          docker run --network changedet-network  test-changedetectionio  bash -c 'cd changedetectionio && ./run_basic_tests.sh'

+      - name: Test built container selenium+browserless/playwright
+        run: |
+          
+          # Selenium fetch
+          docker run -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network  test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py'
+          
+          # Playwright/Browserless fetch
+          docker run -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network  test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'

+      - name: Test proxy interaction
+        run: |
+          cd changedetectionio
+          ./run_proxy_tests.sh
+          cd ..
+
+      - name: Test changedetection.io container starts+runs basically without error
+        run: |
+          docker run -p 5556:5000 -d test-changedetectionio
+          sleep 3
+          # Should return 0 (no error) when grep finds it
+          curl -s http://localhost:5556 |grep -q checkbox-uuid
+          curl -s http://localhost:5556/rss|grep -q rss-specification
+          # and IPv6
+          curl -s -g -6 "http://[::1]:5556"|grep -q checkbox-uuid
+          curl -s -g -6 "http://[::1]:5556/rss"|grep -q rss-specification
+
+#export WEBDRIVER_URL=http://localhost:4444/wd/hub
+#pytest tests/fetchers/test_content.py
+#pytest tests/test_errorhandling.py
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -7,9 +7,3 @@ Otherwise, it's always best to PR into the `dev` branch.
 Please be sure that all new functionality has a matching test!

 Use `pytest` to validate/test, you can run the existing tests as `pytest tests/test_notification.py` for example
-
-```
-pip3 install -r requirements-dev
-```
-
-this is from https://github.com/dgtlmoon/changedetection.io/blob/master/requirements-dev.txt
--- a/26
+++ b/26
@@ -1,7 +1,7 @@
 # pip dependencies install stage
-FROM python:3.8-slim as builder
+FROM python:3.10-slim as builder

-# rustc compiler would be needed on ARM type devices but theres an issue with some deps not building..
+# See `cryptography` pin comment in requirements.txt
 ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1

 RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -29,22 +29,16 @@ RUN pip install --target=/dependencies playwright~=1.27.1 \
    || echo "WARN: Failed to install Playwright. The application can still run, but the Playwright option will be disabled."

 # Final image stage
-FROM python:3.8-slim
+FROM python:3.10-slim

-# Actual packages needed at runtime, usually due to the notification (apprise) backend
-# rustc compiler would be needed on ARM type devices but theres an issue with some deps not building..
-ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1
-
-# Re #93, #73, excluding rustc (adds another 430Mb~)
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    g++ \
-    gcc \
-    libc-dev \
-    libffi-dev \
-    libjpeg-dev \
-    libssl-dev \
-    libxslt-dev \
-    zlib1g-dev
+    libssl1.1 \
+    libxslt1.1 \
+    # For pdftohtml
+    poppler-utils \
+    zlib1g \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+

 # https://stackoverflow.com/questions/58701233/docker-logs-erroneously-appears-empty-until-container-stops
 ENV PYTHONUNBUFFERED=1
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,9 +1,10 @@
 recursive-include changedetectionio/api *
-recursive-include changedetectionio/templates *
-recursive-include changedetectionio/static *
+recursive-include changedetectionio/blueprint *
 recursive-include changedetectionio/model *
-recursive-include changedetectionio/tests *
 recursive-include changedetectionio/res *
+recursive-include changedetectionio/static *
+recursive-include changedetectionio/templates *
+recursive-include changedetectionio/tests *
 prune changedetectionio/static/package-lock.json
 prune changedetectionio/static/styles/node_modules
 prune changedetectionio/static/styles/package-lock.json
--- a/README.md
+++ b/README.md
@@ -43,6 +43,7 @@ Requires Playwright to be enabled.

 - Products and services have a change in pricing
 - _Out of stock notification_ and _Back In stock notification_
+- Monitor and track PDF file changes, know when a PDF file has text changes.
 - Governmental department updates (changes are often only on their websites)
 - New software releases, security advisories when you're not on their mailing list.
 - Festivals with changes
@@ -68,6 +69,7 @@ _Need an actual Chrome runner with Javascript support? We support fetching via W
 - Lots of trigger filters, such as "Trigger on text", "Remove text by selector", "Ignore text", "Extract text", also using regular-expressions!
 - Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JSONPath or jq
 - Switch between fast non-JS and Chrome JS based "fetchers"
+- Track changes in PDF files (monitor text changes in the PDF, as well as the PDF file size and checksum)
 - Easily specify how often a site should be checked
 - Execute JS before extracting text (Good for logging in, see examples in the UI!)
 - Override Request Headers, Specify `POST` or `GET` and other methods
@@ -187,11 +189,29 @@ When you enable a `json:` or `jq:` filter, you can even automatically extract an
 <html>
 ...
 <script type="application/ld+json">
-  {"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula  800g","price": 23.50 }
+
+{
+   "@context":"http://schema.org/",
+   "@type":"Product",
+   "offers":{
+      "@type":"Offer",
+      "availability":"http://schema.org/InStock",
+      "price":"3949.99",
+      "priceCurrency":"USD",
+      "url":"https://www.newegg.com/p/3D5-000D-001T1"
+   },
+   "description":"Cobratype King Cobra Hero Desktop Gaming PC",
+   "name":"Cobratype King Cobra Hero Desktop Gaming PC",
+   "sku":"3D5-000D-001T1",
+   "itemCondition":"NewCondition"
+}
 </script>
 ```  

-`json:$.price` or `jq:.price` would give `23.50`, or you can extract the whole structure
+`json:$..price` or `jq:.offers.price` would give `3949.99`, or you can extract the whole structure (use a JSONPath test website to validate your filter)
+
+The application also supports notifying you that it can follow this information automatically
+

 ## Proxy Configuration

--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@@ -10,6 +10,7 @@ import threading
 import time
 import timeago

+from changedetectionio import queuedWatchMetaData
 from copy import deepcopy
 from distutils.util import strtobool
 from feedgen.feed import FeedGenerator
@@ -35,7 +36,7 @@ from flask_wtf import CSRFProtect
 from changedetectionio import html_tools
 from changedetectionio.api import api_v1

-__version__ = '0.39.22.1'
+__version__ = '0.40.0.4'

 datastore = None

@@ -404,19 +405,21 @@ def changedetection_app(config=None, datastore_o=None):
                sorted_watches.append(watch)

        existing_tags = datastore.get_all_tags()
-
        form = forms.quickWatchForm(request.form)
-        output = render_template("watch-overview.html",
-                                 form=form,
-                                 watches=sorted_watches,
-                                 tags=existing_tags,
+        output = render_template(
+            "watch-overview.html",
+                                 # Don't link to hosting when we're on the hosting environment
                                 active_tag=limit_tag,
                                 app_rss_token=datastore.data['settings']['application']['rss_access_token'],
-                                 has_unviewed=datastore.has_unviewed,
-                                 # Don't link to hosting when we're on the hosting environment
-                                 hosted_sticky=os.getenv("SALTED_PASS", False) == False,
+                                 form=form,
                                 guid=datastore.data['app_guid'],
-                                 queued_uuids=[uuid for p,uuid in update_q.queue])
+                                 has_proxies=datastore.proxy_list,
+                                 has_unviewed=datastore.has_unviewed,
+                                 hosted_sticky=os.getenv("SALTED_PASS", False) == False,
+                                 queued_uuids=[q_uuid.item['uuid'] for q_uuid in update_q.queue],
+                                 tags=existing_tags,
+                                 watches=sorted_watches
+                                 )


        if session.get('share-link'):
@@ -596,25 +599,16 @@ def changedetection_app(config=None, datastore_o=None):
                    using_default_check_time = False
                    break

-            # Use the default if its the same as system wide
+            # Use the default if it's the same as system-wide.
            if form.fetch_backend.data == datastore.data['settings']['application']['fetch_backend']:
                extra_update_obj['fetch_backend'] = None


+
             # Ignore text
            form_ignore_text = form.ignore_text.data
            datastore.data['watching'][uuid]['ignore_text'] = form_ignore_text

-            # Reset the previous_md5 so we process a new snapshot including stripping ignore text.
-            if form_ignore_text:
-                if len(datastore.data['watching'][uuid].history):
-                    extra_update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
-
-            # Reset the previous_md5 so we process a new snapshot including stripping ignore text.
-            if form.include_filters.data != datastore.data['watching'][uuid].get('include_filters', []):
-                if len(datastore.data['watching'][uuid].history):
-                    extra_update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
-
            # Be sure proxy value is None
            if datastore.proxy_list is not None and form.data['proxy'] == '':
                extra_update_obj['proxy'] = None
@@ -632,7 +626,7 @@ def changedetection_app(config=None, datastore_o=None):
            datastore.needs_write_urgent = True

            # Queue the watch for immediate recheck, with a higher priority
-            update_q.put((1, uuid))
+            update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))

            # Diff page [edit] link should go back to diff page
            if request.args.get("next") and request.args.get("next") == 'diff':
@@ -773,7 +767,7 @@ def changedetection_app(config=None, datastore_o=None):
                importer = import_url_list()
                importer.run(data=request.values.get('urls'), flash=flash, datastore=datastore)
                for uuid in importer.new_uuids:
-                    update_q.put((1, uuid))
+                    update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))

                if len(importer.remaining_data) == 0:
                    return redirect(url_for('index'))
@@ -786,7 +780,7 @@ def changedetection_app(config=None, datastore_o=None):
                d_importer = import_distill_io_json()
                d_importer.run(data=request.values.get('distill-io'), flash=flash, datastore=datastore)
                for uuid in d_importer.new_uuids:
-                    update_q.put((1, uuid))
+                    update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))



@@ -1151,7 +1145,7 @@ def changedetection_app(config=None, datastore_o=None):

        if not add_paused and new_uuid:
            # Straight into the queue.
-            update_q.put((1, new_uuid))
+            update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid}))
            flash("Watch added.")

        if add_paused:
@@ -1188,7 +1182,7 @@ def changedetection_app(config=None, datastore_o=None):
            uuid = list(datastore.data['watching'].keys()).pop()

        new_uuid = datastore.clone(uuid)
-        update_q.put((5, new_uuid))
+        update_q.put(queuedWatchMetaData.PrioritizedItem(priority=5, item={'uuid': new_uuid, 'skip_when_checksum_same': True}))
        flash('Cloned.')

        return redirect(url_for('index'))
@@ -1196,7 +1190,7 @@ def changedetection_app(config=None, datastore_o=None):
    @app.route("/api/checknow", methods=['GET'])
    @login_required
    def form_watch_checknow():
-
+        # Forced recheck will skip the 'skip if content is the same' rule ('skip_when_checksum_same': False)
        tag = request.args.get('tag')
        uuid = request.args.get('uuid')
        i = 0
@@ -1205,11 +1199,9 @@ def changedetection_app(config=None, datastore_o=None):
        for t in running_update_threads:
            running_uuids.append(t.current_uuid)

-        # @todo check thread is running and skip
-
        if uuid:
            if uuid not in running_uuids:
-                update_q.put((1, uuid))
+                update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
            i = 1

        elif tag != None:
@@ -1217,14 +1209,14 @@ def changedetection_app(config=None, datastore_o=None):
            for watch_uuid, watch in datastore.data['watching'].items():
                if (tag != None and tag in watch['tag']):
                    if watch_uuid not in running_uuids and not datastore.data['watching'][watch_uuid]['paused']:
-                        update_q.put((1, watch_uuid))
+                        update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid, 'skip_when_checksum_same': False}))
                        i += 1

        else:
            # No tag, no uuid, add everything.
            for watch_uuid, watch in datastore.data['watching'].items():
                if watch_uuid not in running_uuids and not datastore.data['watching'][watch_uuid]['paused']:
-                    update_q.put((1, watch_uuid))
+                    update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid, 'skip_when_checksum_same': False}))
                    i += 1
        flash("{} watches are queued for rechecking.".format(i))
        return redirect(url_for('index', tag=tag))
@@ -1271,6 +1263,14 @@ def changedetection_app(config=None, datastore_o=None):
                    datastore.data['watching'][uuid.strip()]['notification_muted'] = False
            flash("{} watches un-muted".format(len(uuids)))

+        elif (op == 'recheck'):
+            for uuid in uuids:
+                uuid = uuid.strip()
+                if datastore.data['watching'].get(uuid):
+                    # Recheck and require a full reprocessing
+                    update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
+
+            flash("{} watches queued for rechecking".format(len(uuids)))
        elif (op == 'notification-default'):
            from changedetectionio.notification import (
                default_notification_format_for_watch
@@ -1343,6 +1343,10 @@ def changedetection_app(config=None, datastore_o=None):
    import changedetectionio.blueprint.browser_steps as browser_steps
    app.register_blueprint(browser_steps.construct_blueprint(datastore), url_prefix='/browser-steps')

+    import changedetectionio.blueprint.price_data_follower as price_data_follower
+    app.register_blueprint(price_data_follower.construct_blueprint(datastore, update_q), url_prefix='/price_data_follower')
+
+
    # @todo handle ctrl break
    ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
    threading.Thread(target=notification_runner).start()
@@ -1448,7 +1452,11 @@ def ticker_thread_check_time_launch_checks():
        watch_uuid_list = []
        while True:
            try:
-                watch_uuid_list = datastore.data['watching'].keys()
+                # Get a list of watches sorted by last_checked, [1] because it gets passed a tuple
+                # This is so we examine the most over-due first
+                for k in sorted(datastore.data['watching'].items(), key=lambda item: item[1].get('last_checked',0)):
+                    watch_uuid_list.append(k[0])
+
            except RuntimeError as e:
                # RuntimeError: dictionary changed size during iteration
                time.sleep(0.1)
@@ -1488,7 +1496,7 @@ def ticker_thread_check_time_launch_checks():
            seconds_since_last_recheck = now - watch['last_checked']

            if seconds_since_last_recheck >= (threshold + watch.jitter_seconds) and seconds_since_last_recheck >= recheck_time_minimum_seconds:
-                if not uuid in running_uuids and uuid not in [q_uuid for p,q_uuid in update_q.queue]:
+                if not uuid in running_uuids and uuid not in [q_uuid.item['uuid'] for q_uuid in update_q.queue]:

                    # Proxies can be set to have a limit on seconds between which they can be called
                    watch_proxy = datastore.get_preferred_proxy_for_watch(uuid=uuid)
@@ -1519,8 +1527,9 @@ def ticker_thread_check_time_launch_checks():
                            priority,
                            watch.jitter_seconds,
                            now - watch['last_checked']))
+
                    # Into the queue with you
-                    update_q.put((priority, uuid))
+                    update_q.put(queuedWatchMetaData.PrioritizedItem(priority=priority, item={'uuid': uuid, 'skip_when_checksum_same': True}))

                    # Reset for next time
                    watch.jitter_seconds = 0
--- a/changedetectionio/api/api_v1.py
+++ b/changedetectionio/api/api_v1.py
@@ -1,3 +1,4 @@
+from changedetectionio import queuedWatchMetaData
 from flask_restful import abort, Resource
 from flask import request, make_response
 import validators
@@ -24,7 +25,7 @@ class Watch(Resource):
            abort(404, message='No watch exists with the UUID of {}'.format(uuid))

        if request.args.get('recheck'):
-            self.update_q.put((1, uuid))
+            self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
            return "OK", 200

        # Return without history, get that via another API call
@@ -100,7 +101,7 @@ class CreateWatch(Resource):
        extras = {'title': json_data['title'].strip()} if json_data.get('title') else {}

        new_uuid = self.datastore.add_watch(url=json_data['url'].strip(), tag=tag, extras=extras)
-        self.update_q.put((1, new_uuid))
+        self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': new_uuid, 'skip_when_checksum_same': True}))
        return {'uuid': new_uuid}, 201

    # Return concise list of available watches and some very basic info
@@ -118,7 +119,7 @@ class CreateWatch(Resource):

        if request.args.get('recheck_all'):
            for uuid in self.datastore.data['watching'].keys():
-                self.update_q.put((1, uuid))
+                self.update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True}))
            return {'status': "OK"}, 200

        return list, 200
--- a/changedetectionio/blueprint/browser_steps/browser_steps.py
+++ b/changedetectionio/blueprint/browser_steps/browser_steps.py
@@ -75,15 +75,13 @@ class steppable_browser_interface():
    def action_goto_url(self, url, optional_value):
        # self.page.set_viewport_size({"width": 1280, "height": 5000})
        now = time.time()
-        response = self.page.goto(url, timeout=0, wait_until='domcontentloaded')
-        print("Time to goto URL", time.time() - now)
+        response = self.page.goto(url, timeout=0, wait_until='commit')

        # Wait_until = commit
        # - `'commit'` - consider operation to be finished when network response is received and the document started loading.
        # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
        # This seemed to solve nearly all 'TimeoutErrors'
-        extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))
-        self.page.wait_for_timeout(extra_wait * 1000)
+        print("Time to goto URL ", time.time() - now)

    def action_click_element_containing_text(self, selector=None, value=''):
        if not len(value.strip()):
--- a/changedetectionio/blueprint/price_data_follower/init.py
+++ b/changedetectionio/blueprint/price_data_follower/init.py
@@ -0,0 +1,33 @@
+
+from distutils.util import strtobool
+from flask import Blueprint, flash, redirect, url_for
+from flask_login import login_required
+from changedetectionio.store import ChangeDetectionStore
+from changedetectionio import queuedWatchMetaData
+from queue import PriorityQueue
+
+PRICE_DATA_TRACK_ACCEPT = 'accepted'
+PRICE_DATA_TRACK_REJECT = 'rejected'
+
+def construct_blueprint(datastore: ChangeDetectionStore, update_q: PriorityQueue):
+    """Build the blueprint exposing the accept/reject routes for LD+JSON price tracking."""
+    price_data_follower_blueprint = Blueprint('price_data_follower', __name__)
+    # route() registers the function it receives, so it must be outermost with login_required beneath it.
+    @price_data_follower_blueprint.route("/<string:uuid>/accept", methods=['GET'])
+    @login_required
+    def accept(uuid):
+        datastore.data['watching'][uuid]['track_ldjson_price_data'] = PRICE_DATA_TRACK_ACCEPT
+        update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
+        return redirect(url_for("form_watch_checknow", uuid=uuid))
+
+    # Same decorator order as accept(): route() outermost so the login-protected view is what gets registered.
+    @price_data_follower_blueprint.route("/<string:uuid>/reject", methods=['GET'])
+    @login_required
+    def reject(uuid):
+        datastore.data['watching'][uuid]['track_ldjson_price_data'] = PRICE_DATA_TRACK_REJECT
+        return redirect(url_for("index"))
+
+
+    return price_data_follower_blueprint
+
+
--- a/changedetectionio/changedetection.py
+++ b/changedetectionio/changedetection.py
@@ -9,6 +9,7 @@ import getopt
 import os
 import signal
 import sys
+import socket

 from . import store, changedetection_app, content_fetcher
 from . import __version__
@@ -126,11 +127,11 @@ def main():

    if ssl_mode:
        # @todo finalise SSL config, but this should get you in the right direction if you need it.
-        eventlet.wsgi.server(eventlet.wrap_ssl(eventlet.listen((host, port)),
+        eventlet.wsgi.server(eventlet.wrap_ssl(eventlet.listen((host, port), socket.AF_INET6),
                                               certfile='cert.pem',
                                               keyfile='privkey.pem',
                                               server_side=True), app)

    else:
-        eventlet.wsgi.server(eventlet.listen((host, int(port))), app)
+        eventlet.wsgi.server(eventlet.listen((host, int(port)), socket.AF_INET6), app)

--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -1,3 +1,4 @@
+import hashlib
 from abc import abstractmethod
 import chardet
 import json
@@ -23,6 +24,9 @@ class Non200ErrorCodeReceived(Exception):
            self.page_text = html_tools.html_to_text(page_html)
        return

+class checksumFromPreviousCheckWasTheSame(Exception):  # NOTE(review): presumably raised when a fetch matches the previous checksum - confirm against raisers
+    def __init__(self):  # takes no arguments and stores nothing on the instance
+        return

 class JSActionExceptions(Exception):
    def __init__(self, status_code, url, screenshot, message=''):
@@ -39,7 +43,7 @@ class BrowserStepsStepTimout(Exception):


 class PageUnloadable(Exception):
-    def __init__(self, status_code, url, screenshot=False, message=False):
+    def __init__(self, status_code, url, message, screenshot=False):
        # Set this so we can use it in other parts of the app
        self.status_code = status_code
        self.url = url
@@ -113,7 +117,8 @@ class Fetcher():
            request_body,
            request_method,
            ignore_status_codes=False,
-            current_include_filters=None):
+            current_include_filters=None,
+            is_binary=False):
        # Should set self.error, self.status_code and self.content
        pass

@@ -238,6 +243,14 @@ class base_html_playwright(Fetcher):
        if proxy_override:
            self.proxy = {'server': proxy_override}

+        if self.proxy:
+            # Playwright needs separate username and password values
+            from urllib.parse import urlparse
+            parsed = urlparse(self.proxy.get('server'))
+            if parsed.username:
+                self.proxy['username'] = parsed.username
+                self.proxy['password'] = parsed.password
+
    def screenshot_step(self, step_n=''):

        # There's a bug where we need to do it twice or it doesnt take the whole page, dont know why.
@@ -264,7 +277,8 @@ class base_html_playwright(Fetcher):
            request_body,
            request_method,
            ignore_status_codes=False,
-            current_include_filters=None):
+            current_include_filters=None,
+            is_binary=False):

        from playwright.sync_api import sync_playwright
        import playwright._impl._api_types
@@ -286,6 +300,8 @@ class base_html_playwright(Fetcher):
                proxy=self.proxy,
                # This is needed to enable JavaScript execution on GitHub and others
                bypass_csp=True,
+                # Can't think why we need the service workers for our use case?
+                service_workers='block',
                # Should never be needed
                accept_downloads=False
            )
@@ -294,24 +310,34 @@ class base_html_playwright(Fetcher):
            if len(request_headers):
                context.set_extra_http_headers(request_headers)

-            try:
                self.page.set_default_navigation_timeout(90000)
                self.page.set_default_timeout(90000)

                # Listen for all console events and handle errors
                self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))

-                # Bug - never set viewport size BEFORE page.goto
-
-
-                # Waits for the next navigation. Using Python context manager
-                # prevents a race condition between clicking and waiting for a navigation.
-                with self.page.expect_navigation():
-                    response = self.page.goto(url, wait_until='load')
+            # Goto page
+            try:
                # Wait_until = commit
                # - `'commit'` - consider operation to be finished when network response is received and the document started loading.
                # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
                # This seemed to solve nearly all 'TimeoutErrors'
+                response = self.page.goto(url, wait_until='commit')
+            except playwright._impl._api_types.Error as e:
+                # Retry once - https://github.com/browserless/chrome/issues/2485
+                # Errors related to invalid certificates (and others) can occur at random
+                print ("Content Fetcher > retrying request got error - ", str(e))
+                time.sleep(1)
+                response = self.page.goto(url, wait_until='commit')
+
+            except Exception as e:
+                print ("Content Fetcher > Other exception when page.goto", str(e))
+                context.close()
+                browser.close()
+                raise PageUnloadable(url=url, status_code=None, message=str(e))
+
+            # Execute any browser steps
+            try:
                extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
                self.page.wait_for_timeout(extra_wait * 1000)

@@ -324,17 +350,15 @@ class base_html_playwright(Fetcher):
                # This can be ok, we will try to grab what we could retrieve
                pass
            except Exception as e:
-                print ("other exception when page.goto")
-                print (str(e))
+                print ("Content Fetcher > Other exception when executing custom JS code", str(e))
                context.close()
                browser.close()
-                raise PageUnloadable(url=url, status_code=None)
-
+                raise PageUnloadable(url=url, status_code=None, message=str(e))

            if response is None:
                context.close()
                browser.close()
-                print ("response object was none")
+                print ("Content Fetcher > Response object was none")
                raise EmptyReply(url=url, status_code=None)

            # Bug 2(?) Set the viewport size AFTER loading the page
@@ -353,8 +377,8 @@ class base_html_playwright(Fetcher):
            if len(self.page.content().strip()) == 0:
                context.close()
                browser.close()
-                print ("Content was empty")
-                raise EmptyReply(url=url, status_code=None)
+                print ("Content Fetcher > Content was empty")
+                raise EmptyReply(url=url, status_code=response.status)

            # Bug 2(?) Set the viewport size AFTER loading the page
            self.page.set_viewport_size({"width": 1280, "height": 1024})
@@ -440,7 +464,8 @@ class base_html_webdriver(Fetcher):
            request_body,
            request_method,
            ignore_status_codes=False,
-            current_include_filters=None):
+            current_include_filters=None,
+            is_binary=False):

        from selenium import webdriver
        from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
@@ -498,7 +523,7 @@ class base_html_webdriver(Fetcher):
            try:
                self.driver.quit()
            except Exception as e:
-                print("Exception in chrome shutdown/quit" + str(e))
+                print("Content Fetcher > Exception in chrome shutdown/quit" + str(e))


 # "html_requests" is listed as the default fetcher in store.py!
@@ -515,7 +540,8 @@ class html_requests(Fetcher):
            request_body,
            request_method,
            ignore_status_codes=False,
-            current_include_filters=None):
+            current_include_filters=None,
+            is_binary=False):

        # Make requests use a more modern looking user-agent
        if not 'User-Agent' in request_headers:
@@ -545,10 +571,12 @@ class html_requests(Fetcher):
        # For example - some sites don't tell us it's utf-8, but return utf-8 content
        # This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably.
        # https://github.com/psf/requests/issues/1604 good info about requests encoding detection
-        if not r.headers.get('content-type') or not 'charset=' in r.headers.get('content-type'):
-            encoding = chardet.detect(r.content)['encoding']
-            if encoding:
-                r.encoding = encoding
+        if not is_binary:
+            # Don't run this for PDFs (and other requests identified as binary) - it takes a _long_ time
+            if not r.headers.get('content-type') or not 'charset=' in r.headers.get('content-type'):
+                encoding = chardet.detect(r.content)['encoding']
+                if encoding:
+                    r.encoding = encoding

        if not r.content or not len(r.content):
            raise EmptyReply(url=url, status_code=r.status_code)
@@ -560,8 +588,14 @@ class html_requests(Fetcher):
            raise Non200ErrorCodeReceived(url=url, status_code=r.status_code, page_html=r.text)

        self.status_code = r.status_code
-        self.content = r.text
+        if is_binary:
+            # Binary files just return their checksum until we add something smarter
+            self.content = hashlib.md5(r.content).hexdigest()
+        else:
+            self.content = r.text
+
        self.headers = r.headers
+        self.raw_content = r.content


 # Decide which is the 'real' HTML webdriver, this is more a system wide config
--- a/changedetectionio/dev-docker/Dockerfile
+++ b/changedetectionio/dev-docker/Dockerfile
@@ -1,14 +0,0 @@
-FROM python:3.8-slim
-
-# https://stackoverflow.com/questions/58701233/docker-logs-erroneously-appears-empty-until-container-stops
-ENV PYTHONUNBUFFERED=1
-
-WORKDIR /app
-
-RUN [ ! -d "/datastore" ] && mkdir /datastore
-
-COPY sleep.py /
-CMD [ "python", "/sleep.py" ]
-
-
-
--- a/changedetectionio/dev-docker/sleep.py
+++ b/changedetectionio/dev-docker/sleep.py
@@ -1,7 +0,0 @@
-import time
-
-print ("Sleep loop, you should run your script from the console")
-
-while True: 
-    # Wait for 5 seconds
-    time.sleep(2)
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -1,11 +1,13 @@
 import hashlib
+import json
 import logging
 import os
 import re
-import time
 import urllib3

 from changedetectionio import content_fetcher, html_tools
+from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
+from copy import deepcopy

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

@@ -14,6 +16,10 @@ class FilterNotFoundInResponse(ValueError):
    def __init__(self, msg):
        ValueError.__init__(self, msg)

+class PDFToHTMLToolNotFound(ValueError):
+    """Raised when the command-line PDF-to-HTML converter cannot be located on the system PATH."""
+
+    def __init__(self, msg):
+        # Hand the message straight to ValueError so it becomes the exception text
+        super().__init__(msg)
+

 # Some common stuff here that can be moved to a base class
 # (set_proxy_from_list)
@@ -38,8 +44,7 @@ class perform_site_check():

        return regex

-    def run(self, uuid):
-        from copy import deepcopy
+    def run(self, uuid, skip_when_checksum_same=True):
        changed_detected = False
        screenshot = False  # as bytes
        stripped_text_from_html = ""
@@ -86,7 +91,7 @@ class perform_site_check():
            is_source = True

        # Pluggable content fetcher
-        prefer_backend = watch.get('fetch_backend')
+        prefer_backend = watch.get_fetch_backend
        if hasattr(content_fetcher, prefer_backend):
            klass = getattr(content_fetcher, prefer_backend)
        else:
@@ -116,12 +121,26 @@ class perform_site_check():
        if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip():
            fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code')

-        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'))
+        # Requests for PDFs, images etc. should be passed the is_binary flag
+        is_binary = watch.is_pdf
+
+        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), is_binary=is_binary)
        fetcher.quit()

        self.screenshot = fetcher.screenshot
        self.xpath_data = fetcher.xpath_data

+        # Track the content type
+        update_obj['content_type'] = fetcher.headers.get('Content-Type', '')
+
+        # Watches added automatically in the queue manager will skip if its the same checksum as the previous run
+        # Saves a lot of CPU
+        update_obj['previous_md5_before_filters'] = hashlib.md5(fetcher.content.encode('utf-8')).hexdigest()
+        if skip_when_checksum_same:
+            if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
+                raise content_fetcher.checksumFromPreviousCheckWasTheSame()
+
+
        # Fetching complete, now filters
        # @todo move to class / maybe inside of fetcher abstract base?

@@ -140,7 +159,32 @@ class perform_site_check():
            is_html = False
            is_json = False

-        include_filters_rule = watch.get('include_filters', [])
+        if watch.is_pdf or 'application/pdf' in fetcher.headers.get('Content-Type', '').lower():
+            # PDFs are converted to HTML by an external command-line tool (default `pdftohtml`,
+            # overridable with the PDF_TO_HTML_TOOL environment variable) and the converted
+            # document then replaces fetcher.content for the filter steps that follow.
+            from shutil import which
+            tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml")
+            if not which(tool):
+                raise PDFToHTMLToolNotFound("Command-line `{}` tool was not found in system PATH, was it installed?".format(tool))
+
+            import subprocess
+            proc = subprocess.Popen(
+                [tool, '-stdout', '-', '-s', 'out.pdf', '-i'],
+                stdout=subprocess.PIPE,
+                stdin=subprocess.PIPE)
+            # communicate() writes stdin and drains stdout together, so a large document cannot
+            # dead-lock on a full pipe, and the 60 second timeout covers the whole conversion
+            # (a wait() placed after reading stdout to EOF could never time out a hung tool).
+            try:
+                converted, _ = proc.communicate(input=fetcher.raw_content, timeout=60)
+            except subprocess.TimeoutExpired:
+                # Don't leave the converter running in the background, then let the caller see the timeout
+                proc.kill()
+                proc.communicate()
+                raise
+            fetcher.content = converted.decode('utf-8')
+
+            # Add a little metadata so we know if the file changes (like if an image changes, but the text is the same)
+            # @todo may cause problems with non-UTF8?
+            # NOTE(review): "Filesize" below is the length of the converted HTML text, not of the raw PDF bytes - confirm intent
+            metadata = "<p>Added by changedetection.io: Document checksum - {} Filesize - {} bytes</p>".format(
+                hashlib.md5(fetcher.raw_content).hexdigest().upper(),
+                len(fetcher.content))
+
+            fetcher.content = fetcher.content.replace('</body>', metadata + '</body>')
+
+
+        include_filters_rule = deepcopy(watch.get('include_filters', []))
        # include_filters_rule = watch['include_filters']
        subtractive_selectors = watch.get(
            "subtractive_selectors", []
@@ -148,6 +192,10 @@ class perform_site_check():
            "global_subtractive_selectors", []
        )

+        # Inject a virtual LD+JSON price tracker rule
+        if watch.get('track_ldjson_price_data', '') == PRICE_DATA_TRACK_ACCEPT:
+            include_filters_rule.append(html_tools.LD_JSON_PRODUCT_OFFER_SELECTOR)
+
        has_filter_rule = include_filters_rule and len("".join(include_filters_rule).strip())
        has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())

@@ -155,6 +203,14 @@ class perform_site_check():
            include_filters_rule.append("json:$")
            has_filter_rule = True

+        if is_json:
+            # Sort the JSON so we dont get false alerts when the content is just re-ordered
+            try:
+                fetcher.content = json.dumps(json.loads(fetcher.content), sort_keys=True)
+            except Exception as e:
+                # Might have just been a snippet, or otherwise bad JSON, continue
+                pass
+
        if has_filter_rule:
            json_filter_prefixes = ['json:', 'jq:']
            for filter in include_filters_rule:
@@ -162,6 +218,8 @@ class perform_site_check():
                    stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
                    is_html = False

+
+
        if is_html or is_source:

            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
@@ -173,9 +231,13 @@ class perform_site_check():
                # Don't run get_text or xpath/css filters on plaintext
                stripped_text_from_html = html_content
            else:
+                # Does it have some ld+json price data? used for easier monitoring
+                update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(fetcher.content)
+
                # Then we assume HTML
                if has_filter_rule:
                    html_content = ""
+
                    for filter_rule in include_filters_rule:
                        # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
                        if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -426,6 +426,13 @@ class watchForm(commonSettingsForm):
        return result


+class SingleExtraProxy(Form):
+    """
+    One name + URL pair describing a single extra proxy.
+
+    Used as the entry type of the `extra_proxies` FieldList on globalSettingsRequestForm.
+    Both fields are individually optional here; the "both or neither" rule is enforced by
+    globalSettingsRequestForm.validate_extra_proxies.
+    """
+
+    # maybe better to set some <script>var..
+    proxy_name = StringField('Name', [validators.Optional()], render_kw={"placeholder": "Name"})
+    proxy_url = StringField('Proxy URL', [validators.Optional()], render_kw={"placeholder": "http://user:pass@...:3128", "size":50})
+    # @todo do the validation here instead
+
 # datastore.data['settings']['requests']..
 class globalSettingsRequestForm(Form):
    time_between_check = FormField(TimeBetweenCheckForm)
@@ -433,6 +440,15 @@ class globalSettingsRequestForm(Form):
    jitter_seconds = IntegerField('Random jitter seconds ± check',
                                  render_kw={"style": "width: 5em;"},
                                  validators=[validators.NumberRange(min=0, message="Should contain zero or more seconds")])
+    extra_proxies = FieldList(FormField(SingleExtraProxy), min_entries=5)
+
+    def validate_extra_proxies(self, extra_validators=None):
+        """
+        Require every extra-proxy row to be either completely empty or to have both a name and a URL.
+
+        On the first half-filled row an error is appended to `self.extra_proxies.errors` and
+        False is returned; when every row is acceptable the method falls through and returns None.
+
+        :param extra_validators: not used by this method
+        """
+        for e in self.data['extra_proxies']:
+            if e.get('proxy_name') or e.get('proxy_url'):
+                # `or ''` instead of a .get() default: a key that is present but None would
+                # otherwise reach .strip() and raise AttributeError instead of a form error.
+                proxy_name = (e.get('proxy_name') or '').strip()
+                proxy_url = (e.get('proxy_url') or '').strip()
+                if not proxy_name or not proxy_url:
+                    self.extra_proxies.errors.append('Both a name, and a Proxy URL is required.')
+                    return False
+

 # datastore.data['settings']['application']..
 class globalSettingsApplicationForm(commonSettingsForm):
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -10,6 +10,10 @@ import re
 # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
 TEXT_FILTER_LIST_LINE_SUFFIX = "<br/>"

+# 'price' , 'lowPrice', 'highPrice' are usually under here
+# all of those may or may not appear on different websites
+LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
+
 class JSONNotFound(ValueError):
    def __init__(self, msg):
        ValueError.__init__(self, msg)
@@ -127,8 +131,10 @@ def _get_stripped_text_from_json_match(match):

    return stripped_text_from_html

-def extract_json_as_string(content, json_filter):
-
+# content - json
+# json_filter - ie json:$..price
+# ensure_is_ldjson_info_type - str "product", optional, "@type == product" (I dont know how to do that as a json selector)
+def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
    stripped_text_from_html = False

    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
@@ -139,7 +145,12 @@ def extract_json_as_string(content, json_filter):
        # Foreach <script json></script> blob.. just return the first that matches json_filter
        s = []
        soup = BeautifulSoup(content, 'html.parser')
-        bs_result = soup.findAll('script')
+
+        if ensure_is_ldjson_info_type:
+            bs_result = soup.findAll('script', {"type": "application/ld+json"})
+        else:
+            bs_result = soup.findAll('script')
+

        if not bs_result:
            raise JSONNotFound("No parsable JSON found in this document")
@@ -156,7 +167,14 @@ def extract_json_as_string(content, json_filter):
                continue
            else:
                stripped_text_from_html = _parse_json(json_data, json_filter)
-                if stripped_text_from_html:
+                if ensure_is_ldjson_info_type:
+                    # Could sometimes be list, string or something else random
+                    if isinstance(json_data, dict):
+                        # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
+                        # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
+                        if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html:
+                            break
+                elif stripped_text_from_html:
                    break

    if not stripped_text_from_html:
@@ -243,6 +261,18 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:

    return text_content

+
+# Does LD+JSON exist with a @type=='product' and something matching the offers selector?
+def has_ldjson_product_info(content):
+    """
+    Report whether `content` yields anything for LD_JSON_PRODUCT_OFFER_SELECTOR.
+
+    :param content: document text handed on to extract_json_as_string()
+    :return: True when the "product" typed lookup produced a truthy result, False when the
+             result was empty or JSONNotFound was raised.
+    """
+    try:
+        found = extract_json_as_string(
+            content=content,
+            json_filter=LD_JSON_PRODUCT_OFFER_SELECTOR,
+            ensure_is_ldjson_info_type="product",
+        )
+    except JSONNotFound:
+        # Totally fine - no matching JSON just means there is no price data
+        return False
+    else:
+        return bool(found)
+
+
 def workarounds_for_obfuscations(content):
    """
    Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis
--- a/changedetectionio/model/App.py
+++ b/changedetectionio/model/App.py
@@ -15,11 +15,12 @@ class model(dict):
                'headers': {
                },
                'requests': {
-                    'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")),  # Default 45 seconds
-                    'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None},
+                    'extra_proxies': [], # Configurable extra proxies via the UI
                    'jitter_seconds': 0,
+                    'proxy': None, # Preferred proxy connection
+                    'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None},
+                    'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")),  # Default 45 seconds
                    'workers': int(getenv("DEFAULT_SETTINGS_REQUESTS_WORKERS", "10")),  # Number of threads, lower is better for slow connections
-                    'proxy': None # Preferred proxy connection
                },
                'application': {
                    'api_access_token_enabled': True,
@@ -27,7 +28,6 @@ class model(dict):
                    'base_url' : None,
                    'extract_title_as_title': False,
                    'empty_pages_are_a_change': False,
-                    'css_dark_mode': False,
                    'fetch_backend': getenv("DEFAULT_FETCH_BACKEND", "html_requests"),
                    'filter_failure_notification_threshold_attempts': _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT,
                    'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -14,49 +14,52 @@ from changedetectionio.notification import (

 class model(dict):
    __newest_history_key = None
-    __history_n=0
+    __history_n = 0
    __base_config = {
-            #'history': {},  # Dict of timestamp and output stripped filename (removed)
-            #'newest_history_key': 0, (removed, taken from history.txt index)
-            'body': None,
-            'check_unique_lines': False, # On change-detected, compare against all history if its something new
-            'check_count': 0,
-            'consecutive_filter_failures': 0, # Every time the CSS/xPath filter cannot be located, reset when all is fine.
-            'extract_text': [],  # Extract text by regex after filters
-            'extract_title_as_title': False,
-            'fetch_backend': None,
-            'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
-            'headers': {},  # Extra headers to send
-            'ignore_text': [],  # List of text to ignore when calculating the comparison checksum
-            'include_filters': [],
-            'last_checked': 0,
-            'last_error': False,
-            'last_viewed': 0,  # history key value of the last viewed via the [diff] link
-            'method': 'GET',
-             # Custom notification content
-            'notification_body': None,
-            'notification_format': default_notification_format_for_watch,
-            'notification_muted': False,
-            'notification_title': None,
-            'notification_screenshot': False, # Include the latest screenshot if available and supported by the apprise URL
-            'notification_urls': [],  # List of URLs to add to the notification Queue (Usually AppRise)
-            'paused': False,
-            'previous_md5': False,
-            'proxy': None, # Preferred proxy connection
-            'subtractive_selectors': [],
-            'tag': None,
-            'text_should_not_be_present': [], # Text that should not present
-            # Re #110, so then if this is set to None, we know to use the default value instead
-            # Requires setting to None on submit if it's the same as the default
-            # Should be all None by default, so we use the system default in this case.
-            'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
-            'title': None,
-            'trigger_text': [],  # List of text or regex to wait for until a change is detected
-            'url': None,
-            'uuid': str(uuid.uuid4()),
-            'webdriver_delay': None,
-            'webdriver_js_execute_code': None, # Run before change-detection
-        }
+        # 'history': {},  # Dict of timestamp and output stripped filename (removed)
+        # 'newest_history_key': 0, (removed, taken from history.txt index)
+        'body': None,
+        'check_unique_lines': False,  # On change-detected, compare against all history if its something new
+        'check_count': 0,
+        'consecutive_filter_failures': 0,  # Every time the CSS/xPath filter cannot be located, reset when all is fine.
+        'extract_text': [],  # Extract text by regex after filters
+        'extract_title_as_title': False,
+        'fetch_backend': None,
+        'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
+        'has_ldjson_price_data': None,
+        'track_ldjson_price_data': None,
+        'headers': {},  # Extra headers to send
+        'ignore_text': [],  # List of text to ignore when calculating the comparison checksum
+        'include_filters': [],
+        'last_checked': 0,
+        'last_error': False,
+        'last_viewed': 0,  # history key value of the last viewed via the [diff] link
+        'method': 'GET',
+        # Custom notification content
+        'notification_body': None,
+        'notification_format': default_notification_format_for_watch,
+        'notification_muted': False,
+        'notification_title': None,
+        'notification_screenshot': False,  # Include the latest screenshot if available and supported by the apprise URL
+        'notification_urls': [],  # List of URLs to add to the notification Queue (Usually AppRise)
+        'paused': False,
+        'previous_md5': False,
+        'previous_md5_before_filters': False,  # Used for skipping changedetection entirely
+        'proxy': None,  # Preferred proxy connection
+        'subtractive_selectors': [],
+        'tag': None,
+        'text_should_not_be_present': [],  # Text that should not present
+        # Re #110, so then if this is set to None, we know to use the default value instead
+        # Requires setting to None on submit if it's the same as the default
+        # Should be all None by default, so we use the system default in this case.
+        'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None},
+        'title': None,
+        'trigger_text': [],  # List of text or regex to wait for until a change is detected
+        'url': None,
+        'uuid': str(uuid.uuid4()),
+        'webdriver_delay': None,
+        'webdriver_js_execute_code': None,  # Run before change-detection
+    }
    jitter_seconds = 0

    def __init__(self, *arg, **kw):
@@ -111,6 +114,24 @@ class model(dict):

        return ready_url

+    @property
+    def get_fetch_backend(self):
+        """
+        Name of the fetcher backend to use for this watch.
+
+        :return: 'html_requests' whenever `is_pdf` is true, otherwise the stored `fetch_backend` value.
+        """
+        # PDFs are forced onto the plain requests fetcher: chrome/playwright wont render the PDF
+        # in the browser, so we just fetch it and use pdf2html to see the text.
+        # Maybe also if is_image etc?
+        return 'html_requests' if self.is_pdf else self.get('fetch_backend')
+
+    @property
+    def is_pdf(self):
+        """
+        Heuristic check for whether this watch points at a PDF.
+
+        :return: True when the URL contains '.pdf' or the stored `content_type` contains 'pdf'
+                 (both compared case-insensitively), otherwise False.
+        """
+        # `or ''` instead of a .get() default: the base config declares 'url': None, and a key
+        # that is present but None would otherwise reach .lower() and raise AttributeError.
+        url = self.get('url') or ''
+        # content_type field is set in the future (it may not be present yet)
+        content_type = self.get('content_type') or ''
+        return '.pdf' in url.lower() or 'pdf' in content_type.lower()
+
    @property
    def label(self):
        # Used for sorting
--- a/changedetectionio/queuedWatchMetaData.py
+++ b/changedetectionio/queuedWatchMetaData.py
@@ -0,0 +1,10 @@
+from dataclasses import dataclass, field
+from typing import Any
+
+# So that we can queue some metadata in `item`
+# https://docs.python.org/3/library/queue.html#queue.PriorityQueue
+#
+# order=True makes the dataclass generate the comparison methods; because `item` is declared
+# with compare=False, instances are ordered by `priority` alone and the payload is never compared.
+@dataclass(order=True)
+class PrioritizedItem:
+    # Sort key
+    priority: int
+    # Arbitrary payload carried alongside the priority, excluded from comparisons
+    item: Any=field(compare=False)
--- a/changedetectionio/res/xpath_element_scraper.js
+++ b/changedetectionio/res/xpath_element_scraper.js
@@ -1,3 +1,6 @@
+// Copyright (C) 2021 Leigh Morresi (dgtlmoon@gmail.com)
+// All rights reserved.
+
 // @file Scrape the page looking for elements of concern (%ELEMENTS%)
 // http://matatk.agrip.org.uk/tests/position-and-width/
 // https://stackoverflow.com/questions/26813480/when-is-element-getboundingclientrect-guaranteed-to-be-updated-accurate
@@ -81,8 +84,16 @@ var bbox;
 for (var i = 0; i < elements.length; i++) {
    bbox = elements[i].getBoundingClientRect();

-    // Forget really small ones
-    if (bbox['width'] < 10 && bbox['height'] < 10) {
+    // Exclude items that are not interactable or visible
+    if(elements[i].style.opacity === "0") {
+        continue
+    }
+    if(elements[i].style.display === "none" || elements[i].style.pointerEvents === "none" ) {
+        continue
+    }
+
+    // Skip really small ones, and where width or height ==0
+    if (bbox['width'] * bbox['height'] < 100) {
        continue;
    }

@@ -138,7 +149,6 @@ for (var i = 0; i < elements.length; i++) {

 }

-
 // Inject the current one set in the include_filters, which may be a CSS rule
 // used for displaying the current one in VisualSelector, where its not one we generated.
 if (include_filters.length) {
@@ -166,10 +176,23 @@ if (include_filters.length) {
        }

        if (q) {
-            bbox = q.getBoundingClientRect();
-            console.log("xpath_element_scraper: Got filter element, scroll from top was "+scroll_y)
-        } else {
-            console.log("xpath_element_scraper: filter element "+f+" was not found");
+            // #1231 - In the case an XPath attribute filter is applied, the match is not an element itself,
+            // so we have to traverse up and find the element that owns it.
+            // Note: getBoundingClientRect is inherited from the prototype, so hasOwnProperty() would be
+            // false for ordinary elements too - test that the method is callable instead.
+            if (typeof q.getBoundingClientRect === 'function') {
+                bbox = q.getBoundingClientRect();
+                console.log("xpath_element_scraper: Got filter element, scroll from top was " + scroll_y)
+            } else {
+                try {
+                    // Try and see if we can find its ownerElement
+                    bbox = q.ownerElement.getBoundingClientRect();
+                    console.log("xpath_element_scraper: Got filter by ownerElement element, scroll from top was " + scroll_y)
+                } catch (e) {
+                    console.log("xpath_element_scraper: error looking up ownerElement")
+                }
+            }
+        }
+        
+        if(!q) {
+            console.log("xpath_element_scraper: filter element " + f + " was not found");
        }

        if (bbox && bbox['width'] > 0 && bbox['height'] > 0) {
@@ -184,5 +207,9 @@ if (include_filters.length) {
    }
 }

+// Sort the elements so we find the smallest one first, in other words, we find the smallest one matching in that area
+// so that we dont select the wrapping element by mistake and be unable to select what we want.
+// The comparator returns the area difference so that equal areas compare as 0 (a consistent ordering).
+size_pos.sort((a, b) => (a.width * a.height) - (b.width * b.height));
+
 // Window.width required for proper scaling in the frontend
 return {'size_pos': size_pos, 'browser_width': window.innerWidth};
--- a/changedetectionio/run_all_tests.sh
+++ b/changedetectionio/run_all_tests.sh
@@ -1,104 +0,0 @@
-#!/bin/bash
-
-
-# live_server will throw errors even with live_server_scope=function if I have the live_server setup in different functions
-# and I like to restart the server for each test (and have the test cleanup after each test)
-# merge request welcome :)
-
-
-# exit when any command fails
-set -e
-
-SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-
-find tests/test_*py -type f|while read test_name
-do
-  echo "TEST RUNNING $test_name"
-  pytest $test_name
-done
-
-echo "RUNNING WITH BASE_URL SET"
-
-# Now re-run some tests with BASE_URL enabled
-# Re #65 - Ability to include a link back to the installation, in the notification.
-export BASE_URL="https://really-unique-domain.io"
-pytest tests/test_notification.py
-
-
-# Re-run with HIDE_REFERER set - could affect login
-export HIDE_REFERER=True
-pytest tests/test_access_control.py
-
-
-# Now for the selenium and playwright/browserless fetchers
-# Note - this is not UI functional tests - just checking that each one can fetch the content
-
-echo "TESTING WEBDRIVER FETCH > SELENIUM/WEBDRIVER..."
-docker run -d --name $$-test_selenium  -p 4444:4444 --rm --shm-size="2g"  selenium/standalone-chrome-debug:3.141.59
-# takes a while to spin up
-sleep 5
-export WEBDRIVER_URL=http://localhost:4444/wd/hub
-pytest tests/fetchers/test_content.py
-pytest tests/test_errorhandling.py
-unset WEBDRIVER_URL
-docker kill $$-test_selenium
-
-echo "TESTING WEBDRIVER FETCH > PLAYWRIGHT/BROWSERLESS..."
-# Not all platforms support playwright (not ARM/rPI), so it's not packaged in requirements.txt
-PLAYWRIGHT_VERSION=$(grep -i -E "RUN pip install.+" "$SCRIPT_DIR/../Dockerfile" | grep --only-matching -i -E "playwright[=><~+]+[0-9\.]+")
-echo "using $PLAYWRIGHT_VERSION"
-pip3 install "$PLAYWRIGHT_VERSION"
-docker run -d --name $$-test_browserless -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm  -p 3000:3000  --shm-size="2g"  browserless/chrome:1.53-chrome-stable
-# takes a while to spin up
-sleep 5
-export PLAYWRIGHT_DRIVER_URL=ws://127.0.0.1:3000
-pytest tests/fetchers/test_content.py
-pytest tests/test_errorhandling.py
-pytest tests/visualselector/test_fetch_data.py
-
-unset PLAYWRIGHT_DRIVER_URL
-docker kill $$-test_browserless
-
-# Test proxy list handling, starting two squids on different ports
-# Each squid adds a different header to the response, which is the main thing we test for.
-docker run -d --name $$-squid-one --rm -v `pwd`/tests/proxy_list/squid.conf:/etc/squid/conf.d/debian.conf -p 3128:3128 ubuntu/squid:4.13-21.10_edge
-docker run -d --name $$-squid-two --rm -v `pwd`/tests/proxy_list/squid.conf:/etc/squid/conf.d/debian.conf -p 3129:3128 ubuntu/squid:4.13-21.10_edge
-
-
-# So, basic HTTP as env var test
-export HTTP_PROXY=http://localhost:3128
-export HTTPS_PROXY=http://localhost:3128
-pytest tests/proxy_list/test_proxy.py
-docker logs $$-squid-one 2>/dev/null|grep one.changedetection.io
-if [ $? -ne 0 ]
-then
-  echo "Did not see a request to one.changedetection.io in the squid logs (while checking env vars HTTP_PROXY/HTTPS_PROXY)"
-fi
-unset HTTP_PROXY
-unset HTTPS_PROXY
-
-
-# 2nd test actually choose the preferred proxy from proxies.json
-cp tests/proxy_list/proxies.json-example ./test-datastore/proxies.json
-# Makes a watch use a preferred proxy
-pytest tests/proxy_list/test_multiple_proxy.py
-
-# Should be a request in the default "first" squid
-docker logs $$-squid-one 2>/dev/null|grep chosen.changedetection.io
-if [ $? -ne 0 ]
-then
-  echo "Did not see a request to chosen.changedetection.io in the squid logs (while checking preferred proxy)"
-fi
-
-# And one in the 'second' squid (user selects this as preferred)
-docker logs $$-squid-two 2>/dev/null|grep chosen.changedetection.io
-if [ $? -ne 0 ]
-then
-  echo "Did not see a request to chosen.changedetection.io in the squid logs (while checking preferred proxy)"
-fi
-
-# @todo - test system override proxy selection and watch defaults, setup a 3rd squid?
-docker kill $$-squid-one
-docker kill $$-squid-two
-
-
--- a/changedetectionio/run_basic_tests.sh
+++ b/changedetectionio/run_basic_tests.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Runs every tests/test_*py file in its own pytest process, then re-runs selected tests with extra env vars set.
+# live_server will throw errors even with live_server_scope=function if I have the live_server setup in different functions
+# and I like to restart the server for each test (and have the test cleanup after each test)
+# merge request welcome :)
+
+
+# exit when any command fails
+set -e
+# NOTE(review): SCRIPT_DIR is not used below - test paths are resolved relative to the current directory
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+find tests/test_*py -type f|while IFS= read -r test_name
+do
+  echo "TEST RUNNING $test_name"
+  pytest "$test_name"
+done
+
+echo "RUNNING WITH BASE_URL SET"
+
+# Now re-run some tests with BASE_URL enabled
+# Re #65 - Ability to include a link back to the installation, in the notification.
+export BASE_URL="https://really-unique-domain.io"
+pytest tests/test_notification.py
+
+
+# Re-run with HIDE_REFERER set - could affect login
+export HIDE_REFERER=True
+pytest tests/test_access_control.py
--- a/changedetectionio/run_proxy_tests.sh
+++ b/changedetectionio/run_proxy_tests.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+# exit when any command fails (the log checks below use `if !` so their failure message is still printed)
+set -e
+
+# Test proxy list handling, starting two squids with different hostnames on the same docker network
+# Each squid adds a different header to the response, which is the main thing we test for.
+docker run --network changedet-network -d --name squid-one --hostname squid-one --rm -v `pwd`/tests/proxy_list/squid.conf:/etc/squid/conf.d/debian.conf ubuntu/squid:4.13-21.10_edge
+docker run --network changedet-network -d --name squid-two --hostname squid-two --rm -v `pwd`/tests/proxy_list/squid.conf:/etc/squid/conf.d/debian.conf ubuntu/squid:4.13-21.10_edge
+
+# Used for configuring a custom proxy URL via the UI
+docker run --network changedet-network -d \
+  --name squid-custom \
+  --hostname squid-custom \
+  --rm \
+  -v `pwd`/tests/proxy_list/squid-auth.conf:/etc/squid/conf.d/debian.conf \
+  -v `pwd`/tests/proxy_list/squid-passwords.txt:/etc/squid3/passwords \
+  ubuntu/squid:4.13-21.10_edge
+
+
+## 2nd test actually choose the preferred proxy from proxies.json
+
+docker run --network changedet-network \
+  -v `pwd`/tests/proxy_list/proxies.json-example:/app/changedetectionio/test-datastore/proxies.json \
+  test-changedetectionio \
+  bash -c 'cd changedetectionio && pytest tests/proxy_list/test_multiple_proxy.py'
+
+
+## Should be a request in the default "first" squid
+if ! docker logs squid-one 2>/dev/null \
+  | grep chosen.changedetection.io
+then
+  echo "Did not see a request to chosen.changedetection.io in the squid logs (while checking preferred proxy - squid one)"
+  exit 1
+fi
+
+# And one in the 'second' squid (user selects this as preferred)
+if ! docker logs squid-two 2>/dev/null \
+  | grep chosen.changedetection.io
+then
+  echo "Did not see a request to chosen.changedetection.io in the squid logs (while checking preferred proxy - squid two)"
+  exit 1
+fi
+
+
+# Test the UI configurable proxies
+
+docker run --network changedet-network \
+  test-changedetectionio \
+  bash -c 'cd changedetectionio && pytest tests/proxy_list/test_select_custom_proxy.py'
+
+
+# Should see a request for one.changedetection.io in there
+if ! docker logs squid-custom 2>/dev/null \
+  | grep "TCP_TUNNEL.200.*changedetection.io"
+then
+  echo "Did not see a valid request to changedetection.io in the squid logs (while checking UI configured proxy - squid custom)"
+  exit 1
+fi
+
+docker kill squid-one squid-two squid-custom
--- a/changedetectionio/static/images/pdf-icon.svg
+++ b/changedetectionio/static/images/pdf-icon.svg
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg xmlns="http://www.w3.org/2000/svg" width="75.320129mm" height="92.604164mm" viewBox="0 0 75.320129 92.604164">
+  <g transform="translate(53.548057 -183.975276) scale(1.4843)">
+    <path fill="#ff2116" d="M-29.632812 123.94727c-3.551967 0-6.44336 2.89347-6.44336 6.44531v49.49804c0 3.55185 2.891393 6.44532 6.44336 6.44532H8.2167969c3.5519661 0 6.4433591-2.89335 6.4433591-6.44532v-40.70117s.101353-1.19181-.416015-2.35156c-.484969-1.08711-1.275391-1.84375-1.275391-1.84375a1.0584391 1.0584391 0 0 0-.0059-.008l-9.3906254-9.21094a1.0584391 1.0584391 0 0 0-.015625-.0156s-.8017392-.76344-1.9902344-1.27344c-1.39939552-.6005-2.8417968-.53711-2.8417968-.53711l.021484-.002z" color="#000" font-family="sans-serif" overflow="visible" paint-order="markers fill stroke" style="line-height:normal;font-variant-ligatures:normal;font-variant-position:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-alternates:normal;font-feature-settings:normal;text-indent:0;text-align:start;text-decoration-line:none;text-decoration-style:solid;text-decoration-color:#000000;text-transform:none;text-orientation:mixed;white-space:normal;shape-padding:0;isolation:auto;mix-blend-mode:normal;solid-color:#000000;solid-opacity:1"/>
+    <path fill="#f5f5f5" d="M-29.632812 126.06445h28.3789058a1.0584391 1.0584391 0 0 0 .021484 0s1.13480448.011 1.96484378.36719c.79889772.34282 1.36536982.86176 1.36914062.86524.0000125.00001.00391.004.00391.004l9.3671868 9.18945s.564354.59582.837891 1.20899c.220779.49491.234375 1.40039.234375 1.40039a1.0584391 1.0584391 0 0 0-.002.0449v40.74609c0 2.41592-1.910258 4.32813-4.3261717 4.32813H-29.632812c-2.415914 0-4.326172-1.91209-4.326172-4.32813v-49.49804c0-2.41603 1.910258-4.32813 4.326172-4.32813z" color="#000" font-family="sans-serif" overflow="visible" paint-order="markers fill stroke" style="line-height:normal;font-variant-ligatures:normal;font-variant-position:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-alternates:normal;font-feature-settings:normal;text-indent:0;text-align:start;text-decoration-line:none;text-decoration-style:solid;text-decoration-color:#000000;text-transform:none;text-orientation:mixed;white-space:normal;shape-padding:0;isolation:auto;mix-blend-mode:normal;solid-color:#000000;solid-opacity:1"/>
+    <path fill="#ff2116" d="M-23.40766 161.09299c-1.45669-1.45669.11934-3.45839 4.39648-5.58397l2.69124-1.33743 1.04845-2.29399c.57665-1.26169 1.43729-3.32036 1.91254-4.5748l.8641-2.28082-.59546-1.68793c-.73217-2.07547-.99326-5.19438-.52872-6.31588.62923-1.51909 2.69029-1.36323 3.50626.26515.63727 1.27176.57212 3.57488-.18329 6.47946l-.6193 2.38125.5455.92604c.30003.50932 1.1764 1.71867 1.9475 2.68743l1.44924 1.80272 1.8033728-.23533c5.72900399-.74758 7.6912472.523 7.6912472 2.34476 0 2.29921-4.4984914 2.48899-8.2760865-.16423-.8499666-.59698-1.4336605-1.19001-1.4336605-1.19001s-2.3665326.48178-3.531704.79583c-1.202707.32417-1.80274.52719-3.564509 1.12186 0 0-.61814.89767-1.02094 1.55026-1.49858 2.4279-3.24833 4.43998-4.49793 5.1723-1.3991.81993-2.86584.87582-3.60433.13733zm2.28605-.81668c.81883-.50607 2.47616-2.46625 3.62341-4.28553l.46449-.73658-2.11497 1.06339c-3.26655 1.64239-4.76093 3.19033-3.98386 4.12664.43653.52598.95874.48237 2.01093-.16792zm21.21809-5.95578c.80089-.56097.68463-1.69142-.22082-2.1472-.70466-.35471-1.2726074-.42759-3.1031574-.40057-1.1249.0767-2.9337647.3034-3.2403347.37237 0 0 .993716.68678 1.434896.93922.58731.33544 2.0145161.95811 3.0565161 1.27706 1.02785.31461 1.6224.28144 2.0729-.0409zm-8.53152-3.54594c-.4847-.50952-1.30889-1.57296-1.83152-2.3632-.68353-.89643-1.02629-1.52887-1.02629-1.52887s-.4996 1.60694-.90948 2.57394l-1.27876 3.16076-.37075.71695s1.971043-.64627 2.97389-.90822c1.0621668-.27744 3.21787-.70134 3.21787-.70134zm-2.74938-11.02573c.12363-1.0375.1761-2.07346-.15724-2.59587-.9246-1.01077-2.04057-.16787-1.85154 2.23517.0636.8084.26443 2.19033.53292 3.04209l.48817 1.54863.34358-1.16638c.18897-.64151.47882-2.02015.64411-3.06364z"/>
+    <path fill="#2c2c2c" d="M-20.930423 167.83862h2.364986q1.133514 0 1.840213.2169.706698.20991 1.189489.9446.482795.72769.482795 1.75625 0 .94459-.391832 1.6233-.391833.67871-1.056548.97958-.65772.30087-2.02913.30087h-.818651v3.72941h-1.581322zm1.581322 1.22447v3.33058h.783664q1.049552 0 1.44838-.39184.405826-.39183.405826-1.27345 0-.65772-.265887-1.06355-.265884-.41282-.587747-.50378-.314866-.098-1.000572-.098zm5.50664-1.22447h2.148082q1.560333 0 2.4909318.55276.9375993.55276 1.4133973 1.6443.482791 1.09153.482791 2.42096 0 1.3994-.4338151 2.49793-.4268149 1.09153-1.3154348 1.76324-.8816233.67172-2.5189212.67172h-2.267031zm1.581326 1.26645v7.018h.657715q1.378411 0 2.001144-.9516.6227329-.95858.6227329-2.5539 0-3.5125-2.6238769-3.5125zm6.4722254-1.26645h5.30372941v1.26645H-4.2075842v2.85478h2.9807225v1.26646h-2.9807225v4.16322h-1.5813254z" font-family="Franklin Gothic Medium Cond" letter-spacing="0" style="line-height:125%;-inkscape-font-specification:'Franklin Gothic Medium Cond'" word-spacing="4.26000023"/>
+  </g>
+</svg>
--- a/changedetectionio/static/images/price-tag-icon.svg
+++ b/changedetectionio/static/images/price-tag-icon.svg
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg width="83.39" height="89.648" enable-background="new 0 0 122.406 122.881" version="1.1" viewBox="0 0 83.39 89.648" xml:space="preserve" xmlns="http://www.w3.org/2000/svg"><g transform="translate(5e-4 -33.234)"><path d="m44.239 42.946-39.111 39.896 34.908 34.91 39.09-39.876-1.149-34.931zm-0.91791 42.273c0.979-0.979 1.507-1.99 1.577-3.027 0.077-1.043-0.248-2.424-0.967-4.135-0.725-1.717-1.348-3.346-1.87-4.885s-0.814-3.014-0.897-4.432c-0.07-1.42 0.134-2.768 0.624-4.045 0.477-1.279 1.348-2.545 2.607-3.804 2.099-2.099 4.535-3.123 7.314-3.065 2.773 0.063 5.457 1.158 8.04 3.294l2.881 3.034c1.946 2.607 2.799 5.33 2.557 8.166-0.235 2.83-1.532 5.426-3.893 7.785l-6.296-6.297c1.291-1.291 2.035-2.531 2.238-3.727 0.191-1.197-0.165-2.252-1.081-3.168-0.821-0.82-1.717-1.195-2.69-1.139-0.967 0.064-1.908 0.547-2.817 1.457-0.922 0.922-1.393 1.914-1.412 2.977s0.306 2.416 0.973 4.064c0.661 1.652 1.24 3.25 1.736 4.801 0.496 1.553 0.782 3.035 0.858 4.445 0.076 1.426-0.127 2.787-0.591 4.104-0.477 1.316-1.336 2.596-2.588 3.848-2.125 2.125-4.522 3.186-7.212 3.18s-5.311-1.063-7.855-3.16l-3.747 3.746-2.964-2.965 3.766-3.764c-2.423-2.996-3.568-5.998-3.447-9.02 0.127-3.014 1.476-5.813 4.045-8.383l6.278 6.277c-1.412 1.412-2.175 2.799-2.277 4.16-0.108 1.367 0.414 2.627 1.571 3.783 0.839 0.84 1.755 1.26 2.741 1.242 0.985-0.017 1.92-0.47 2.798-1.347zm21.127-46.435h17.457c-0.0269 2.2368 0.69936 16.025 0.69936 16.025l0.785 23.858c0.019 0.609-0.221 1.164-0.619 1.564l5e-3 4e-3 -41.236 42.022c-0.82213 0.8378-2.175 0.83-3.004 0l-37.913-37.91c-0.83-0.83-0.83-2.176 0-3.006l41.236-42.021c0.39287-0.42671 1.502-0.53568 1.502-0.53568zm18.011 11.59c-59.392-29.687-29.696-14.843 0 0z"/></g></svg>
--- a/changedetectionio/static/js/visual-selector.js
+++ b/changedetectionio/static/js/visual-selector.js
@@ -1,4 +1,5 @@
-// Horrible proof of concept code :)
+// Copyright (C) 2021 Leigh Morresi (dgtlmoon@gmail.com)
+// All rights reserved.
 // yes - this is really a hack, if you are a front-ender and want to help, please get in touch!

 $(document).ready(function () {
@@ -177,9 +178,10 @@ $(document).ready(function () {
            // Basically, find the most 'deepest'
            var found = 0;
            ctx.fillStyle = 'rgba(205,0,0,0.35)';
-            for (var i = selector_data['size_pos'].length; i !== 0; i--) {
+            // Will be sorted by smallest width*height first; stop before .length so 'sel' is never undefined
+            for (var i = 0; i < selector_data['size_pos'].length; i++) {
                // draw all of them? let them choose somehow?
-                var sel = selector_data['size_pos'][i - 1];
+                var sel = selector_data['size_pos'][i];
                // If we are in a bounding-box
                if (e.offsetY > sel.top * y_scale && e.offsetY < sel.top * y_scale + sel.height * y_scale
                    &&
@@ -195,7 +197,7 @@ $(document).ready(function () {
                    // no need to keep digging
                    // @todo or, O to go out/up, I to go in
                    // or double click to go up/out the selector?
-                    current_selected_i = i - 1;
+                    current_selected_i = i;
                    found += 1;
                    break;
                }
--- a/changedetectionio/static/styles/.dockerignore
+++ b/changedetectionio/static/styles/.dockerignore
@@ -0,0 +1,3 @@
+node_modules
+package-lock.json
+
--- a/changedetectionio/static/styles/scss/parts/_extra_proxies.scss
+++ b/changedetectionio/static/styles/scss/parts/_extra_proxies.scss
@@ -0,0 +1,17 @@
+// Layout tweaks for the list with id `requests-extra_proxies`:
+// hides each item's own label and lays the rows of each entry's table out inline.
+ul#requests-extra_proxies {
+  list-style: none;
+
+  /* tidy up the table to look more "inline" */
+  li > label {
+    display: none;
+  }
+
+  /* each proxy entry is a `table` */
+  table tr {
+    display: inline;
+  }
+}
+
--- a/changedetectionio/static/styles/scss/styles.scss
+++ b/changedetectionio/static/styles/scss/styles.scss
@@ -2,10 +2,11 @@
 * -- BASE STYLES --
 */

-@import "parts/_variables";
-@import "parts/_spinners";
-@import "parts/_browser-steps";
 @import "parts/_arrows";
+@import "parts/_browser-steps";
+@import "parts/_extra_proxies";
+@import "parts/_spinners";
+@import "parts/_variables";

 body {
  color: var(--color-text);
@@ -22,6 +23,13 @@ body {
  width: 1px;
 }

+// Row icons like chrome, pdf, share, etc
+.status-icon {
+  display: inline-block;
+  height: 1rem;
+  vertical-align: middle;
+}
+
 .pure-table-even {
  background: var(--color-background);
 }
@@ -1009,3 +1017,30 @@ ul {
  border-radius: 5px;
  color: var(--color-warning);
 }
+
+/* automatic price following helpers */
+.tracking-ldjson-price-data {
+  background-color: var(--color-background-button-green);
+  color: #000;
+  padding: 3px;
+  border-radius: 3px;
+  white-space: nowrap;
+}
+
+.ldjson-price-track-offer {
+  font-weight: bold;
+  font-style: italic;
+  // declarations are kept ahead of the nested rule, the same order they are emitted in styles.css
+  a.pure-button {
+    border-radius: 3px;
+    padding: 3px;
+    background-color: var(--color-background-button-green);
+  }
+}
+
+.price-follow-tag-icon {
+  display: inline-block;
+  height: 0.8rem;
+  vertical-align: middle;
+}
+
--- a/changedetectionio/static/styles/styles.css
+++ b/changedetectionio/static/styles/styles.css
@@ -1,6 +1,139 @@
 /*
 * -- BASE STYLES --
 */
+.arrow {
+  border: solid #1b98f8;
+  border-width: 0 2px 2px 0;
+  display: inline-block;
+  padding: 3px; }
+  .arrow.right {
+    transform: rotate(-45deg);
+    -webkit-transform: rotate(-45deg); }
+  .arrow.left {
+    transform: rotate(135deg);
+    -webkit-transform: rotate(135deg); }
+  .arrow.up, .arrow.asc {
+    transform: rotate(-135deg);
+    -webkit-transform: rotate(-135deg); }
+  .arrow.down, .arrow.desc {
+    transform: rotate(45deg);
+    -webkit-transform: rotate(45deg); }
+
+#browser_steps {
+  /* convert rows to horizontal cells */ }
+  #browser_steps th {
+    display: none; }
+  #browser_steps li {
+    list-style: decimal;
+    padding: 5px; }
+    #browser_steps li:not(:first-child):hover {
+      opacity: 1.0; }
+    #browser_steps li .control {
+      padding-left: 5px;
+      padding-right: 5px; }
+      #browser_steps li .control a {
+        font-size: 70%; }
+    #browser_steps li.empty {
+      padding: 0px;
+      opacity: 0.35; }
+      #browser_steps li.empty .control {
+        display: none; }
+    #browser_steps li:hover {
+      background: #eee; }
+    #browser_steps li > label {
+      display: none; }
+
+#browser-steps-fieldlist {
+  height: 100%;
+  overflow-y: scroll; }
+
+#browser-steps .flex-wrapper {
+  display: flex;
+  flex-flow: row;
+  height: 600px;
+  /*@todo make this dynamic */ }
+
+/*  this is duplicate :( */
+#browsersteps-selector-wrapper {
+  height: 100%;
+  width: 100%;
+  overflow-y: scroll;
+  position: relative;
+  /* nice tall skinny one */ }
+  #browsersteps-selector-wrapper > img {
+    position: absolute;
+    max-width: 100%; }
+  #browsersteps-selector-wrapper > canvas {
+    position: relative;
+    max-width: 100%; }
+    #browsersteps-selector-wrapper > canvas:hover {
+      cursor: pointer; }
+  #browsersteps-selector-wrapper .loader {
+    position: absolute;
+    left: 50%;
+    top: 50%;
+    transform: translate(-50%, -50%);
+    margin-left: -40px;
+    z-index: 100;
+    max-width: 350px;
+    text-align: center; }
+  #browsersteps-selector-wrapper .spinner, #browsersteps-selector-wrapper .spinner:after {
+    width: 80px;
+    height: 80px;
+    font-size: 3px; }
+  #browsersteps-selector-wrapper #browsersteps-click-start {
+    color: var(--color-grey-400); }
+    #browsersteps-selector-wrapper #browsersteps-click-start:hover {
+      cursor: pointer; }
+
+ul#requests-extra_proxies {
+  list-style: none;
+  /* tidy up the table to look more "inline" */
+  /* each proxy entry is a `table` */ }
+  ul#requests-extra_proxies li > label {
+    display: none; }
+  ul#requests-extra_proxies table tr {
+    display: inline; }
+
+/* spinner */
+.spinner,
+.spinner:after {
+  border-radius: 50%;
+  width: 10px;
+  height: 10px; }
+
+.spinner {
+  margin: 0px auto;
+  font-size: 3px;
+  vertical-align: middle;
+  display: inline-block;
+  text-indent: -9999em;
+  border-top: 1.1em solid rgba(38, 104, 237, 0.2);
+  border-right: 1.1em solid rgba(38, 104, 237, 0.2);
+  border-bottom: 1.1em solid rgba(38, 104, 237, 0.2);
+  border-left: 1.1em solid #2668ed;
+  -webkit-transform: translateZ(0);
+  -ms-transform: translateZ(0);
+  transform: translateZ(0);
+  -webkit-animation: load8 1.1s infinite linear;
+  animation: load8 1.1s infinite linear; }
+
+@-webkit-keyframes load8 {
+  0% {
+    -webkit-transform: rotate(0deg);
+    transform: rotate(0deg); }
+  100% {
+    -webkit-transform: rotate(360deg);
+    transform: rotate(360deg); } }
+
+@keyframes load8 {
+  0% {
+    -webkit-transform: rotate(0deg);
+    transform: rotate(0deg); }
+  100% {
+    -webkit-transform: rotate(360deg);
+    transform: rotate(360deg); } }
+
 /**
 * CSS custom properties (aka variables).
 */
@@ -138,130 +271,6 @@ html[data-darkmode="true"] {
    html[data-darkmode="true"] .watch-table .unviewed.error {
      color: var(--color-watch-table-error); }

-/* spinner */
-.spinner,
-.spinner:after {
-  border-radius: 50%;
-  width: 10px;
-  height: 10px; }
-
-.spinner {
-  margin: 0px auto;
-  font-size: 3px;
-  vertical-align: middle;
-  display: inline-block;
-  text-indent: -9999em;
-  border-top: 1.1em solid rgba(38, 104, 237, 0.2);
-  border-right: 1.1em solid rgba(38, 104, 237, 0.2);
-  border-bottom: 1.1em solid rgba(38, 104, 237, 0.2);
-  border-left: 1.1em solid #2668ed;
-  -webkit-transform: translateZ(0);
-  -ms-transform: translateZ(0);
-  transform: translateZ(0);
-  -webkit-animation: load8 1.1s infinite linear;
-  animation: load8 1.1s infinite linear; }
-
-@-webkit-keyframes load8 {
-  0% {
-    -webkit-transform: rotate(0deg);
-    transform: rotate(0deg); }
-  100% {
-    -webkit-transform: rotate(360deg);
-    transform: rotate(360deg); } }
-
-@keyframes load8 {
-  0% {
-    -webkit-transform: rotate(0deg);
-    transform: rotate(0deg); }
-  100% {
-    -webkit-transform: rotate(360deg);
-    transform: rotate(360deg); } }
-
-#browser_steps {
-  /* convert rows to horizontal cells */ }
-  #browser_steps th {
-    display: none; }
-  #browser_steps li {
-    list-style: decimal;
-    padding: 5px; }
-    #browser_steps li:not(:first-child):hover {
-      opacity: 1.0; }
-    #browser_steps li .control {
-      padding-left: 5px;
-      padding-right: 5px; }
-      #browser_steps li .control a {
-        font-size: 70%; }
-    #browser_steps li.empty {
-      padding: 0px;
-      opacity: 0.35; }
-      #browser_steps li.empty .control {
-        display: none; }
-    #browser_steps li:hover {
-      background: #eee; }
-    #browser_steps li > label {
-      display: none; }
-
-#browser-steps-fieldlist {
-  height: 100%;
-  overflow-y: scroll; }
-
-#browser-steps .flex-wrapper {
-  display: flex;
-  flex-flow: row;
-  height: 600px;
-  /*@todo make this dynamic */ }
-
-/*  this is duplicate :( */
-#browsersteps-selector-wrapper {
-  height: 100%;
-  width: 100%;
-  overflow-y: scroll;
-  position: relative;
-  /* nice tall skinny one */ }
-  #browsersteps-selector-wrapper > img {
-    position: absolute;
-    max-width: 100%; }
-  #browsersteps-selector-wrapper > canvas {
-    position: relative;
-    max-width: 100%; }
-    #browsersteps-selector-wrapper > canvas:hover {
-      cursor: pointer; }
-  #browsersteps-selector-wrapper .loader {
-    position: absolute;
-    left: 50%;
-    top: 50%;
-    transform: translate(-50%, -50%);
-    margin-left: -40px;
-    z-index: 100;
-    max-width: 350px;
-    text-align: center; }
-  #browsersteps-selector-wrapper .spinner, #browsersteps-selector-wrapper .spinner:after {
-    width: 80px;
-    height: 80px;
-    font-size: 3px; }
-  #browsersteps-selector-wrapper #browsersteps-click-start {
-    color: var(--color-grey-400); }
-    #browsersteps-selector-wrapper #browsersteps-click-start:hover {
-      cursor: pointer; }
-
-.arrow {
-  border: solid #1b98f8;
-  border-width: 0 2px 2px 0;
-  display: inline-block;
-  padding: 3px; }
-  .arrow.right {
-    transform: rotate(-45deg);
-    -webkit-transform: rotate(-45deg); }
-  .arrow.left {
-    transform: rotate(135deg);
-    -webkit-transform: rotate(135deg); }
-  .arrow.up, .arrow.asc {
-    transform: rotate(-135deg);
-    -webkit-transform: rotate(-135deg); }
-  .arrow.down, .arrow.desc {
-    transform: rotate(45deg);
-    -webkit-transform: rotate(45deg); }
-
 body {
  color: var(--color-text);
  background: var(--color-background-page); }
@@ -275,6 +284,11 @@ body {
  white-space: nowrap;
  width: 1px; }

+.status-icon {
+  display: inline-block;
+  height: 1rem;
+  vertical-align: middle; }
+
 .pure-table-even {
  background: var(--color-background); }

@@ -945,3 +959,24 @@ ul {
    display: inline;
    height: 26px;
    vertical-align: middle; }
+
+/* automatic price following helpers */
+.tracking-ldjson-price-data {
+  background-color: var(--color-background-button-green);
+  color: #000;
+  padding: 3px;
+  border-radius: 3px;
+  white-space: nowrap; }
+
+.ldjson-price-track-offer {
+  font-weight: bold;
+  font-style: italic; }
+  .ldjson-price-track-offer a.pure-button {
+    border-radius: 3px;
+    padding: 3px;
+    background-color: var(--color-background-button-green); }
+
+.price-follow-tag-icon {
+  display: inline-block;
+  height: 0.8rem;
+  vertical-align: middle; }
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@@ -36,7 +36,6 @@ class ChangeDetectionStore:
        self.datastore_path = datastore_path
        self.json_store_path = "{}/url-watches.json".format(self.datastore_path)
        self.needs_write = False
-        self.proxy_list = None
        self.start_time = time.time()
        self.stop_thread = False
        # Base definition for all watchers
@@ -116,11 +115,6 @@ class ChangeDetectionStore:
            secret = secrets.token_hex(16)
            self.__data['settings']['application']['api_access_token'] = secret

-        # Proxy list support - available as a selection in settings when text file is imported
-        proxy_list_file = "{}/proxies.json".format(self.datastore_path)
-        if path.isfile(proxy_list_file):
-            self.import_proxy_list(proxy_list_file)
-
        # Bump the update version by running updates
        self.run_updates()

@@ -250,12 +244,15 @@ class ChangeDetectionStore:
    def clear_watch_history(self, uuid):
        import pathlib

-        self.__data['watching'][uuid].update(
-            {'last_checked': 0,
-             'last_viewed': 0,
-             'previous_md5': False,
-             'last_notification_error': False,
-             'last_error': False})
+        self.__data['watching'][uuid].update({
+                'last_checked': 0,
+                'has_ldjson_price_data': None,
+                'last_error': False,
+                'last_notification_error': False,
+                'last_viewed': 0,
+                'previous_md5': False,
+                'track_ldjson_price_data': None,
+            })

        # JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc
        for item in pathlib.Path(os.path.join(self.datastore_path, uuid)).rglob("*.*"):
@@ -460,10 +457,30 @@ class ChangeDetectionStore:
                    print ("Removing",item)
                    unlink(item)

-    def import_proxy_list(self, filename):
-        with open(filename) as f:
-            self.proxy_list = json.load(f)
-            print ("Registered proxy list", list(self.proxy_list.keys()))
+    @property
+    def proxy_list(self):
+        """
+        Proxies from the datastore's proxies.json plus any 'extra_proxies' from the UI settings, or None when empty.
+        """
+        proxy_list = {}
+        proxy_list_file = os.path.join(self.datastore_path, 'proxies.json')
+
+        # Load from external config file
+        if path.isfile(proxy_list_file):
+            with open(proxy_list_file) as f:
+                proxy_list = json.load(f)
+
+        # Mapping from UI config if available
+        extras = self.data['settings']['requests'].get('extra_proxies')
+        if extras:
+            # The position is part of the key so two UI entries sharing a name do not overwrite each other
+            for i, proxy in enumerate(extras):
+                if proxy.get('proxy_name') and proxy.get('proxy_url'):
+                    k = "ui-" + str(i) + proxy.get('proxy_name')
+                    proxy_list[k] = {'label': proxy.get('proxy_name'), 'url': proxy.get('proxy_url')}
+
+        return proxy_list if len(proxy_list) else None
+


    def get_preferred_proxy_for_watch(self, uuid):
@@ -473,11 +490,10 @@ class ChangeDetectionStore:
        :return: proxy "key" id
        """

-        proxy_id = None
        if self.proxy_list is None:
            return None

-        # If its a valid one
+        # If it's a valid one
        watch = self.data['watching'].get(uuid)

        if watch.get('proxy') and watch.get('proxy') in list(self.proxy_list.keys()):
@@ -490,8 +506,9 @@ class ChangeDetectionStore:
            if self.proxy_list.get(system_proxy_id):
                return system_proxy_id

-        # Fallback - Did not resolve anything, use the first available
-        if system_proxy_id is None:
+
+        # Fallback - Did not resolve anything, or doesn't exist, use the first available
+        if system_proxy_id is None or not self.proxy_list.get(system_proxy_id):
            first_default = list(self.proxy_list)[0]
            return first_default

--- a/changedetectionio/templates/settings.html
+++ b/changedetectionio/templates/settings.html
@@ -21,6 +21,7 @@
            <li class="tab"><a href="#fetching">Fetching</a></li>
            <li class="tab"><a href="#filters">Global Filters</a></li>
            <li class="tab"><a href="#api">API</a></li>
+            <li class="tab"><a href="#proxies">CAPTCHA &amp; Proxies</a></li>
        </ul>
    </div>
    <div class="box-wrap inner">
@@ -170,14 +171,29 @@ nav
                    </div>
                </div>
            </div>
+            <div class="tab-pane-inner" id="proxies">

+                <p><strong>Tip</strong>: You can connect to websites using <a href="https://brightdata.grsm.io/n0r16zf7eivq">BrightData</a> proxies, their service <strong>WebUnlocker</strong> will solve most CAPTCHAs, whilst their <strong>Residential Proxies</strong> may help to avoid CAPTCHA altogether. </p>
+                <p>It may be easier to try <strong>WebUnlocker</strong> first, WebUnlocker also supports country selection.</p>
+                <p>
+                    When you have <a href="https://brightdata.grsm.io/n0r16zf7eivq">registered</a> and enabled the required services, visit the <a href="https://brightdata.com/cp/api_example?">API example page</a>, then select <strong>Python</strong>, set the country you wish to use, then copy+paste the example URL below.<br/>
+                    The Proxy URL with BrightData should start with <code>http://brd-customer...</code>
+                </p>
+
+                <p>When you sign up using <a href="https://brightdata.grsm.io/n0r16zf7eivq">https://brightdata.grsm.io/n0r16zf7eivq</a> BrightData will match any first deposit up to $150</p>
+
+
+                <div class="pure-control-group">
+                    {{ render_field(form.requests.form.extra_proxies) }}
+                    <span class="pure-form-message-inline">"Name" will be used for selecting the proxy in the Watch Edit settings</span>
+                </div>
+            </div>
            <div id="actions">
                <div class="pure-control-group">
                    {{ render_button(form.save_button) }}
                    <a href="{{url_for('index')}}" class="pure-button button-small button-cancel">Back</a>
                    <a href="{{url_for('clear_all_history')}}" class="pure-button button-small button-cancel">Clear Snapshot History</a>
                </div>
-
            </div>
        </form>
    </div>
--- a/changedetectionio/templates/watch-overview.html
+++ b/changedetectionio/templates/watch-overview.html
@@ -32,6 +32,7 @@
        <button class="pure-button button-secondary button-xsmall" style="font-size: 70%"  name="op" value="unpause">UnPause</button>
        <button class="pure-button button-secondary button-xsmall" style="font-size: 70%"  name="op" value="mute">Mute</button>
        <button class="pure-button button-secondary button-xsmall" style="font-size: 70%"  name="op" value="unmute">UnMute</button>
+        <button class="pure-button button-secondary button-xsmall" style="font-size: 70%" name="op" value="recheck">Recheck</button>
        <button class="pure-button button-secondary button-xsmall" style="font-size: 70%" name="op" value="notification-default">Use default notification</button>
        <button class="pure-button button-secondary button-xsmall" style="background: #dd4242; font-size: 70%" name="op" value="delete">Delete</button>
    </div>
@@ -88,16 +89,31 @@
                </td>
                <td class="title-col inline">{{watch.title if watch.title is not none and watch.title|length > 0 else watch.url}}
                    <a class="external" target="_blank" rel="noopener" href="{{ watch.link.replace('source:','') }}"></a>
-                    <a class="link-spread" href="{{url_for('form_share_put_watch', uuid=watch.uuid)}}"><img style="height: 1em;display:inline-block;" src="{{url_for('static_content', group='images', filename='spread.svg')}}" class="icon icon-spread" /></a>
-
-                    {%if watch.fetch_backend == "html_webdriver" %}<img style="height: 1em; display:inline-block;" src="{{url_for('static_content', group='images', filename='Google-Chrome-icon.png')}}" />{% endif %}
+                    <a class="link-spread" href="{{url_for('form_share_put_watch', uuid=watch.uuid)}}"><img class="status-icon icon icon-spread" src="{{url_for('static_content', group='images', filename='spread.svg')}}" title="Create a link to share watch config with others" /></a>

+                    {%if watch.get_fetch_backend == "html_webdriver" %}<img class="status-icon" src="{{url_for('static_content', group='images', filename='Google-Chrome-icon.png')}}" title="Using a chrome browser" />{% endif %}
+                    {%if watch.is_pdf  %}<img class="status-icon" src="{{url_for('static_content', group='images', filename='pdf-icon.svg')}}" title="Converting PDF to text" />{% endif %}
                    {% if watch.last_error is defined and watch.last_error != False %}
-                    <div class="fetch-error">{{ watch.last_error }}</div>
+                    <div class="fetch-error">{{ watch.last_error }}
+
+                        {% if '403' in watch.last_error %}
+                            {% if has_proxies %}
+                                <a href="{{ url_for('settings_page', uuid=watch.uuid) }}#proxies">Try other proxies/location</a>&nbsp;
+                            {% endif %}
+                            <a href="{{ url_for('settings_page', uuid=watch.uuid) }}#proxies">Try adding external proxies/locations</a>
+                        
+                        {% endif %}
+                    </div>
                    {% endif %}
                    {% if watch.last_notification_error is defined and watch.last_notification_error != False %}
                    <div class="fetch-error notification-error"><a href="{{url_for('notification_logs')}}">{{ watch.last_notification_error }}</a></div>
                    {% endif %}
+                    {% if watch['has_ldjson_price_data'] and not watch['track_ldjson_price_data']  %}
+                    <div class="ldjson-price-track-offer">Embedded price data detected, follow only price data? <a href="{{url_for('price_data_follower.accept', uuid=watch.uuid)}}" class="pure-button button-xsmall">Yes</a> <a href="{{url_for('price_data_follower.reject', uuid=watch.uuid)}}" class="">No</a></div>
+                    {% endif %}
+                    {% if watch['track_ldjson_price_data'] == 'accepted' %}
+                    <span class="tracking-ldjson-price-data" title="Automatically following embedded price information"><img src="{{url_for('static_content', group='images', filename='price-tag-icon.svg')}}"  class="status-icon price-follow-tag-icon"/> Price</span>
+                    {% endif %}
                    {% if not active_tag %}
                    <span class="watch-tag-list">{{ watch.tag}}</span>
                    {% endif %}
--- a/changedetectionio/tests/proxy_list/proxies.json-example
+++ b/changedetectionio/tests/proxy_list/proxies.json-example
@@ -1,10 +1,10 @@
 {
  "proxy-one": {
-    "label": "One",
-    "url": "http://127.0.0.1:3128"
+    "label": "Proxy One",
+    "url": "http://squid-one:3128"
  },
  "proxy-two": {
-    "label": "two",
-    "url": "http://127.0.0.1:3129"
+    "label": "Proxy Two",
+    "url": "http://squid-two:3128"
  }
 }
--- a/changedetectionio/tests/proxy_list/squid-auth.conf
+++ b/changedetectionio/tests/proxy_list/squid-auth.conf
@@ -0,0 +1,48 @@
+acl localnet src 0.0.0.1-0.255.255.255  # RFC 1122 "this" network (LAN)
+acl localnet src 10.0.0.0/8             # RFC 1918 local private network (LAN)
+acl localnet src 100.64.0.0/10          # RFC 6598 shared address space (CGN)
+acl localnet src 169.254.0.0/16         # RFC 3927 link-local (directly plugged) machines
+acl localnet src 172.16.0.0/12          # RFC 1918 local private network (LAN)
+acl localnet src 192.168.0.0/16         # RFC 1918 local private network (LAN)
+acl localnet src fc00::/7               # RFC 4193 local private network range
+acl localnet src fe80::/10              # RFC 4291 link-local (directly plugged) machines
+acl localnet src 159.65.224.174
+acl SSL_ports port 443
+acl Safe_ports port 80          # http
+acl Safe_ports port 21          # ftp
+acl Safe_ports port 443         # https
+acl Safe_ports port 70          # gopher
+acl Safe_ports port 210         # wais
+acl Safe_ports port 1025-65535  # unregistered ports
+acl Safe_ports port 280         # http-mgmt
+acl Safe_ports port 488         # gss-http
+acl Safe_ports port 591         # filemaker
+acl Safe_ports port 777         # multiling http
+acl CONNECT method CONNECT
+
+http_access deny !Safe_ports
+http_access deny CONNECT !SSL_ports
+#http_access allow localhost manager
+http_access deny manager
+#http_access allow localhost
+#http_access allow localnet
+
+auth_param basic program /usr/lib/squid3/basic_ncsa_auth /etc/squid3/passwords
+auth_param basic realm proxy
+acl authenticated proxy_auth REQUIRED
+http_access allow authenticated
+http_access deny all
+
+
+http_port 3128
+coredump_dir /var/spool/squid
+refresh_pattern ^ftp:           1440    20%     10080
+refresh_pattern ^gopher:        1440    0%      1440
+refresh_pattern -i (/cgi-bin/|\?) 0     0%      0
+refresh_pattern \/(Packages|Sources)(|\.bz2|\.gz|\.xz)$ 0 0% 0 refresh-ims
+refresh_pattern \/Release(|\.gpg)$ 0 0% 0 refresh-ims
+refresh_pattern \/InRelease$ 0 0% 0 refresh-ims
+refresh_pattern \/(Translation-.*)(|\.bz2|\.gz|\.xz)$ 0 0% 0 refresh-ims
+refresh_pattern .               0       20%     4320
+logfile_rotate 0
+
--- a/changedetectionio/tests/proxy_list/squid-passwords.txt
+++ b/changedetectionio/tests/proxy_list/squid-passwords.txt
@@ -0,0 +1 @@
+test:$apr1$xvhFolTA$E/kz5/Rw1ewcyaSUdwqZs.
--- a/changedetectionio/tests/proxy_list/test_select_custom_proxy.py
+++ b/changedetectionio/tests/proxy_list/test_select_custom_proxy.py
@@ -0,0 +1,50 @@
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from ..util import live_server_setup, wait_for_all_checks
+
+# just make a request, we will grep in the docker logs to see it actually got called
+def test_select_custom(client, live_server):
+    live_server_setup(live_server)
+
+    # Goto settings, add our custom one
+    res = client.post(
+        url_for("settings_page"),
+        data={
+            "requests-time_between_check-minutes": 180,
+            "application-ignore_whitespace": "y",
+            "application-fetch_backend": "html_requests",
+            "requests-extra_proxies-0-proxy_name": "custom-test-proxy",
+            # test:awesome is set in tests/proxy_list/squid-passwords.txt
+            "requests-extra_proxies-0-proxy_url": "http://test:awesome@squid-custom:3128",
+        },
+        follow_redirects=True
+    )
+
+    assert b"Settings updated." in res.data
+
+    res = client.post(
+        url_for("import_page"),
+        # Because a URL won't show in squid/proxy logs due to it being SSLed
+        # Use plain HTTP or a specific domain-name here
+        data={"urls": "https://changedetection.io/CHANGELOG.txt"},
+        follow_redirects=True
+    )
+
+    assert b"1 Imported" in res.data
+    wait_for_all_checks(client)
+
+    res = client.get(url_for("index"))
+    assert b'Proxy Authentication Required' not in res.data
+
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+    # We should see something via proxy
+    assert b'<div class=""> - 0.' in res.data
+
+    #
+    # Now we should see the request in the container logs for "squid-squid-custom" because it will be the only default
+
--- a/changedetectionio/tests/test.pdf
+++ b/changedetectionio/tests/test.pdf
--- a/changedetectionio/tests/test_automatic_follow_ldjson_price.py
+++ b/changedetectionio/tests/test_automatic_follow_ldjson_price.py
@@ -0,0 +1,146 @@
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from .util import live_server_setup, extract_UUID_from_client, extract_api_key_from_UI
+
+def set_response_with_ldjson():
+    test_return_data = """<html>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  </br>
+     <div class="sametext">Some text thats the same</div>
+     <div class="changetext">Some text that will change</div>
+     <script type="application/ld+json">
+        {
+           "@context":"https://schema.org/",
+           "@type":"Product",
+           "@id":"https://www.some-virtual-phone-shop.com/celular-iphone-14/p",
+           "name":"Celular Iphone 14 Pro Max 256Gb E Sim A16 Bionic",
+           "brand":{
+              "@type":"Brand",
+              "name":"APPLE"
+           },
+           "image":"https://www.some-virtual-phone-shop.com/15509426/image.jpg",
+           "description":"You dont need it",
+           "mpn":"111111",
+           "sku":"22222",
+           "offers":{
+              "@type":"AggregateOffer",
+              "lowPrice":8097000,
+              "highPrice":8099900,
+              "priceCurrency":"COP",
+              "offers":[
+                 {
+                    "@type":"Offer",
+                    "price":8097000,
+                    "priceCurrency":"COP",
+                    "availability":"http://schema.org/InStock",
+                    "sku":"102375961",
+                    "itemCondition":"http://schema.org/NewCondition",
+                    "seller":{
+                       "@type":"Organization",
+                       "name":"ajax"
+                    }
+                 }
+              ],
+              "offerCount":1
+           }
+        }
+       </script>
+     </body>
+     </html>
+"""
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+    return None
+
+def set_response_without_ldjson():
+    test_return_data = """<html>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  </br>
+     <div class="sametext">Some text thats the same</div>
+     <div class="changetext">Some text that will change</div>     
+     </body>
+     </html>
+"""
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+    return None
+
+# actually only really used by the distll.io importer, but could be handy too
+def test_check_ldjson_price_autodetect(client, live_server):
+    live_server_setup(live_server)
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    set_response_with_ldjson()
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+    time.sleep(3)
+
+    # Should get a notice that it's available
+    res = client.get(url_for("index"))
+    assert b'ldjson-price-track-offer' in res.data
+
+    # Accept it
+    uuid = extract_UUID_from_client(client)
+
+    client.get(url_for('price_data_follower.accept', uuid=uuid, follow_redirects=True))
+    time.sleep(2)
+
+    # Trigger a check
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    time.sleep(2)
+    # Offer should be gone
+    res = client.get(url_for("index"))
+    assert b'Embedded price data' not in res.data
+    assert b'tracking-ldjson-price-data' in res.data
+
+    # and last snapshop (via API) should be just the price
+    api_key = extract_api_key_from_UI(client)
+    res = client.get(
+        url_for("watchsinglehistory", uuid=uuid, timestamp='latest'),
+        headers={'x-api-key': api_key},
+    )
+
+    # Should see this (dont know where the whitespace came from)
+    assert b'"highPrice": 8099900' in res.data
+    # And not this cause its not the ld-json
+    assert b"So let's see what happens" not in res.data
+
+    client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+
+    ##########################################################################################
+    # And we shouldnt see the offer
+    set_response_without_ldjson()
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+    time.sleep(3)
+    res = client.get(url_for("index"))
+    assert b'ldjson-price-track-offer' not in res.data
+    
+    ##########################################################################################
+    client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
--- a/changedetectionio/tests/test_filter_failure_notification.py
+++ b/changedetectionio/tests/test_filter_failure_notification.py
@@ -1,8 +1,7 @@
 import os
 import time
-import re
 from flask import url_for
-from .util import set_original_response, live_server_setup
+from .util import set_original_response, live_server_setup, extract_UUID_from_client
 from changedetectionio.model import App


@@ -121,6 +120,10 @@ def run_filter_test(client, content_filter):
        notification = f.read()
    assert not 'CSS/xPath filter was not present in the page' in notification

+    # Re #1247 - All tokens got replaced
+    uuid = extract_UUID_from_client(client)
+    assert uuid in notification
+
    # cleanup for the next
    client.get(
        url_for("form_delete", uuid="all"),
--- a/changedetectionio/tests/test_jsonpath_jq_selector.py
+++ b/changedetectionio/tests/test_jsonpath_jq_selector.py
@@ -394,6 +394,48 @@ def check_json_ext_filter(json_filter, client, live_server):
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data

+def test_ignore_json_order(client, live_server):
+    # A change in order shouldn't trigger a notification
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write('{"hello" : 123, "world": 123}')
+
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', content_type="application/json", _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    time.sleep(2)
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write('{"world" : 123, "hello": 123}')
+
+    # Trigger a check
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    time.sleep(2)
+
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
+
+    # Just to be sure it still works
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write('{"world" : 123, "hello": 124}')
+
+    # Trigger a check
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    time.sleep(2)
+
+    res = client.get(url_for("index"))
+    assert b'unviewed' in res.data
+
+    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
+
 def test_check_jsonpath_ext_filter(client, live_server):
    check_json_ext_filter('json:$[?(@.status==Sold)]', client, live_server)

--- a/changedetectionio/tests/test_pdf.py
+++ b/changedetectionio/tests/test_pdf.py
@@ -0,0 +1,40 @@
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from .util import set_original_response, set_modified_response, live_server_setup
+
+sleep_time_for_fetch_thread = 3
+
+# `subtractive_selectors` should still work in `source:` type requests
+def test_fetch_pdf(client, live_server):
+    import shutil
+    shutil.copy("tests/test.pdf", "test-datastore/endpoint-test.pdf")
+
+    live_server_setup(live_server)
+    test_url = url_for('test_pdf_endpoint', _external=True)
+    # Add our URL to the import page
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+
+    assert b"1 Imported" in res.data
+
+    time.sleep(sleep_time_for_fetch_thread)
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    assert b'PDF-1.5' not in res.data
+    assert b'hello world' in res.data
+
+    # So we know if the file changes in other ways
+    import hashlib
+    md5 = hashlib.md5(open("test-datastore/endpoint-test.pdf", 'rb').read()).hexdigest().upper()
+    # We should have one
+    assert len(md5) >0
+    # And it's going to be in the document
+    assert b'Document checksum - '+bytes(str(md5).encode('utf-8')) in res.data
--- a/changedetectionio/tests/util.py
+++ b/changedetectionio/tests/util.py
@@ -168,5 +168,15 @@ def live_server_setup(live_server):
    def test_return_query():
        return request.query_string

+
+    @live_server.app.route('/endpoint-test.pdf')
+    def test_pdf_endpoint():
+
+        # Tried using a global var here but didn't seem to work, so reading from a file instead.
+        with open("test-datastore/endpoint-test.pdf", "rb") as f:
+            resp = make_response(f.read(), 200)
+            resp.headers['Content-Type'] = 'application/pdf'
+            return resp
+
    live_server.start()

--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -4,6 +4,7 @@ import queue
 import time

 from changedetectionio import content_fetcher
+from changedetectionio import queuedWatchMetaData
 from changedetectionio.fetch_site_status import FilterNotFoundInResponse

 # A single update worker
@@ -92,7 +93,7 @@ class update_worker(threading.Thread):
            return

        n_object = {'notification_title': 'Changedetection.io - Alert - CSS/xPath filter was not present in the page',
-                    'notification_body': "Your configured CSS/xPath filters of '{}' for {{watch_url}} did not appear on the page after {} attempts, did the page change layout?\n\nLink: {{base_url}}/edit/{{watch_uuid}}\n\nThanks - Your omniscient changedetection.io installation :)\n".format(
+                    'notification_body': "Your configured CSS/xPath filters of '{}' for {{{{watch_url}}}} did not appear on the page after {} attempts, did the page change layout?\n\nLink: {{{{base_url}}}}/edit/{{{{watch_uuid}}}}\n\nThanks - Your omniscient changedetection.io installation :)\n".format(
                        ", ".join(watch['include_filters']),
                        threshold),
                    'notification_format': 'text'}
@@ -157,11 +158,12 @@ class update_worker(threading.Thread):
        while not self.app.config.exit.is_set():

            try:
-                priority, uuid = self.q.get(block=False)
+                queued_item_data = self.q.get(block=False)
            except queue.Empty:
                pass

            else:
+                uuid = queued_item_data.item.get('uuid')
                self.current_uuid = uuid

                if uuid in list(self.datastore.data['watching'].keys()):
@@ -171,11 +173,11 @@ class update_worker(threading.Thread):
                    update_obj= {}
                    xpath_data = False
                    process_changedetection_results = True
-                    print("> Processing UUID {} Priority {} URL {}".format(uuid, priority, self.datastore.data['watching'][uuid]['url']))
+                    print("> Processing UUID {} Priority {} URL {}".format(uuid, queued_item_data.priority, self.datastore.data['watching'][uuid]['url']))
                    now = time.time()

                    try:
-                        changed_detected, update_obj, contents = update_handler.run(uuid)
+                        changed_detected, update_obj, contents = update_handler.run(uuid, skip_when_checksum_same=queued_item_data.item.get('skip_when_checksum_same'))
                        # Re #342
                        # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
                        # We then convert/.decode('utf-8') for the notification etc
@@ -241,6 +243,10 @@ class update_worker(threading.Thread):

                        process_changedetection_results = True

+                    except content_fetcher.checksumFromPreviousCheckWasTheSame as e:
+                        # Yes fine, so nothing todo
+                        pass
+
                    except content_fetcher.BrowserStepsStepTimout as e:

                        if not self.datastore.data['watching'].get(uuid):
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,2 +0,0 @@
-pytest ~=6.2
-pytest-flask ~=1.2
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,8 +29,9 @@ apprise~=1.2.0
 # apprise mqtt https://github.com/dgtlmoon/changedetection.io/issues/315
 paho-mqtt

-# Pinned version of cryptography otherwise
-# ERROR: Could not build wheels for cryptography which use PEP 517 and cannot be installed directly
+# This mainly affects some ARM builds, which unlike the other builds ignores "ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1"
+# so without this pinning, the newer versions on ARM will forcefully try to build rust, which results in "rust compiler not found"
+# (introduced once apprise became a dep)
 cryptography~=3.4

 # Used for CSS filtering
@@ -58,3 +59,7 @@ jq~=1.3 ;python_version >= "3.8" and sys_platform == "linux"
 # Any current modern version, required so far for screenshot PNG->JPEG conversion but will be used more in the future
 pillow
 # playwright is installed at Dockerfile build time because it's not available on all platforms
+
+# Include pytest, so if there's a support issue we can ask them to run these tests on their setup
+pytest ~=6.2
+pytest-flask ~=1.2
Author	SHA1	Message	Date
dgtlmoon	c81cda08f7	fix curl request	2022-12-26 16:45:52 +01:00
dgtlmoon	5d4dee2a1e	Merge branch 'master' into ipv6	2022-12-26 16:34:59 +01:00
dgtlmoon	4a86637f2d	add ipv6 checks	2022-12-26 16:34:05 +01:00
dgtlmoon	9b4b5cae1c	Merge branch 'ipv6' of https://github.com/dom6770/changedetection.io into ipv6	2022-12-26 16:33:11 +01:00
dgtlmoon	fc6424c39e	Test improvements (#1264 )	2022-12-26 14:17:40 +01:00
dgtlmoon	285a65ced4	wrong test check	2022-12-26 14:01:22 +01:00
dgtlmoon	467cd099e9	revert	2022-12-26 13:32:48 +01:00
dgtlmoon	794a6d59de	was simply the wrong hostname	2022-12-26 13:31:43 +01:00
dgtlmoon	5f997e5d1b	wtf	2022-12-26 10:07:24 +01:00
dgtlmoon	e412fd6146	hmm	2022-12-26 09:49:06 +01:00
dgtlmoon	c950ab5219	archive debug output	2022-12-26 09:29:22 +01:00
dgtlmoon	fcbda7829a	hmm fixes the error on github?	2022-12-25 23:03:14 +01:00
dgtlmoon	f0966eb23a	0.40.0.4	2022-12-25 18:25:45 +01:00
dgtlmoon	e4fb5ab4da	UI - Suggest adding proxy for watch when 403 access denied is reached (#1260 )	2022-12-23 22:26:24 +01:00
dgtlmoon	e99f07a51d	Filters & Notifications - fixed tokens in filter not found notification	2022-12-22 10:05:17 +01:00
dgtlmoon	08ee223b5f	UI - Fix broken html tags in settings page	2022-12-20 18:57:26 +01:00
dgtlmoon	572f9b8a31	Proxy Settings in UI - TidyUp BrightData text	2022-12-20 10:08:16 +01:00
dgtlmoon	fcfd1b5e10	Ability to configure extra proxies via the UI (#1235 )	2022-12-19 21:48:01 +01:00
dgtlmoon	0790dd555e	Docker container updates - use Python 3.10, remove unused packages	2022-12-19 20:46:02 +01:00
dgtlmoon	0b20dc7712	Tidy up list icons a bit (#1250 )	2022-12-19 20:30:32 +01:00
dgtlmoon	13c4121f52	PDF File change detection - Initial PDF fetcher support with basic text extraction (#1244 )	2022-12-19 17:51:41 +01:00
dgtlmoon	e8e176f3bd	Testing - Run test as fully built docker container (#1245 )	2022-12-19 14:41:34 +01:00
dgtlmoon	7a1d2d924e	Dark mode - system setting var is not required (its cookie based)	2022-12-19 14:13:57 +01:00
dgtlmoon	c3731cf055	0.40.0.3	2022-12-19 12:41:52 +01:00
dgtlmoon	a287e5a86c	Visual Selector - Select smallest/most precise element first, better filtering of zero size elements	2022-12-19 12:33:31 +01:00
dgtlmoon	235535c327	Fetching - Check the most overdue watch first (#1242 )	2022-12-17 15:40:57 +01:00
dgtlmoon	44dc62da2d	Overview list - Checkbox action "Recheck"	2022-12-16 18:35:09 +01:00
dgtlmoon	0c380c170f	Playwright - Better error reporting and re-try fetch on fail once (#1238 )	2022-12-16 18:06:14 +01:00
dgtlmoon	b7a2501d64	Fetching - Always sort the key order of JSON content for less false alerts (May cause an alert on upgrade, but will be better going forwards) #1219	2022-12-15 09:13:09 +01:00
dgtlmoon	e970fef991	Fetcher + VisualSelector - xPath filter with attribute filter was breaking the element finder	2022-12-14 19:06:49 +01:00
dgtlmoon	b76148a0f4	Fetcher - CPU usage - Skip processing if the previous checksum and the just fetched one was the same (#925 )	2022-12-14 15:08:34 +01:00
dgtlmoon	93cc30437f	Playwright+BrowserSteps - Fetch changes - Fetch simply after page starts rendering + delay seconds, disable service workers	2022-12-14 12:16:04 +01:00
dgtlmoon	6562d6e0d4	Improve ARM/rust build comment	2022-12-13 12:28:20 +01:00
dgtlmoon	6c217cc3b6	README.md - Improving JSONPath example for LD+JSON product data	2022-12-11 11:14:52 +01:00
dgtlmoon	f30cdf0674	0.40.0.2	2022-12-08 22:36:59 +01:00
dgtlmoon	14da0646a7	Price follower - Dont scan for ldjson data when 'no' was clicked on the suggestion (#1207 )	2022-12-08 22:35:37 +01:00
dgtlmoon	b413cdecc7	Adding missing parts for pip build Re #1206	2022-12-08 21:54:55 +01:00
dgtlmoon	7bf52d9275	0.40.0	2022-12-08 20:09:42 +01:00
dgtlmoon	09e6624afd	VisualSelector - Exclude items that are not interactable or visible	2022-12-08 20:08:41 +01:00
dgtlmoon	b58fd995b5	Automatically offer to track LD+JSON product price data (#1204 )	2022-12-08 19:28:20 +01:00
dom6770	366baaf322	fixed wrong ipv6 argument position at ssl_mode	2022-11-19 08:26:38 -06:00
dom6770	6c1b9bcc5c	added ipv6 support to the eventlet listener	2022-11-19 08:14:06 -06:00