mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-11-04 08:34:57 +00:00 
			
		
		
		
	Compare commits
	
		
			2 Commits
		
	
	
		
			exception-
			...
			2118-fix-m
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					575df972a4 | ||
| 
						 | 
					f1babbac33 | 
							
								
								
									
										4
									
								
								.github/dependabot.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								.github/dependabot.yml
									
									
									
									
										vendored
									
									
								
							@@ -4,10 +4,6 @@ updates:
 | 
			
		||||
    directory: /
 | 
			
		||||
    schedule:
 | 
			
		||||
      interval: "weekly"
 | 
			
		||||
    "caronc/apprise":
 | 
			
		||||
      versioning-strategy: "increase"
 | 
			
		||||
      schedule:
 | 
			
		||||
        interval: "daily"
 | 
			
		||||
    groups:
 | 
			
		||||
      all:
 | 
			
		||||
        patterns:
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										2
									
								
								.github/test/Dockerfile-alpine
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/test/Dockerfile-alpine
									
									
									
									
										vendored
									
									
								
							@@ -12,10 +12,8 @@ RUN \
 | 
			
		||||
    cargo \
 | 
			
		||||
    g++ \
 | 
			
		||||
    gcc \
 | 
			
		||||
    jpeg-dev \
 | 
			
		||||
    libc-dev \
 | 
			
		||||
    libffi-dev \
 | 
			
		||||
    libjpeg \
 | 
			
		||||
    libxslt-dev \
 | 
			
		||||
    make \
 | 
			
		||||
    openssl-dev \
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										7
									
								
								.github/workflows/pypi-release.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										7
									
								
								.github/workflows/pypi-release.yml
									
									
									
									
										vendored
									
									
								
							@@ -11,7 +11,7 @@ jobs:
 | 
			
		||||
    - name: Set up Python
 | 
			
		||||
      uses: actions/setup-python@v5
 | 
			
		||||
      with:
 | 
			
		||||
        python-version: "3.11"
 | 
			
		||||
        python-version: "3.x"
 | 
			
		||||
    - name: Install pypa/build
 | 
			
		||||
      run: >-
 | 
			
		||||
        python3 -m
 | 
			
		||||
@@ -38,14 +38,9 @@ jobs:
 | 
			
		||||
      with:
 | 
			
		||||
        name: python-package-distributions
 | 
			
		||||
        path: dist/
 | 
			
		||||
    - name: Set up Python 3.11
 | 
			
		||||
      uses: actions/setup-python@v5
 | 
			
		||||
      with:
 | 
			
		||||
        python-version: '3.11'
 | 
			
		||||
    - name: Test that the basic pip built package runs without error
 | 
			
		||||
      run: |
 | 
			
		||||
        set -ex
 | 
			
		||||
        sudo pip3 install --upgrade pip 
 | 
			
		||||
        pip3 install dist/changedetection.io*.whl
 | 
			
		||||
        changedetection.io -d /tmp -p 10000 &
 | 
			
		||||
        sleep 3
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										2
									
								
								.github/workflows/test-container-build.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/test-container-build.yml
									
									
									
									
										vendored
									
									
								
							@@ -11,14 +11,12 @@ on:
 | 
			
		||||
      - requirements.txt
 | 
			
		||||
      - Dockerfile
 | 
			
		||||
      - .github/workflows/*
 | 
			
		||||
      - .github/test/Dockerfile*
 | 
			
		||||
 | 
			
		||||
  pull_request:
 | 
			
		||||
    paths:
 | 
			
		||||
      - requirements.txt
 | 
			
		||||
      - Dockerfile
 | 
			
		||||
      - .github/workflows/*
 | 
			
		||||
      - .github/test/Dockerfile*
 | 
			
		||||
 | 
			
		||||
  # Changes to requirements.txt packages and Dockerfile may or may not always be compatible with arm etc, so worth testing
 | 
			
		||||
  # @todo: some kind of path filter for requirements.txt and Dockerfile
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										112
									
								
								.github/workflows/test-only.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										112
									
								
								.github/workflows/test-only.yml
									
									
									
									
										vendored
									
									
								
							@@ -28,12 +28,12 @@ jobs:
 | 
			
		||||
          
 | 
			
		||||
          docker network create changedet-network
 | 
			
		||||
          
 | 
			
		||||
          # Selenium
 | 
			
		||||
          # Selenium+browserless
 | 
			
		||||
          docker run --network changedet-network -d --hostname selenium  -p 4444:4444 --rm --shm-size="2g"  selenium/standalone-chrome:4
 | 
			
		||||
          docker run --network changedet-network -d --name browserless --hostname browserless -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm  -p 3000:3000  --shm-size="2g"  browserless/chrome:1.60-chrome-stable
 | 
			
		||||
          
 | 
			
		||||
          # SocketPuppetBrowser + Extra for custom browser test
 | 
			
		||||
          docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser --hostname sockpuppetbrowser --rm -p 3000:3000 dgtlmoon/sockpuppetbrowser:latest                    
 | 
			
		||||
          docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser-custom-url --hostname sockpuppetbrowser-custom-url  -p 3001:3000 --rm dgtlmoon/sockpuppetbrowser:latest
 | 
			
		||||
          # For accessing custom browser tests
 | 
			
		||||
          docker run --network changedet-network -d --name browserless-custom-url --hostname browserless-custom-url -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm --shm-size="2g"  browserless/chrome:1.60-chrome-stable
 | 
			
		||||
 | 
			
		||||
      - name: Build changedetection.io container for testing
 | 
			
		||||
        run: |         
 | 
			
		||||
@@ -47,12 +47,6 @@ jobs:
 | 
			
		||||
          # Debug SMTP server/echo message back server
 | 
			
		||||
          docker run --network changedet-network -d -p 11025:11025 -p 11080:11080  --hostname mailserver test-changedetectionio  bash -c 'python changedetectionio/tests/smtp/smtp-test-server.py' 
 | 
			
		||||
 | 
			
		||||
      - name: Show docker container state and other debug info
 | 
			
		||||
        run: |
 | 
			
		||||
          set -x
 | 
			
		||||
          echo "Running processes in docker..."
 | 
			
		||||
          docker ps
 | 
			
		||||
 | 
			
		||||
      - name: Test built container with Pytest (generally as requests/plaintext fetching)
 | 
			
		||||
        run: |
 | 
			
		||||
          # Unit tests
 | 
			
		||||
@@ -65,76 +59,52 @@ jobs:
 | 
			
		||||
          # The default pytest logger_level is TRACE
 | 
			
		||||
          # To change logger_level for pytest(test/conftest.py),
 | 
			
		||||
          # append the docker option. e.g. '-e LOGGER_LEVEL=DEBUG'
 | 
			
		||||
          docker run --name test-cdio-basic-tests --network changedet-network  test-changedetectionio  bash -c 'cd changedetectionio && ./run_basic_tests.sh'
 | 
			
		||||
          docker run --network changedet-network  test-changedetectionio  bash -c 'cd changedetectionio && ./run_basic_tests.sh'
 | 
			
		||||
 | 
			
		||||
# PLAYWRIGHT/NODE-> CDP
 | 
			
		||||
      - name: Playwright and SocketPuppetBrowser - Specific tests in built container
 | 
			
		||||
        run: |
 | 
			
		||||
          # Playwright via Sockpuppetbrowser fetch
 | 
			
		||||
          # tests/visualselector/test_fetch_data.py will do browser steps  
 | 
			
		||||
          docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio  bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_content.py'
 | 
			
		||||
          docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio  bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_errorhandling.py'
 | 
			
		||||
          docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio  bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/visualselector/test_fetch_data.py'
 | 
			
		||||
          docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio  bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_custom_js_before_content.py'
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      - name: Playwright and SocketPuppetBrowser - Headers and requests
 | 
			
		||||
        run: |       
 | 
			
		||||
          # Settings headers playwright tests - Call back in from Sockpuppetbrowser, check headers
 | 
			
		||||
          docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py'
 | 
			
		||||
 | 
			
		||||
      - name: Playwright and SocketPuppetBrowser - Restock detection
 | 
			
		||||
        run: |                            
 | 
			
		||||
          # restock detection via playwright - added name=changedet here so that playwright and sockpuppetbrowser can connect to it
 | 
			
		||||
          docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py'
 | 
			
		||||
 | 
			
		||||
# STRAIGHT TO CDP
 | 
			
		||||
      - name: Pyppeteer and SocketPuppetBrowser - Specific tests in built container
 | 
			
		||||
        run: |
 | 
			
		||||
          # Playwright via Sockpuppetbrowser fetch 
 | 
			
		||||
          docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio  bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_content.py'
 | 
			
		||||
          docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio  bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_errorhandling.py'
 | 
			
		||||
          docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio  bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/visualselector/test_fetch_data.py'
 | 
			
		||||
          docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio  bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_custom_js_before_content.py'
 | 
			
		||||
 | 
			
		||||
      - name: Pyppeteer and SocketPuppetBrowser - Headers and requests checks
 | 
			
		||||
        run: |       
 | 
			
		||||
          # Settings headers playwright tests - Call back in from Sockpuppetbrowser, check headers
 | 
			
		||||
          docker run --name "changedet" --hostname changedet --rm  -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py'
 | 
			
		||||
 | 
			
		||||
      - name: Pyppeteer and SocketPuppetBrowser - Restock detection
 | 
			
		||||
        run: |                            
 | 
			
		||||
          # restock detection via playwright - added name=changedet here so that playwright and sockpuppetbrowser can connect to it
 | 
			
		||||
          docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet"  -e "FAST_PUPPETEER_CHROME_FETCHER=True"  -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py'
 | 
			
		||||
 | 
			
		||||
# SELENIUM
 | 
			
		||||
      - name: Specific tests in built container for Selenium
 | 
			
		||||
        run: |
 | 
			
		||||
          
 | 
			
		||||
          # Selenium fetch
 | 
			
		||||
          docker run --rm -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py'
 | 
			
		||||
 | 
			
		||||
      - name: Specific tests in built container for Playwright
 | 
			
		||||
        run: |         
 | 
			
		||||
          # Playwright/Browserless fetch
 | 
			
		||||
          docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
 | 
			
		||||
 | 
			
		||||
      - name: Specific tests in built container for headers and requests checks with Playwright
 | 
			
		||||
        run: |                  
 | 
			
		||||
          # Settings headers playwright tests - Call back in from Browserless, check headers
 | 
			
		||||
          docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000?dumpio=true" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py'
 | 
			
		||||
 | 
			
		||||
      - name: Specific tests in built container for headers and requests checks with Selenium
 | 
			
		||||
        run: |
 | 
			
		||||
        run: |                  
 | 
			
		||||
          docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py'
 | 
			
		||||
 | 
			
		||||
# OTHER STUFF
 | 
			
		||||
      - name: Specific tests in built container with Playwright as Puppeteer experimental fetcher
 | 
			
		||||
        run: |                  
 | 
			
		||||
          docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000?dumpio=true" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py'          
 | 
			
		||||
 | 
			
		||||
      - name: Test built container restock detection via Playwright
 | 
			
		||||
        run: |                            
 | 
			
		||||
          # restock detection via playwright - added name=changedet here so that playwright/browserless can connect to it
 | 
			
		||||
          docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py'
 | 
			
		||||
 | 
			
		||||
      - name: Test SMTP notification mime types
 | 
			
		||||
        run: |
 | 
			
		||||
          # SMTP content types - needs the 'Debug SMTP server/echo message back server' container from above
 | 
			
		||||
          docker run --rm  --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/smtp/test_notification_smtp.py'
 | 
			
		||||
 | 
			
		||||
      # @todo Add a test via playwright/puppeteer
 | 
			
		||||
      # squid with auth is tested in run_proxy_tests.sh -> tests/proxy_list/test_select_custom_proxy.py
 | 
			
		||||
      - name: Test proxy squid style interaction
 | 
			
		||||
      - name: Test with puppeteer fetcher and disk cache
 | 
			
		||||
        run: |
 | 
			
		||||
          docker run --rm -e "PUPPETEER_DISK_CACHE=/tmp/data/" -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
 | 
			
		||||
          # Browserless would have had -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" added above
 | 
			
		||||
 | 
			
		||||
      - name: Test proxy interaction
 | 
			
		||||
        run: |
 | 
			
		||||
          cd changedetectionio
 | 
			
		||||
          ./run_proxy_tests.sh
 | 
			
		||||
          cd ..
 | 
			
		||||
 | 
			
		||||
      - name: Test proxy SOCKS5 style interaction
 | 
			
		||||
        run: |
 | 
			
		||||
          cd changedetectionio
 | 
			
		||||
          ./run_socks_proxy_tests.sh
 | 
			
		||||
          # And again with PLAYWRIGHT_DRIVER_URL=..
 | 
			
		||||
          cd ..
 | 
			
		||||
 | 
			
		||||
      - name: Test custom browser URL
 | 
			
		||||
@@ -208,16 +178,6 @@ jobs:
 | 
			
		||||
          # @todo - scan the container log to see the right "graceful shutdown" text exists           
 | 
			
		||||
          docker rm sig-test
 | 
			
		||||
 | 
			
		||||
      - name: Dump container log
 | 
			
		||||
        if: always()
 | 
			
		||||
        run: |
 | 
			
		||||
          mkdir output-logs
 | 
			
		||||
          docker logs test-cdio-basic-tests > output-logs/test-cdio-basic-tests-stdout.txt
 | 
			
		||||
          docker logs test-cdio-basic-tests 2> output-logs/test-cdio-basic-tests-stderr.txt
 | 
			
		||||
 | 
			
		||||
      - name: Store container log
 | 
			
		||||
        if: always()
 | 
			
		||||
        uses: actions/upload-artifact@v4
 | 
			
		||||
        with:
 | 
			
		||||
          name: test-cdio-basic-tests-output
 | 
			
		||||
          path: output-logs
 | 
			
		||||
#export WEBDRIVER_URL=http://localhost:4444/wd/hub
 | 
			
		||||
#pytest tests/fetchers/test_content.py
 | 
			
		||||
#pytest tests/test_errorhandling.py
 | 
			
		||||
 
 | 
			
		||||
@@ -2,7 +2,7 @@ Contributing is always welcome!
 | 
			
		||||
 | 
			
		||||
I am no professional flask developer, if you know a better way that something can be done, please let me know!
 | 
			
		||||
 | 
			
		||||
Otherwise, it's always best to PR into the `master` branch.
 | 
			
		||||
Otherwise, it's always best to PR into the `dev` branch.
 | 
			
		||||
 | 
			
		||||
Please be sure that all new functionality has a matching test!
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,8 +1,5 @@
 | 
			
		||||
# pip dependencies install stage
 | 
			
		||||
 | 
			
		||||
# @NOTE! I would love to move to 3.11 but it breaks the async handler in changedetectionio/content_fetchers/puppeteer.py
 | 
			
		||||
#        If you know how to fix it, please do! and test it for both 3.10 and 3.11
 | 
			
		||||
FROM python:3.10-slim-bookworm as builder
 | 
			
		||||
FROM python:3.11-slim-bookworm as builder
 | 
			
		||||
 | 
			
		||||
# See `cryptography` pin comment in requirements.txt
 | 
			
		||||
ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1
 | 
			
		||||
@@ -28,11 +25,11 @@ RUN pip install --target=/dependencies -r /requirements.txt
 | 
			
		||||
# Playwright is an alternative to Selenium
 | 
			
		||||
# Excluded this package from requirements.txt to prevent arm/v6 and arm/v7 builds from failing
 | 
			
		||||
# https://github.com/dgtlmoon/changedetection.io/pull/1067 also musl/alpine (not supported)
 | 
			
		||||
RUN pip install --target=/dependencies playwright~=1.41.2 \
 | 
			
		||||
RUN pip install --target=/dependencies playwright~=1.40 \
 | 
			
		||||
    || echo "WARN: Failed to install Playwright. The application can still run, but the Playwright option will be disabled."
 | 
			
		||||
 | 
			
		||||
# Final image stage
 | 
			
		||||
FROM python:3.10-slim-bookworm
 | 
			
		||||
FROM python:3.11-slim-bookworm
 | 
			
		||||
 | 
			
		||||
RUN apt-get update && apt-get install -y --no-install-recommends \
 | 
			
		||||
    libxslt1.1 \
 | 
			
		||||
 
 | 
			
		||||
@@ -1,8 +1,8 @@
 | 
			
		||||
recursive-include changedetectionio/api *
 | 
			
		||||
recursive-include changedetectionio/blueprint *
 | 
			
		||||
recursive-include changedetectionio/content_fetchers *
 | 
			
		||||
recursive-include changedetectionio/model *
 | 
			
		||||
recursive-include changedetectionio/processors *
 | 
			
		||||
recursive-include changedetectionio/res *
 | 
			
		||||
recursive-include changedetectionio/static *
 | 
			
		||||
recursive-include changedetectionio/templates *
 | 
			
		||||
recursive-include changedetectionio/tests *
 | 
			
		||||
 
 | 
			
		||||
@@ -91,14 +91,6 @@ We [recommend and use Bright Data](https://brightdata.grsm.io/n0r16zf7eivq) glob
 | 
			
		||||
 | 
			
		||||
Please :star: star :star: this project and help it grow! https://github.com/dgtlmoon/changedetection.io/
 | 
			
		||||
 | 
			
		||||
### We have a Chrome extension!
 | 
			
		||||
 | 
			
		||||
Easily add the current web page to your changedetection.io tool, simply install the extension and click "Sync" to connect it to your existing changedetection.io install.
 | 
			
		||||
 | 
			
		||||
[<img src="./docs/chrome-extension-screenshot.png" style="max-width:80%;" alt="Chrome Extension to easily add the current web-page to detect a change."  title="Chrome Extension to easily add the current web-page to detect a change."  />](https://chromewebstore.google.com/detail/changedetectionio-website/kefcfmgmlhmankjmnbijimhofdjekbop)
 | 
			
		||||
 | 
			
		||||
[Goto the Chrome Webstore to download the extension.](https://chromewebstore.google.com/detail/changedetectionio-website/kefcfmgmlhmankjmnbijimhofdjekbop)
 | 
			
		||||
 | 
			
		||||
## Installation
 | 
			
		||||
 | 
			
		||||
### Docker
 | 
			
		||||
 
 | 
			
		||||
@@ -2,15 +2,15 @@
 | 
			
		||||
 | 
			
		||||
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
 | 
			
		||||
 | 
			
		||||
__version__ = '0.45.20'
 | 
			
		||||
__version__ = '0.45.13'
 | 
			
		||||
 | 
			
		||||
from changedetectionio.strtobool import strtobool
 | 
			
		||||
from distutils.util import strtobool
 | 
			
		||||
from json.decoder import JSONDecodeError
 | 
			
		||||
import os
 | 
			
		||||
#os.environ['EVENTLET_NO_GREENDNS'] = 'yes'
 | 
			
		||||
 | 
			
		||||
import eventlet
 | 
			
		||||
import eventlet.wsgi
 | 
			
		||||
import getopt
 | 
			
		||||
import os
 | 
			
		||||
import signal
 | 
			
		||||
import socket
 | 
			
		||||
import sys
 | 
			
		||||
 
 | 
			
		||||
@@ -1,5 +1,5 @@
 | 
			
		||||
import os
 | 
			
		||||
from changedetectionio.strtobool import strtobool
 | 
			
		||||
from distutils.util import strtobool
 | 
			
		||||
 | 
			
		||||
from flask_expects_json import expects_json
 | 
			
		||||
from changedetectionio import queuedWatchMetaData
 | 
			
		||||
 
 | 
			
		||||
@@ -1,7 +0,0 @@
 | 
			
		||||
- This needs an abstraction to directly handle the puppeteer connection methods
 | 
			
		||||
- Then remove the playwright stuff
 | 
			
		||||
- Remove hack redirect at line 65 changedetectionio/processors/__init__.py
 | 
			
		||||
 | 
			
		||||
The screenshots are base64 encoded/decoded which is very CPU intensive for large screenshots (in playwright) but not
 | 
			
		||||
in the direct puppeteer connection (they are binary end to end)
 | 
			
		||||
 | 
			
		||||
@@ -4,15 +4,24 @@
 | 
			
		||||
# Why?
 | 
			
		||||
# `browsersteps_playwright_browser_interface.chromium.connect_over_cdp()` will only run once without async()
 | 
			
		||||
# - this flask app is not async()
 | 
			
		||||
# - A single timeout/keepalive which applies to the session made at .connect_over_cdp()
 | 
			
		||||
# - browserless has a single timeout/keepalive which applies to the session made at .connect_over_cdp()
 | 
			
		||||
#
 | 
			
		||||
# So it means that we must unfortunately for now just keep a single timer since .connect_over_cdp() was run
 | 
			
		||||
# and know when that reaches timeout/keepalive :( when that time is up, restart the connection and tell the user
 | 
			
		||||
# that their time is up, insert another coin. (reload)
 | 
			
		||||
#
 | 
			
		||||
# Bigger picture
 | 
			
		||||
# - It's horrible that we have this click+wait deal, some nice socket.io solution using something similar
 | 
			
		||||
# to what the browserless debug UI already gives us would be smarter..
 | 
			
		||||
#
 | 
			
		||||
# OR
 | 
			
		||||
# - Some API call that should be hacked into browserless or playwright that we can "/api/bump-keepalive/{session_id}/60"
 | 
			
		||||
# So we can tell it that we need more time (run this on each action)
 | 
			
		||||
#
 | 
			
		||||
# OR
 | 
			
		||||
# - use multiprocessing to bump this over to its own process and add some transport layer (queue/pipes)
 | 
			
		||||
 | 
			
		||||
from changedetectionio.strtobool import strtobool
 | 
			
		||||
from distutils.util import strtobool
 | 
			
		||||
from flask import Blueprint, request, make_response
 | 
			
		||||
import os
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -6,8 +6,6 @@ import re
 | 
			
		||||
from random import randint
 | 
			
		||||
from loguru import logger
 | 
			
		||||
 | 
			
		||||
from changedetectionio.content_fetchers.base import manage_user_agent
 | 
			
		||||
 | 
			
		||||
# Two flags, tell the JS which of the "Selector" or "Value" field should be enabled in the front end
 | 
			
		||||
# 0- off, 1- on
 | 
			
		||||
browser_step_ui_config = {'Choose one': '0 0',
 | 
			
		||||
@@ -171,7 +169,7 @@ class steppable_browser_interface():
 | 
			
		||||
        self.page.locator(selector, timeout=1000).uncheck(timeout=1000)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Responsible for maintaining a live 'context' with the chrome CDP
 | 
			
		||||
# Responsible for maintaining a live 'context' with browserless
 | 
			
		||||
# @todo - how long do contexts live for anyway?
 | 
			
		||||
class browsersteps_live_ui(steppable_browser_interface):
 | 
			
		||||
    context = None
 | 
			
		||||
@@ -180,7 +178,6 @@ class browsersteps_live_ui(steppable_browser_interface):
 | 
			
		||||
    stale = False
 | 
			
		||||
    # bump and kill this if idle after X sec
 | 
			
		||||
    age_start = 0
 | 
			
		||||
    headers = {}
 | 
			
		||||
 | 
			
		||||
    # use a special driver, maybe locally etc
 | 
			
		||||
    command_executor = os.getenv(
 | 
			
		||||
@@ -195,8 +192,7 @@ class browsersteps_live_ui(steppable_browser_interface):
 | 
			
		||||
 | 
			
		||||
    browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')
 | 
			
		||||
 | 
			
		||||
    def __init__(self, playwright_browser, proxy=None, headers=None):
 | 
			
		||||
        self.headers = headers or {}
 | 
			
		||||
    def __init__(self, playwright_browser, proxy=None):
 | 
			
		||||
        self.age_start = time.time()
 | 
			
		||||
        self.playwright_browser = playwright_browser
 | 
			
		||||
        if self.context is None:
 | 
			
		||||
@@ -210,17 +206,16 @@ class browsersteps_live_ui(steppable_browser_interface):
 | 
			
		||||
 | 
			
		||||
        # @todo handle multiple contexts, bind a unique id from the browser on each req?
 | 
			
		||||
        self.context = self.playwright_browser.new_context(
 | 
			
		||||
            accept_downloads=False,  # Should never be needed
 | 
			
		||||
            bypass_csp=True,  # This is needed to enable JavaScript execution on GitHub and others
 | 
			
		||||
            extra_http_headers=self.headers,
 | 
			
		||||
            ignore_https_errors=True,
 | 
			
		||||
            proxy=proxy,
 | 
			
		||||
            service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),
 | 
			
		||||
            # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
 | 
			
		||||
            user_agent=manage_user_agent(headers=self.headers),
 | 
			
		||||
            # @todo
 | 
			
		||||
            #                user_agent=request_headers['User-Agent'] if request_headers.get('User-Agent') else 'Mozilla/5.0',
 | 
			
		||||
            #               proxy=self.proxy,
 | 
			
		||||
            # This is needed to enable JavaScript execution on GitHub and others
 | 
			
		||||
            bypass_csp=True,
 | 
			
		||||
            # Should never be needed
 | 
			
		||||
            accept_downloads=False,
 | 
			
		||||
            proxy=proxy
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        self.page = self.context.new_page()
 | 
			
		||||
 | 
			
		||||
        # self.page.set_default_navigation_timeout(keep_open)
 | 
			
		||||
@@ -248,7 +243,7 @@ class browsersteps_live_ui(steppable_browser_interface):
 | 
			
		||||
    def get_current_state(self):
 | 
			
		||||
        """Return the screenshot and interactive elements mapping, generally always called after action_()"""
 | 
			
		||||
        from pkg_resources import resource_string
 | 
			
		||||
        xpath_element_js = resource_string(__name__, "../../content_fetchers/res/xpath_element_scraper.js").decode('utf-8')
 | 
			
		||||
        xpath_element_js = resource_string(__name__, "../../res/xpath_element_scraper.js").decode('utf-8')
 | 
			
		||||
        now = time.time()
 | 
			
		||||
        self.page.wait_for_timeout(1 * 1000)
 | 
			
		||||
 | 
			
		||||
@@ -283,10 +278,10 @@ class browsersteps_live_ui(steppable_browser_interface):
 | 
			
		||||
        self.page.evaluate("var include_filters=''")
 | 
			
		||||
        from pkg_resources import resource_string
 | 
			
		||||
        # The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector
 | 
			
		||||
        xpath_element_js = resource_string(__name__, "../../content_fetchers/res/xpath_element_scraper.js").decode('utf-8')
 | 
			
		||||
        from changedetectionio.content_fetchers import visualselector_xpath_selectors
 | 
			
		||||
        xpath_element_js = resource_string(__name__, "../../res/xpath_element_scraper.js").decode('utf-8')
 | 
			
		||||
        from changedetectionio.content_fetcher import visualselector_xpath_selectors
 | 
			
		||||
        xpath_element_js = xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
 | 
			
		||||
        xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}")
 | 
			
		||||
        screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
 | 
			
		||||
        screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)))
 | 
			
		||||
 | 
			
		||||
        return (screenshot, xpath_data)
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,5 @@
 | 
			
		||||
from playwright.sync_api import PlaywrightContextManager
 | 
			
		||||
import asyncio
 | 
			
		||||
 | 
			
		||||
# So playwright wants to run as a context manager, but we do something horrible and hacky
 | 
			
		||||
# we are holding the session open for as long as possible, then shutting it down, and opening a new one
 | 
			
		||||
 
 | 
			
		||||
@@ -1,11 +1,14 @@
 | 
			
		||||
from concurrent.futures import ThreadPoolExecutor
 | 
			
		||||
from changedetectionio.store import ChangeDetectionStore
 | 
			
		||||
 | 
			
		||||
from functools import wraps
 | 
			
		||||
 | 
			
		||||
from flask import Blueprint
 | 
			
		||||
from flask_login import login_required
 | 
			
		||||
 | 
			
		||||
from changedetectionio.processors import text_json_diff
 | 
			
		||||
from changedetectionio.store import ChangeDetectionStore
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
STATUS_CHECKING = 0
 | 
			
		||||
STATUS_FAILED = 1
 | 
			
		||||
STATUS_OK = 2
 | 
			
		||||
@@ -29,8 +32,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
 | 
			
		||||
    @threadpool
 | 
			
		||||
    def long_task(uuid, preferred_proxy):
 | 
			
		||||
        import time
 | 
			
		||||
        from changedetectionio.content_fetchers import exceptions as content_fetcher_exceptions
 | 
			
		||||
        from changedetectionio.processors import text_json_diff
 | 
			
		||||
        from changedetectionio import content_fetcher
 | 
			
		||||
 | 
			
		||||
        status = {'status': '', 'length': 0, 'text': ''}
 | 
			
		||||
        from jinja2 import Environment, BaseLoader
 | 
			
		||||
@@ -41,7 +43,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
 | 
			
		||||
            update_handler = text_json_diff.perform_site_check(datastore=datastore, watch_uuid=uuid)
 | 
			
		||||
            update_handler.call_browser()
 | 
			
		||||
        # title, size is len contents not len xfer
 | 
			
		||||
        except content_fetcher_exceptions.Non200ErrorCodeReceived as e:
 | 
			
		||||
        except content_fetcher.Non200ErrorCodeReceived as e:
 | 
			
		||||
            if e.status_code == 404:
 | 
			
		||||
                status.update({'status': 'OK', 'length': len(contents), 'text': f"OK but 404 (page not found)"})
 | 
			
		||||
            elif e.status_code == 403 or e.status_code == 401:
 | 
			
		||||
@@ -50,12 +52,12 @@ def construct_blueprint(datastore: ChangeDetectionStore):
 | 
			
		||||
                status.update({'status': 'ERROR', 'length': len(contents), 'text': f"Status code: {e.status_code}"})
 | 
			
		||||
        except text_json_diff.FilterNotFoundInResponse:
 | 
			
		||||
            status.update({'status': 'OK', 'length': len(contents), 'text': f"OK but CSS/xPath filter not found (page changed layout?)"})
 | 
			
		||||
        except content_fetcher_exceptions.EmptyReply as e:
 | 
			
		||||
        except content_fetcher.EmptyReply as e:
 | 
			
		||||
            if e.status_code == 403 or e.status_code == 401:
 | 
			
		||||
                status.update({'status': 'ERROR OTHER', 'length': len(contents), 'text': f"Got empty reply with code {e.status_code} - Access denied"})
 | 
			
		||||
            else:
 | 
			
		||||
                status.update({'status': 'ERROR OTHER', 'length': len(contents) if contents else 0, 'text': f"Empty reply with code {e.status_code}, needs chrome?"})
 | 
			
		||||
        except content_fetcher_exceptions.ReplyWithContentButNoText as e:
 | 
			
		||||
        except content_fetcher.ReplyWithContentButNoText as e:
 | 
			
		||||
            txt = f"Got reply but with no content - Status code {e.status_code} - It's possible that the filters were found, but contained no usable text (or contained only an image)."
 | 
			
		||||
            status.update({'status': 'ERROR', 'text': txt})
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
 
 | 
			
		||||
@@ -1,5 +1,5 @@
 | 
			
		||||
 | 
			
		||||
from changedetectionio.strtobool import strtobool
 | 
			
		||||
from distutils.util import strtobool
 | 
			
		||||
from flask import Blueprint, flash, redirect, url_for
 | 
			
		||||
from flask_login import login_required
 | 
			
		||||
from changedetectionio.store import ChangeDetectionStore
 | 
			
		||||
 
 | 
			
		||||
@@ -11,16 +11,9 @@ def construct_blueprint(datastore: ChangeDetectionStore):
 | 
			
		||||
    def tags_overview_page():
 | 
			
		||||
        from .form import SingleTag
 | 
			
		||||
        add_form = SingleTag(request.form)
 | 
			
		||||
        sorted_tags = sorted(datastore.data['settings']['application'].get('tags').items(), key=lambda x: x[1]['title'])
 | 
			
		||||
 | 
			
		||||
        from collections import Counter
 | 
			
		||||
 | 
			
		||||
        tag_count = Counter(tag for watch in datastore.data['watching'].values() if watch.get('tags') for tag in watch['tags'])
 | 
			
		||||
 | 
			
		||||
        output = render_template("groups-overview.html",
 | 
			
		||||
                                 available_tags=sorted_tags,
 | 
			
		||||
                                 form=add_form,
 | 
			
		||||
                                 tag_count=tag_count
 | 
			
		||||
                                 available_tags=datastore.data['settings']['application'].get('tags', {}),
 | 
			
		||||
                                 )
 | 
			
		||||
 | 
			
		||||
        return output
 | 
			
		||||
 
 | 
			
		||||
@@ -3,7 +3,7 @@
 | 
			
		||||
{% from '_helpers.jinja' import render_field, render_checkbox_field, render_button %}
 | 
			
		||||
{% from '_common_fields.jinja' import render_common_settings_form %}
 | 
			
		||||
<script>
 | 
			
		||||
    const notification_base_url="{{url_for('ajax_callback_send_notification_test', mode="group-settings")}}";
 | 
			
		||||
    const notification_base_url="{{url_for('ajax_callback_send_notification_test', watch_uuid=uuid)}}";
 | 
			
		||||
</script>
 | 
			
		||||
 | 
			
		||||
<script src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
 | 
			
		||||
 
 | 
			
		||||
@@ -27,7 +27,6 @@
 | 
			
		||||
            <thead>
 | 
			
		||||
            <tr>
 | 
			
		||||
                <th></th>
 | 
			
		||||
                <th># Watches</th>
 | 
			
		||||
                <th>Tag / Label name</th>
 | 
			
		||||
                <th></th>
 | 
			
		||||
            </tr>
 | 
			
		||||
@@ -41,13 +40,12 @@
 | 
			
		||||
                <td colspan="3">No website organisational tags/groups configured</td>
 | 
			
		||||
            </tr>
 | 
			
		||||
            {% endif %}
 | 
			
		||||
            {% for uuid, tag in available_tags  %}
 | 
			
		||||
            {% for uuid, tag in available_tags.items()  %}
 | 
			
		||||
            <tr id="{{ uuid }}" class="{{ loop.cycle('pure-table-odd', 'pure-table-even') }}">
 | 
			
		||||
                <td class="watch-controls">
 | 
			
		||||
                    <a class="link-mute state-{{'on' if tag.notification_muted else 'off'}}" href="{{url_for('tags.mute', uuid=tag.uuid)}}"><img src="{{url_for('static_content', group='images', filename='bell-off.svg')}}" alt="Mute notifications" title="Mute notifications" class="icon icon-mute" ></a>
 | 
			
		||||
                </td>
 | 
			
		||||
                <td>{{ "{:,}".format(tag_count[uuid]) if uuid in tag_count else 0 }}</td>
 | 
			
		||||
                <td class="title-col inline"> <a href="{{url_for('index', tag=uuid) }}">{{ tag.title }}</a></td>
 | 
			
		||||
                <td class="title-col inline">{{tag.title}}</td>
 | 
			
		||||
                <td>
 | 
			
		||||
                    <a class="pure-button pure-button-primary" href="{{ url_for('tags.form_tag_edit', uuid=uuid) }}">Edit</a> 
 | 
			
		||||
                    <a class="pure-button pure-button-primary" href="{{ url_for('tags.delete', uuid=uuid) }}" title="Deletes and removes tag">Delete</a>
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										789
									
								
								changedetectionio/content_fetcher.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										789
									
								
								changedetectionio/content_fetcher.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,789 @@
 | 
			
		||||
from abc import abstractmethod
 | 
			
		||||
from distutils.util import strtobool
 | 
			
		||||
from urllib.parse import urlparse
 | 
			
		||||
import chardet
 | 
			
		||||
import hashlib
 | 
			
		||||
import json
 | 
			
		||||
import os
 | 
			
		||||
import requests
 | 
			
		||||
import sys
 | 
			
		||||
import time
 | 
			
		||||
import urllib.parse
 | 
			
		||||
from loguru import logger
 | 
			
		||||
 | 
			
		||||
# Comma separated element selectors substituted for the '%ELEMENTS%' placeholder
# of the xpath element scraper JS
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary'
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Non200ErrorCodeReceived(Exception):
    """Error carrying the details of a reply whose status code was not 200.

    Keeps the status code, URL, optional screenshot and xpath data; when
    ``page_html`` is supplied, its text rendering is stored in ``page_text``
    (otherwise ``page_text`` is None).
    """

    def __init__(self, status_code, url, screenshot=None, xpath_data=None, page_html=None):
        # Exposed as attributes so other parts of the app can inspect them
        self.status_code, self.url = status_code, url
        self.screenshot, self.xpath_data = screenshot, xpath_data
        self.page_text = None

        if not page_html:
            return

        # Imported lazily - only needed when there is HTML to convert
        from changedetectionio import html_tools
        self.page_text = html_tools.html_to_text(page_html)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class checksumFromPreviousCheckWasTheSame(Exception):
    """Data-less marker exception; the constructor accepts no arguments."""

    def __init__(self):
        pass
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class JSActionExceptions(Exception):
    """Exception holding a status code, URL, screenshot and an optional message."""

    def __init__(self, status_code, url, screenshot, message=''):
        self.status_code, self.url = status_code, url
        self.screenshot, self.message = screenshot, message
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class BrowserStepsStepException(Exception):
    """Wraps the error raised while running one Browser Step.

    ``step_n`` is the number of the failing step and ``original_e`` the
    underlying exception; the failure is also written to the debug log.
    """

    def __init__(self, step_n, original_e):
        self.step_n, self.original_e = step_n, original_e
        logger.debug(f"Browser Steps exception at step {self.step_n} {str(original_e)}")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# @todo - make base Exception class that announces via logger()
 | 
			
		||||
class PageUnloadable(Exception):
    """Exception carrying status code, URL, message and screenshot of a failed load.

    Every argument is optional.
    """

    def __init__(self, status_code=None, url='', message='', screenshot=False):
        # Exposed as attributes so other parts of the app can inspect them
        self.status_code, self.url = status_code, url
        self.screenshot, self.message = screenshot, message
 | 
			
		||||
 | 
			
		||||
class BrowserStepsInUnsupportedFetcher(Exception):
    """Exception that records the URL it was raised for."""

    def __init__(self, url):
        self.url = url
 | 
			
		||||
 | 
			
		||||
class EmptyReply(Exception):
    """Exception holding the status code, URL and optional screenshot of an empty reply."""

    def __init__(self, status_code, url, screenshot=None):
        # Exposed as attributes so other parts of the app can inspect them
        self.status_code, self.url, self.screenshot = status_code, url, screenshot
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ScreenshotUnavailable(Exception):
    """Exception for a fetch that produced no screenshot.

    :param status_code: status code to report (may be None)
    :param url: URL that was being fetched
    :param page_html: optional HTML of the page; when given, its text
        rendering is stored in ``page_text``
    """

    def __init__(self, status_code, url, page_html=None):
        # Set this so we can use it in other parts of the app
        self.status_code = status_code
        self.url = url
        # Always define page_text so reading it never raises AttributeError
        self.page_text = None

        if page_html:
            # Package-qualified import, same as Non200ErrorCodeReceived uses
            from changedetectionio import html_tools
            self.page_text = html_tools.html_to_text(page_html)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ReplyWithContentButNoText(Exception):
    """Exception holding status code, URL, screenshot, filter flag and the HTML received."""

    def __init__(self, status_code, url, screenshot=None, has_filters=False, html_content=''):
        # Exposed as attributes so other parts of the app can inspect them
        self.status_code, self.url = status_code, url
        self.screenshot = screenshot
        self.has_filters, self.html_content = has_filters, html_content
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Fetcher():
    """Base class for the content fetchers: shared state plus Browser Steps helpers.

    Note: the ``@abstractmethod`` markers below are informational only, since
    this class does not use ``ABCMeta`` they are not enforced at instantiation.
    """
    browser_connection_is_custom = None
    browser_connection_url = None
    browser_steps = None
    browser_steps_screenshot_path = None
    content = None
    error = None
    fetcher_description = "No description"
    headers = {}
    instock_data = None
    instock_data_js = ""
    status_code = None
    webdriver_js_execute_code = None
    xpath_data = None
    xpath_element_js = ""

    # Will be needed in the future by the VisualSelector, always get this where possible.
    screenshot = False
    system_http_proxy = os.getenv('HTTP_PROXY')
    system_https_proxy = os.getenv('HTTPS_PROXY')

    # Time ONTOP of the system defined env minimum time
    render_extract_delay = 0

    def __init__(self):
        """Load the JS helper scripts shipped in this package's ``res/`` directory."""
        from pkg_resources import resource_string
        # The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector
        self.xpath_element_js = resource_string(__name__, "res/xpath_element_scraper.js").decode('utf-8')
        self.instock_data_js = resource_string(__name__, "res/stock-not-in-stock.js").decode('utf-8')

    @abstractmethod
    def get_error(self):
        """Return the stored error (``self.error``)."""
        return self.error

    @abstractmethod
    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """Perform the fetch; implementations should set self.error, self.status_code and self.content."""
        # Should set self.error, self.status_code and self.content
        pass

    @abstractmethod
    def quit(self):
        """Shutdown hook; does nothing in the base class."""
        return

    @abstractmethod
    def get_last_status_code(self):
        """Return the stored status code (``self.status_code``)."""
        return self.status_code

    @abstractmethod
    def screenshot_step(self, step_n):
        """Hook for saving a screenshot of Browser Step ``step_n``; the base class does nothing."""
        return None

    @abstractmethod
    # Return true/false if this checker is ready to run, in the case it needs todo some special config check etc
    def is_ready(self):
        """Return True when the fetcher is ready to run; the base class always is."""
        return True

    def get_all_headers(self):
        """
        Get all headers but ensure all keys are lowercase

        :return: dict of lower-cased header name to value; empty when no headers are set
        """
        # self.headers may have been replaced by None (eg. a reply without a 'headers' field)
        return {k.lower(): v for k, v in (self.headers or {}).items()}

    def browser_steps_get_valid_steps(self):
        """Return an iterator over the configured steps worth executing.

        Steps with an empty operation, the 'Choose one' placeholder and
        'Goto site' are left out.

        :return: a ``filter`` iterator, or None when no browser steps are set
        """
        if self.browser_steps is not None and len(self.browser_steps):
            valid_steps = filter(
                lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'),
                self.browser_steps)

            return valid_steps

        return None

    def iterate_browser_steps(self):
        """Run every valid Browser Step against ``self.page``.

        A screenshot and the HTML are saved before and after each step.
        NOTE(review): ``self.page`` and ``self.save_step_html`` are not defined
        on this base class - presumably provided by the subclass; confirm.

        :raises BrowserStepsStepException: when a step raises a Playwright
            ``Error`` or ``TimeoutError``; later steps are not executed
        """
        from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
        from playwright._impl._errors import TimeoutError, Error
        from jinja2 import Environment
        jinja2_env = Environment(extensions=['jinja2_time.TimeExtension'])

        step_n = 0

        if self.browser_steps is not None and len(self.browser_steps):
            interface = steppable_browser_interface()
            interface.page = self.page
            valid_steps = self.browser_steps_get_valid_steps()

            for step in valid_steps:
                step_n += 1
                logger.debug(f">> Iterating check - browser Step n {step_n} - {step['operation']}...")
                self.screenshot_step("before-" + str(step_n))
                self.save_step_html("before-" + str(step_n))
                try:
                    optional_value = step['optional_value']
                    selector = step['selector']
                    # Support for jinja2 template in step values, with date module added
                    if '{%' in step['optional_value'] or '{{' in step['optional_value']:
                        optional_value = str(jinja2_env.from_string(step['optional_value']).render())
                    if '{%' in step['selector'] or '{{' in step['selector']:
                        selector = str(jinja2_env.from_string(step['selector']).render())

                    getattr(interface, "call_action")(action_name=step['operation'],
                                                      selector=selector,
                                                      optional_value=optional_value)
                    self.screenshot_step(step_n)
                    self.save_step_html(step_n)
                except (Error, TimeoutError) as e:
                    logger.debug(str(e))
                    # Stop processing here
                    raise BrowserStepsStepException(step_n=step_n, original_e=e)

    # It's always good to reset these
    def delete_browser_steps_screenshots(self):
        """Delete every ``step_*.jpeg`` file found in ``browser_steps_screenshot_path`` (if set)."""
        import glob
        if self.browser_steps_screenshot_path is not None:
            dest = os.path.join(self.browser_steps_screenshot_path, 'step_*.jpeg')
            files = glob.glob(dest)
            for f in files:
                if os.path.isfile(f):
                    os.unlink(f)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#   Maybe for the future, each fetcher provides its own diff output, could be used for text, image
 | 
			
		||||
#   the current one would return javascript output (as we use JS to generate the diff)
 | 
			
		||||
#
 | 
			
		||||
def available_fetchers():
    """List the fetcher classes defined in this module.

    :return: list of ``(class_name, fetcher_description)`` tuples, one for each
        class in this module whose name starts with ``html_``
    """
    # See the if statement at the bottom of this file for how we switch between playwright and webdriver
    import inspect

    this_module = sys.modules[__name__]
    # @todo html_ is maybe better as fetcher_ or something
    # In this case, make sure to edit the default one in store.py and fetch_site_status.py
    return [
        (cls_name, cls.fetcher_description)
        for cls_name, cls in inspect.getmembers(this_module, inspect.isclass)
        if cls_name.startswith('html_')
    ]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class base_html_playwright(Fetcher):
 | 
			
		||||
    fetcher_description = "Playwright {}/Javascript".format(
 | 
			
		||||
        os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
 | 
			
		||||
    )
 | 
			
		||||
    if os.getenv("PLAYWRIGHT_DRIVER_URL"):
 | 
			
		||||
        fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL"))
 | 
			
		||||
 | 
			
		||||
    browser_type = ''
 | 
			
		||||
    command_executor = ''
 | 
			
		||||
 | 
			
		||||
    # Configs for Proxy setup
 | 
			
		||||
    # In the ENV vars, is prefixed with "playwright_proxy_", so it is for example "playwright_proxy_server"
 | 
			
		||||
    playwright_proxy_settings_mappings = ['bypass', 'server', 'username', 'password']
 | 
			
		||||
 | 
			
		||||
    proxy = None
 | 
			
		||||
 | 
			
		||||
    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        """Resolve browser type, connection URL and proxy settings.

        :param proxy_override: per-watch proxy server URL; replaces any proxy
            assembled from the ``playwright_proxy_*`` environment variables
        :param custom_browser_connection_url: browser endpoint to use instead of
            the ``PLAYWRIGHT_DRIVER_URL`` environment value
        """
        super().__init__()

        self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')

        if not custom_browser_connection_url:
            # Fallback to fetching from system
            # .strip('"') is going to save someone a lot of time when they accidently wrap the env value
            self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"')
        else:
            self.browser_connection_is_custom = True
            self.browser_connection_url = custom_browser_connection_url

        # If any proxy settings are enabled, then we should setup the proxy object
        env_proxy = {
            setting: os.getenv('playwright_proxy_' + setting).strip('"')
            for setting in self.playwright_proxy_settings_mappings
            if os.getenv('playwright_proxy_' + setting, False)
        }
        if env_proxy:
            self.proxy = env_proxy

        # allow per-watch proxy selection override
        if proxy_override:
            self.proxy = {'server': proxy_override}

        if not self.proxy:
            return

        # Playwright needs separate username and password values
        server_parts = urlparse(self.proxy.get('server'))
        if server_parts.username:
            self.proxy.update(username=server_parts.username, password=server_parts.password)
 | 
			
		||||
 | 
			
		||||
    def screenshot_step(self, step_n=''):
        """Take a full-page JPEG of ``self.page`` and store it as ``step_<step_n>.jpeg``.

        The screenshot is always taken; it is only written to disk when
        ``browser_steps_screenshot_path`` is set.
        """
        screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=85)

        if self.browser_steps_screenshot_path is None:
            return

        filename = 'step_{}.jpeg'.format(step_n)
        destination = os.path.join(self.browser_steps_screenshot_path, filename)
        logger.debug(f"Saving step screenshot to {destination}")
        with open(destination, 'wb') as image_file:
            image_file.write(screenshot)
 | 
			
		||||
 | 
			
		||||
    def save_step_html(self, step_n):
        """Write the current HTML of ``self.page`` to ``step_<step_n>.html``.

        Does nothing when ``browser_steps_screenshot_path`` is not set, the same
        way ``screenshot_step`` skips writing in that case.

        :param step_n: step identifier used in the file name
        """
        # Without a target directory os.path.join(None, ...) would raise TypeError
        if self.browser_steps_screenshot_path is None:
            return

        content = self.page.content()
        destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n))
        logger.debug(f"Saving step HTML to {destination}")
        # Explicit UTF-8 so non-ASCII page content does not depend on the locale's default encoding
        with open(destination, 'w', encoding='utf-8') as f:
            f.write(content)
 | 
			
		||||
 | 
			
		||||
    def run_fetch_browserless_puppeteer(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """Fetch ``url`` by POSTing the puppeteer script to a browserless "function" endpoint.

        The endpoint is ``BROWSERLESS_FUNCTION_URL`` or, when that is unset, is
        derived from ``PLAYWRIGHT_DRIVER_URL`` (scheme changed to http, path to
        ``function``). On success sets ``self.status_code``, ``self.headers``,
        ``self.content``, ``self.instock_data``, ``self.screenshot`` and
        ``self.xpath_data``.

        ``timeout``, ``request_body``, ``request_method`` and ``is_binary`` are
        accepted but not used by this method.

        :raises PageUnloadable: on connect/read timeout, unreadable JSON, or a
            non-200 response while ``ignore_status_codes`` is set
        :raises Non200ErrorCodeReceived: non-200 response and ``ignore_status_codes`` is false
        :raises ScreenshotUnavailable: the reply contained no screenshot
        :raises EmptyReply: the reply content was empty/whitespace
        """

        from pkg_resources import resource_string

        extra_wait_ms = (int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) * 1000

        self.xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
        code = resource_string(__name__, "res/puppeteer_fetch.js").decode('utf-8')
        # In the future inject this is a proper JS package
        code = code.replace('%xpath_scrape_code%', self.xpath_element_js)
        code = code.replace('%instock_scrape_code%', self.instock_data_js)

        from requests.exceptions import ConnectTimeout, ReadTimeout
        wait_browserless_seconds = 240

        browserless_function_url = os.getenv('BROWSERLESS_FUNCTION_URL')
        from urllib.parse import urlparse
        if not browserless_function_url:
            # Convert/try to guess from PLAYWRIGHT_DRIVER_URL
            o = urlparse(os.getenv('PLAYWRIGHT_DRIVER_URL'))
            browserless_function_url = o._replace(scheme="http")._replace(path="function").geturl()


        # Append proxy connect string
        if self.proxy:
            # Remove username/password if it exists in the URL or you will receive "ERR_NO_SUPPORTED_PROXIES" error
            # Actual authentication handled by Puppeteer/node
            o = urlparse(self.proxy.get('server'))
            # Only add the port when the proxy URL has one, otherwise "host:None" would be sent
            netloc = o.hostname if o.port is None else "{}:{}".format(o.hostname, o.port)
            proxy_url = urllib.parse.quote(o._replace(netloc=netloc).geturl())
            # The first query argument must be introduced with '?', later ones with '&'
            proxy_sep = '&' if '?' in browserless_function_url else '?'
            browserless_function_url = f"{browserless_function_url}{proxy_sep}--proxy-server={proxy_url}"

        try:
            amp = '&' if '?' in browserless_function_url else '?'
            response = requests.request(
                method="POST",
                json={
                    "code": code,
                    "context": {
                        # Very primitive disk cache - USE WITH EXTREME CAUTION
                        # Run browserless container  with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
                        'disk_cache_dir': os.getenv("PUPPETEER_DISK_CACHE", False), # or path to disk cache ending in /, ie /tmp/cache/
                        'execute_js': self.webdriver_js_execute_code,
                        'extra_wait_ms': extra_wait_ms,
                        'include_filters': current_include_filters,
                        'req_headers': request_headers,
                        'screenshot_quality': int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)),
                        'url': url,
                        'user_agent': {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None),
                        'proxy_username': self.proxy.get('username', '') if self.proxy else False,
                        'proxy_password': self.proxy.get('password', '') if self.proxy and self.proxy.get('username') else False,
                        'no_cache_list': [
                            'twitter',
                            '.pdf'
                        ],
                        # Could use https://github.com/easylist/easylist here, or install a plugin
                        'block_url_list': [
                            'adnxs.com',
                            'analytics.twitter.com',
                            'doubleclick.net',
                            'google-analytics.com',
                            'googletagmanager',
                            'trustpilot.com'
                        ]
                    }
                },
                # @todo /function needs adding ws:// to http:// rebuild this
                url=browserless_function_url+f"{amp}--disable-features=AudioServiceOutOfProcess&dumpio=true&--disable-remote-fonts",
                timeout=wait_browserless_seconds)

        except ReadTimeout:
            raise PageUnloadable(url=url, status_code=None, message=f"No response from browserless in {wait_browserless_seconds}s")
        except ConnectTimeout:
            raise PageUnloadable(url=url, status_code=None, message=f"Timed out connecting to browserless, retrying..")
        else:
            # 200 Here means that the communication to browserless worked only, not the page state
            try:
                x = response.json()
            except Exception as e:
                raise PageUnloadable(url=url, message="Error reading JSON response from browserless")

            try:
                self.status_code = response.status_code
            except Exception as e:
                raise PageUnloadable(url=url, message="Error reading status_code code response from browserless")

            self.headers = x.get('headers')

            if self.status_code != 200 and not ignore_status_codes:
                raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, page_html=x.get('content',''))

            if self.status_code == 200:
                import base64

                if not x.get('screenshot'):
                    # https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips
                    # https://github.com/puppeteer/puppeteer/issues/1834
                    # https://github.com/puppeteer/puppeteer/issues/1834#issuecomment-381047051
                    # Check your memory is shared and big enough
                    raise ScreenshotUnavailable(url=url, status_code=None)

                if not x.get('content', '').strip():
                    raise EmptyReply(url=url, status_code=None)

                self.content = x.get('content')
                self.instock_data = x.get('instock_data')
                self.screenshot = base64.b64decode(x.get('screenshot'))
                self.xpath_data = x.get('xpath_data')
            else:
                # Some other error from browserless
                raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8'))
 | 
			
		||||
 | 
			
		||||
    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """
        Fetch `url` with a remote browser (connected over CDP via Playwright) and store the
        result on this fetcher: headers, status_code, content, screenshot, xpath_data, instock_data.

        When the USE_EXPERIMENTAL_PUPPETEER_FETCH env var is enabled, and neither a custom browser
        connection nor browser steps are in use, the whole fetch is delegated to
        run_fetch_browserless_puppeteer() and its return value is returned.

        :param url: Page to load.
        :param timeout: Forwarded to the puppeteer fetch only; the Playwright connection uses a fixed 60s timeout.
        :param request_headers: dict of extra HTTP headers, a 'user-agent' entry (any case) becomes the browser user agent.
        :param request_body: Forwarded to the puppeteer fetch only.
        :param request_method: Forwarded to the puppeteer fetch only.
        :param ignore_status_codes: When true, a non-200 status is not raised as an error.
        :param current_include_filters: Exposed to the page as the JS variable `include_filters`.
        :param is_binary: Forwarded to the puppeteer fetch only.
        :raises EmptyReply: No response object was returned, or the page content was empty.
        :raises PageUnloadable: The custom JS failed (other than by timeout) or the response had no status.
        :raises Non200ErrorCodeReceived: Status was not 200 and ignore_status_codes is false.
        :raises ScreenshotUnavailable: The final screenshot could not be taken.
        """

        # For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps (for now!)
        # browser_connection_is_custom doesnt work with puppeteer style fetch (use playwright native too in this case)
        if not self.browser_connection_is_custom and not self.browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
            if strtobool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH')):
                # Temporary backup solution until we rewrite the playwright code
                return self.run_fetch_browserless_puppeteer(
                    url,
                    timeout,
                    request_headers,
                    request_body,
                    request_method,
                    ignore_status_codes,
                    current_include_filters,
                    is_binary)

        from playwright.sync_api import sync_playwright
        import playwright._impl._errors

        self.delete_browser_steps_screenshots()
        response = None

        with sync_playwright() as p:
            browser_type = getattr(p, self.browser_type)

            # Seemed to cause a connection Exception even tho I can see it connect
            # self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000)
            # 60,000 connection timeout only
            browser = browser_type.connect_over_cdp(self.browser_connection_url, timeout=60000)

            # SOCKS5 with authentication is not supported (yet)
            # https://github.com/microsoft/playwright/issues/10567

            # Set user agent to prevent Cloudflare from blocking the browser
            # Use the default one configured in the App.py model that's passed from fetch_site_status.py
            context = browser.new_context(
                user_agent={k.lower(): v for k, v in request_headers.items()}.get('user-agent', None),
                proxy=self.proxy,
                # This is needed to enable JavaScript execution on GitHub and others
                bypass_csp=True,
                # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
                service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),
                # Should never be needed
                accept_downloads=False
            )

            self.page = context.new_page()
            if len(request_headers):
                context.set_extra_http_headers(request_headers)

            # Listen for all console events and handle errors
            self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))

            # Re-use as much code from browser steps as possible so its the same
            from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
            browsersteps_interface = steppable_browser_interface()
            browsersteps_interface.page = self.page

            response = browsersteps_interface.action_goto_url(value=url)

            if response is None:
                context.close()
                browser.close()
                logger.debug("Content Fetcher > Response object was none")
                raise EmptyReply(url=url, status_code=None)

            # Only read the headers once we know a response object exists
            # (reading them before the None check would raise AttributeError instead of EmptyReply)
            self.headers = response.all_headers()

            try:
                if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
                    browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)
            except playwright._impl._errors.TimeoutError:
                # This can be ok, we will try to grab what we could retrieve.
                # The context and browser must stay open here because the page is still used below.
                pass
            except Exception as e:
                logger.debug(f"Content Fetcher > Other exception when executing custom JS code {str(e)}")
                context.close()
                browser.close()
                raise PageUnloadable(url=url, status_code=None, message=str(e))

            extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
            self.page.wait_for_timeout(extra_wait * 1000)

            try:
                self.status_code = response.status
            except Exception as e:
                # https://github.com/dgtlmoon/changedetection.io/discussions/2122#discussioncomment-8241962
                logger.critical(f"Response from browserless/playwright did not have a status_code! Response follows.")
                logger.critical(response)
                context.close()
                browser.close()
                raise PageUnloadable(url=url, status_code=None, message=str(e))

            if self.status_code != 200 and not ignore_status_codes:
                # Grab a screenshot of the error page first, then release the browser before raising
                screenshot = self.page.screenshot(type='jpeg', full_page=True,
                                                  quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)))
                context.close()
                browser.close()
                raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)

            if len(self.page.content().strip()) == 0:
                context.close()
                browser.close()
                logger.debug("Content Fetcher > Content was empty")
                raise EmptyReply(url=url, status_code=response.status)

            # Run Browser Steps here
            if self.browser_steps_get_valid_steps():
                self.iterate_browser_steps()

            self.page.wait_for_timeout(extra_wait * 1000)

            # So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
            if current_include_filters is not None:
                self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
            else:
                self.page.evaluate("var include_filters=''")

            self.xpath_data = self.page.evaluate(
                "async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}")
            self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}")

            self.content = self.page.content()
            # Bug 3 in Playwright screenshot handling
            # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
            # JPEG is better here because the screenshots can be very very large

            # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
            # which will significantly increase the IO size between the server and client, it's recommended to use the lowest
            # acceptable screenshot quality here
            try:
                # The actual screenshot
                self.screenshot = self.page.screenshot(type='jpeg', full_page=True,
                                                       quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)))
            except Exception as e:
                context.close()
                browser.close()
                # Playwright's Response exposes `.status` (already stored in self.status_code), not `.status_code`
                raise ScreenshotUnavailable(url=url, status_code=self.status_code)

            context.close()
            browser.close()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class base_html_webdriver(Fetcher):
    """
    Fetcher that drives a remote Chrome through Selenium WebDriver.

    The WebDriver endpoint comes from the WEBDRIVER_URL env var (default
    'http://browser-chrome:4444/wd/hub') unless a custom connection URL is given.
    Proxy settings are read from `webdriver_<setting>` env vars, falling back to the
    system HTTP_PROXY/HTTPS_PROXY values held on the Fetcher base class.
    """
    if os.getenv("WEBDRIVER_URL"):
        fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
    else:
        fetcher_description = "WebDriver Chrome/Javascript"

    # Configs for Proxy setup
    # In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
    selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
                                        'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
                                        'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
    proxy = None
    # Assigned by run()/is_ready(); defaulted here so quit() is safe before any session was opened
    driver = None

    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        """
        :param proxy_override: Optional proxy URL that replaces the HTTP proxy for this fetcher instance.
        :param custom_browser_connection_url: Optional WebDriver URL used instead of WEBDRIVER_URL.
        """
        super().__init__()
        from selenium.webdriver.common.proxy import Proxy as SeleniumProxy

        # .strip('"') is going to save someone a lot of time when they accidently wrap the env value
        if not custom_browser_connection_url:
            self.browser_connection_url = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')
        else:
            self.browser_connection_is_custom = True
            self.browser_connection_url = custom_browser_connection_url

        # If any proxy settings are enabled, then we should setup the proxy object
        proxy_args = {}
        for k in self.selenium_proxy_settings_mappings:
            v = os.getenv('webdriver_' + k, False)
            if v:
                proxy_args[k] = v.strip('"')

        # Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy
        # proxy_args is keyed by the bare setting name (no 'webdriver_' prefix), and Selenium's
        # key for the HTTPS proxy is 'sslProxy'; explicit webdriver_* values take precedence.
        if not proxy_args.get('httpProxy') and self.system_http_proxy:
            proxy_args['httpProxy'] = self.system_http_proxy
        if not proxy_args.get('sslProxy') and self.system_https_proxy:
            proxy_args['sslProxy'] = self.system_https_proxy

        # Allows override the proxy on a per-request basis
        if proxy_override is not None:
            proxy_args['httpProxy'] = proxy_override

        if proxy_args:
            self.proxy = SeleniumProxy(raw=proxy_args)

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """
        Load `url` in a remote Chrome session and store page source, an empty header dict,
        a PNG screenshot and a fixed status code of 200 on this fetcher.

        Only `url` is used; the remaining parameters are accepted for interface
        compatibility with the other fetchers.

        :raises WebDriverException: Re-raised (after quitting the session) when the page load fails.
        """

        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options as ChromeOptions
        from selenium.common.exceptions import WebDriverException
        # request_body, request_method unused for now, until some magic in the future happens.

        options = ChromeOptions()
        if self.proxy:
            options.proxy = self.proxy

        self.driver = webdriver.Remote(
            command_executor=self.browser_connection_url,
            options=options)

        try:
            self.driver.get(url)
        except WebDriverException as e:
            # Be sure we close the session window
            self.quit()
            raise

        self.driver.set_window_size(1280, 1024)
        self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))

        if self.webdriver_js_execute_code is not None:
            self.driver.execute_script(self.webdriver_js_execute_code)
            # Selenium doesn't automatically wait for actions as good as Playwright, so wait again
            self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))

        # @todo - how to check this? is it possible?
        self.status_code = 200
        # @todo somehow we should try to get this working for WebDriver
        # raise EmptyReply(url=url, status_code=r.status_code)

        # @todo - dom wait loaded?
        time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
        self.content = self.driver.page_source
        self.headers = {}

        self.screenshot = self.driver.get_screenshot_as_png()

    # Does the connection to the webdriver work? run a test connection.
    def is_ready(self):
        """Open and immediately quit a remote session; returns True when that did not raise."""
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options as ChromeOptions

        # Use the same endpoint as run(); this class never sets a `command_executor` attribute
        self.driver = webdriver.Remote(
            command_executor=self.browser_connection_url,
            options=ChromeOptions())

        # driver.quit() seems to cause better exceptions
        self.quit()
        return True

    def quit(self):
        """Quit the remote browser session if one exists; shutdown errors are logged, not raised."""
        if self.driver:
            try:
                self.driver.quit()
            except Exception as e:
                logger.debug(f"Content Fetcher > Exception in chrome shutdown/quit {str(e)}")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# "html_requests" is listed as the default fetcher in store.py!
 | 
			
		||||
class html_requests(Fetcher):
    """Plain HTTP fetcher built on `requests`; does not support browser steps."""
    fetcher_description = "Basic fast Plaintext/HTTP Client"

    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        """
        :param proxy_override: Optional proxy URL used for http, https and ftp instead of the system proxies.
        :param custom_browser_connection_url: Accepted for interface compatibility; not used here.
        """
        super().__init__()
        # browser_connection_url is none because its always 'launched locally'
        self.proxy_override = proxy_override

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """
        Perform the HTTP request and store headers, status_code, content and raw_content.

        For binary responses `content` is the MD5 hex digest of the body, otherwise the decoded text.
        `request_headers` gets a default 'User-Agent' added in place when it has none.

        :raises BrowserStepsInUnsupportedFetcher: Browser steps are configured for this fetcher.
        :raises EmptyReply: The response body was empty.
        :raises Non200ErrorCodeReceived: Status was not 200 and ignore_status_codes is false.
        """

        if self.browser_steps_get_valid_steps():
            raise BrowserStepsInUnsupportedFetcher(url=url)

        # Make requests use a more modern looking user-agent
        lowercased_headers = {name.lower(): value for name, value in request_headers.items()}
        if not lowercased_headers.get('user-agent', None):
            default_ua = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT",
                                   'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36')
            request_headers['User-Agent'] = default_ua

        # Allows override the proxy on a per-request basis
        # https://requests.readthedocs.io/en/latest/user/advanced/#socks
        # Should also work with `socks5://user:pass@host:port` type syntax.
        if self.proxy_override:
            proxies = dict.fromkeys(('http', 'https', 'ftp'), self.proxy_override)
        else:
            proxies = {}
            if self.system_http_proxy:
                proxies['http'] = self.system_http_proxy
            if self.system_https_proxy:
                proxies['https'] = self.system_https_proxy

        r = requests.request(method=request_method,
                             url=url,
                             data=request_body,
                             headers=request_headers,
                             proxies=proxies,
                             timeout=timeout,
                             verify=False)

        # If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks.
        # For example - some sites don't tell us it's utf-8, but return utf-8 content
        # This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably.
        # https://github.com/psf/requests/issues/1604 good info about requests encoding detection
        # Don't run this for PDF (and requests identified as binary) takes a _long_ time
        if not is_binary:
            content_type = r.headers.get('content-type')
            if not content_type or 'charset=' not in content_type:
                detected_encoding = chardet.detect(r.content)['encoding']
                if detected_encoding:
                    r.encoding = detected_encoding

        self.headers = r.headers

        if not r.content:
            raise EmptyReply(url=url, status_code=r.status_code)

        # @todo test this
        # @todo maybe you really want to test zero-byte return pages?
        if not ignore_status_codes and r.status_code != 200:
            # maybe check with content works?
            raise Non200ErrorCodeReceived(url=url, status_code=r.status_code, page_html=r.text)

        self.status_code = r.status_code
        # Binary files just return their checksum until we add something smarter
        self.content = hashlib.md5(r.content).hexdigest() if is_binary else r.text

        self.raw_content = r.content
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# System-wide (not per-site) choice of the class that backs 'html_webdriver':
# the Playwright fetcher when PLAYWRIGHT_DRIVER_URL is set to a non-empty value,
# otherwise the Selenium WebDriver fetcher.
use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
html_webdriver = base_html_playwright if use_playwright_as_chrome_fetcher else base_html_webdriver
 | 
			
		||||
@@ -1,43 +0,0 @@
 | 
			
		||||
import sys
 | 
			
		||||
from changedetectionio.strtobool import strtobool
 | 
			
		||||
from loguru import logger
 | 
			
		||||
from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException
 | 
			
		||||
import os
 | 
			
		||||
 | 
			
		||||
# Comma-separated list of HTML element names.
# NOTE(review): presumably the value substituted for the '%ELEMENTS%' placeholder in the
# element-scraper JS used by the fetchers - confirm against the fetcher implementations.
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary'
 | 
			
		||||
 | 
			
		||||
# available_fetchers() will scan this implementation looking for anything starting with html_
 | 
			
		||||
# this information is used in the form selections
 | 
			
		||||
from changedetectionio.content_fetchers.requests import fetcher as html_requests
 | 
			
		||||
 | 
			
		||||
def available_fetchers():
    """
    List the fetcher classes exposed by this module.

    Every class visible in this module whose name starts with 'html_' is reported
    (see the if statement at the bottom of this file for how 'html_webdriver' is chosen).

    :return: list of (name, fetcher_description) tuples, ordered by name.
    """
    import inspect

    this_module = sys.modules[__name__]
    # @todo html_ is maybe better as fetcher_ or something
    # In this case, make sure to edit the default one in store.py and fetch_site_status.py
    return [
        (class_name, klass.fetcher_description)
        for class_name, klass in inspect.getmembers(this_module, inspect.isclass)
        if class_name.startswith('html_')
    ]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# System-wide (not per-site) selection of the implementation behind 'html_webdriver'.
# Without PLAYWRIGHT_DRIVER_URL the Selenium fetcher is used; with it, Playwright is used
# unless FAST_PUPPETEER_CHROME_FETCHER is enabled, which selects the Puppeteer fetcher.
use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
if not use_playwright_as_chrome_fetcher:
    logger.debug("Falling back to selenium as fetcher")
    from .webdriver_selenium import fetcher as html_webdriver
# @note - For now, browser steps always uses playwright
elif strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')):
    logger.debug('Using direct Python Puppeteer library as fetcher')
    from .puppeteer import fetcher as html_webdriver
else:
    logger.debug('Using Playwright library as fetcher')
    from .playwright import fetcher as html_webdriver
 | 
			
		||||
 | 
			
		||||
@@ -1,171 +0,0 @@
 | 
			
		||||
import os
 | 
			
		||||
from abc import abstractmethod
 | 
			
		||||
from loguru import logger
 | 
			
		||||
 | 
			
		||||
from changedetectionio.content_fetchers import BrowserStepsStepException
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def manage_user_agent(headers, current_ua=''):
    """
    Pick the user-agent string to use for a browser fetch.

    A non-empty 'User-Agent' entry in `headers` (key matched case-insensitively) wins.
    Otherwise `current_ua` is returned with 'HeadlessChrome' rewritten to 'Chrome'.

    NOTE: this only adjusts the user-agent string. The service doing the actual Chrome
    fetching has to deal with every other way a robot can be detected, for example:
    - Scraping of the 'navigator' browser object (platform, productSub, vendor, oscpu, appVersion etc)
    - TCP/IP fingerprinting (JA3 etc)
    - Graphic rendering fingerprinting
    - Your IP being obviously in a pool of bad actors
    - Too many requests
    - Scraping of Sec-CH-UA browser replies
    - Scraping of ServiceWorker, new window calls etc

    See https://filipvitas.medium.com/how-to-set-user-agent-header-with-puppeteer-js-and-not-fail-28c7a02165da
    Puppeteer requests https://github.com/dgtlmoon/pyppeteerstealth

    :param headers: dict of custom request headers.
    :param current_ua: user-agent currently reported by the browser, if known.
    :return: the user-agent string to use, or None when neither source provides one.
    """
    # Only the first header whose name matches is considered
    custom_ua = None
    for header_name, header_value in headers.items():
        if header_name.lower() == "user-agent":
            custom_ua = header_value
            break

    if custom_ua:
        return custom_ua

    # No usable custom value: if the browser is obviously ChromeHeadless, make it look like regular Chrome
    if current_ua:
        return current_ua.replace('HeadlessChrome', 'Chrome')

    return None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Fetcher():
    """
    Base class for the content fetchers.

    Holds the result of a fetch (content, headers, status_code, screenshot, xpath_data,
    instock_data) as attributes and provides the shared browser-steps helpers.
    Subclasses are expected to override run() and, where relevant, quit(), is_ready(),
    screenshot_step() and save_step_html().
    """
    browser_connection_is_custom = None
    browser_connection_url = None
    browser_steps = None
    browser_steps_screenshot_path = None
    content = None
    error = None
    fetcher_description = "No description"
    headers = {}
    instock_data = None
    instock_data_js = ""
    status_code = None
    webdriver_js_execute_code = None
    xpath_data = None
    xpath_element_js = ""

    # Will be needed in the future by the VisualSelector, always get this where possible.
    screenshot = False
    system_http_proxy = os.getenv('HTTP_PROXY')
    system_https_proxy = os.getenv('HTTPS_PROXY')

    # Time ONTOP of the system defined env minimum time
    render_extract_delay = 0

    def __init__(self):
        """Load the JS snippets that are injected into fetched pages from the package resources."""
        from pkg_resources import resource_string
        # The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector
        self.xpath_element_js = resource_string(__name__, "res/xpath_element_scraper.js").decode('utf-8')
        self.instock_data_js = resource_string(__name__, "res/stock-not-in-stock.js").decode('utf-8')

    @abstractmethod
    def get_error(self):
        """Return the stored error (None when no error was recorded)."""
        return self.error

    @abstractmethod
    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """Perform the fetch. Implementations should set self.error, self.status_code and self.content."""
        # Should set self.error, self.status_code and self.content
        pass

    @abstractmethod
    def quit(self):
        """Release any resources held by the fetcher; the base implementation does nothing."""
        return

    @abstractmethod
    def get_last_status_code(self):
        """Return the status code stored by the last fetch."""
        return self.status_code

    @abstractmethod
    def screenshot_step(self, step_n):
        """Hook called around each browser step; the base implementation does nothing and returns None."""
        return None

    @abstractmethod
    def is_ready(self):
        """Return true/false if this checker is ready to run, in the case it needs todo some special config check etc."""
        return True

    def get_all_headers(self):
        """
        Get all headers but ensure all keys are lowercase
        :return: dict of the stored headers with lowercased keys
        """
        return {k.lower(): v for k, v in self.headers.items()}

    def browser_steps_get_valid_steps(self):
        """
        Return the configured browser steps that describe a real action.

        Steps with an empty operation, or with the operation 'Choose one' or 'Goto site',
        are dropped.

        :return: None when no browser steps are configured, otherwise a list of the valid steps.
                 A list (not a lazy `filter` object, which is always truthy) is returned so that
                 callers can use the result directly in a truth test.
        """
        if self.browser_steps is not None and len(self.browser_steps):
            return [
                step for step in self.browser_steps
                if step['operation'] and step['operation'] not in ('Choose one', 'Goto site')
            ]

        return None

    def iterate_browser_steps(self):
        """
        Execute every valid browser step against self.page, calling screenshot_step() and
        save_step_html() before and after each one.

        NOTE(review): self.page is not defined on this class; presumably the subclass assigns
        it before calling this method - confirm against the callers.

        :raises BrowserStepsStepException: A step raised a Playwright Error/TimeoutError; processing stops there.
        """
        from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
        from playwright._impl._errors import TimeoutError, Error
        from jinja2 import Environment
        jinja2_env = Environment(extensions=['jinja2_time.TimeExtension'])

        step_n = 0

        if self.browser_steps is not None and len(self.browser_steps):
            interface = steppable_browser_interface()
            interface.page = self.page
            valid_steps = self.browser_steps_get_valid_steps()

            for step in valid_steps:
                step_n += 1
                logger.debug(f">> Iterating check - browser Step n {step_n} - {step['operation']}...")
                self.screenshot_step("before-" + str(step_n))
                self.save_step_html("before-" + str(step_n))
                try:
                    optional_value = step['optional_value']
                    selector = step['selector']
                    # Support for jinja2 template in step values, with date module added
                    if '{%' in step['optional_value'] or '{{' in step['optional_value']:
                        optional_value = str(jinja2_env.from_string(step['optional_value']).render())
                    if '{%' in step['selector'] or '{{' in step['selector']:
                        selector = str(jinja2_env.from_string(step['selector']).render())

                    interface.call_action(action_name=step['operation'],
                                          selector=selector,
                                          optional_value=optional_value)
                    self.screenshot_step(step_n)
                    self.save_step_html(step_n)
                except (Error, TimeoutError) as e:
                    logger.debug(str(e))
                    # Stop processing here
                    raise BrowserStepsStepException(step_n=step_n, original_e=e)

    # It's always good to reset these
    def delete_browser_steps_screenshots(self):
        """Delete every 'step_*.jpeg' file in browser_steps_screenshot_path (no-op when the path is unset)."""
        import glob
        if self.browser_steps_screenshot_path is not None:
            dest = os.path.join(self.browser_steps_screenshot_path, 'step_*.jpeg')
            files = glob.glob(dest)
            for f in files:
                if os.path.isfile(f):
                    os.unlink(f)

    def save_step_html(self, param):
        """Hook for persisting the page HTML of a browser step; the base implementation does nothing."""
        pass
 | 
			
		||||
@@ -1,97 +0,0 @@
 | 
			
		||||
from loguru import logger
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Non200ErrorCodeReceived(Exception):
    """Exception carrying the details of a response whose status code was not 200.

    The constructor arguments are stored as attributes of the same name.
    ``page_text`` is the text rendering of ``page_html`` when HTML was supplied,
    otherwise ``None``.
    """

    def __init__(self, status_code, url, screenshot=None, xpath_data=None, page_html=None):
        # Kept on the instance so other parts of the app can read them back
        self.status_code, self.url = status_code, url
        self.screenshot, self.xpath_data = screenshot, xpath_data
        self.page_text = None

        if not page_html:
            return

        # Imported lazily: only needed when there is HTML to convert
        from changedetectionio import html_tools
        self.page_text = html_tools.html_to_text(page_html)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class checksumFromPreviousCheckWasTheSame(Exception):
    """Exception that takes no constructor arguments and stores no state."""

    def __init__(self):
        pass
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class JSActionExceptions(Exception):
    """Exception carrying ``status_code``, ``url``, ``screenshot`` and an optional ``message``."""

    def __init__(self, status_code, url, screenshot, message=''):
        self.status_code, self.url = status_code, url
        self.screenshot, self.message = screenshot, message
 | 
			
		||||
 | 
			
		||||
class BrowserConnectError(Exception):
    """Exception for a failed browser connection; logs ``msg`` at error level when constructed."""

    # Class-level default, replaced per instance by the constructor
    msg = ''

    def __init__(self, msg):
        logger.error(f"Browser connection error {msg}")
        self.msg = msg
 | 
			
		||||
 | 
			
		||||
class BrowserFetchTimedOut(Exception):
    """Exception for a browser fetch that ran too long; logs ``msg`` at error level when constructed."""

    # Class-level default, replaced per instance by the constructor
    msg = ''

    def __init__(self, msg):
        logger.error(f"Browser processing took too long - {msg}")
        self.msg = msg
 | 
			
		||||
 | 
			
		||||
class BrowserStepsStepException(Exception):
    """Wraps an error raised while running a browser step.

    ``step_n`` is the step identifier supplied by the caller and ``original_e``
    the underlying exception; both are logged at debug level on construction.
    """

    def __init__(self, step_n, original_e):
        self.step_n, self.original_e = step_n, original_e
        logger.debug(f"Browser Steps exception at step {self.step_n} {str(original_e)}")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# @todo - make base Exception class that announces via logger()
 | 
			
		||||
class PageUnloadable(Exception):
    """Exception carrying ``status_code``, ``url``, ``message`` and ``screenshot``; every argument is optional."""

    def __init__(self, status_code=None, url='', message='', screenshot=False):
        # Kept on the instance so other parts of the app can read them back
        self.status_code, self.url = status_code, url
        self.screenshot, self.message = screenshot, message
 | 
			
		||||
 | 
			
		||||
class BrowserStepsInUnsupportedFetcher(Exception):
    """Exception that records the ``url`` it was constructed with."""

    def __init__(self, url):
        setattr(self, 'url', url)
 | 
			
		||||
 | 
			
		||||
class EmptyReply(Exception):
    """Exception carrying ``status_code``, ``url`` and an optional ``screenshot``."""

    def __init__(self, status_code, url, screenshot=None):
        # Kept on the instance so other parts of the app can read them back
        self.status_code, self.url, self.screenshot = status_code, url, screenshot
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ScreenshotUnavailable(Exception):
    """Exception for a fetch whose screenshot could not be produced.

    Attributes:
        status_code: status code supplied by the caller.
        url: URL supplied by the caller.
        page_text: text rendering of ``page_html`` when HTML was supplied,
            otherwise ``None``.
    """

    def __init__(self, status_code, url, page_html=None):
        # Set this so we can use it in other parts of the app
        self.status_code = status_code
        self.url = url
        # Always defined, so reading it never raises AttributeError when no HTML was given
        self.page_text = None

        if page_html:
            # Same package-qualified import that Non200ErrorCodeReceived uses;
            # a bare "html_tools" is not a top-level module
            from changedetectionio import html_tools
            self.page_text = html_tools.html_to_text(page_html)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ReplyWithContentButNoText(Exception):
    """Exception carrying ``status_code``, ``url``, ``screenshot``, ``has_filters`` and ``html_content``."""

    def __init__(self, status_code, url, screenshot=None, has_filters=False, html_content=''):
        # Kept on the instance so other parts of the app can read them back
        self.status_code, self.url, self.screenshot = status_code, url, screenshot
        self.has_filters, self.html_content = has_filters, html_content
 | 
			
		||||
@@ -1,208 +0,0 @@
 | 
			
		||||
import json
 | 
			
		||||
import os
 | 
			
		||||
from urllib.parse import urlparse
 | 
			
		||||
 | 
			
		||||
from loguru import logger
 | 
			
		||||
 | 
			
		||||
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
 | 
			
		||||
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
 | 
			
		||||
 | 
			
		||||
class fetcher(Fetcher):
    """Content fetcher that drives a remote browser through Playwright's sync API.

    The browser endpoint is ``custom_browser_connection_url`` when given,
    otherwise the ``PLAYWRIGHT_DRIVER_URL`` environment variable. Proxy settings
    come from the ``playwright_proxy_*`` environment variables unless a
    per-watch ``proxy_override`` is supplied.
    """

    fetcher_description = "Playwright {}/Javascript".format(
        os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
    )
    if os.getenv("PLAYWRIGHT_DRIVER_URL"):
        fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL"))

    browser_type = ''
    command_executor = ''

    # Configs for Proxy setup
    # In the ENV vars, is prefixed with "playwright_proxy_", so it is for example "playwright_proxy_server"
    playwright_proxy_settings_mappings = ['bypass', 'server', 'username', 'password']

    proxy = None

    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        """Resolve the browser type, connection URL and proxy configuration.

        :param proxy_override: proxy server URL that replaces any proxy configured via environment variables.
        :param custom_browser_connection_url: browser endpoint to use instead of ``PLAYWRIGHT_DRIVER_URL``.
        """
        super().__init__()

        self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')

        if custom_browser_connection_url:
            self.browser_connection_is_custom = True
            self.browser_connection_url = custom_browser_connection_url
        else:
            # Fallback to fetching from system
            # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value
            self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"')

        # If any proxy settings are enabled, then we should setup the proxy object
        proxy_args = {}
        for k in self.playwright_proxy_settings_mappings:
            v = os.getenv('playwright_proxy_' + k, False)
            if v:
                proxy_args[k] = v.strip('"')

        if proxy_args:
            self.proxy = proxy_args

        # allow per-watch proxy selection override
        if proxy_override:
            self.proxy = {'server': proxy_override}

        if self.proxy:
            # Playwright needs separate username and password values
            parsed = urlparse(self.proxy.get('server'))
            if parsed.username:
                self.proxy['username'] = parsed.username
                self.proxy['password'] = parsed.password

    def screenshot_step(self, step_n=''):
        """Take a full-page JPEG screenshot and, when a screenshot path is configured, save it as ``step_<step_n>.jpeg``."""
        screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))

        if self.browser_steps_screenshot_path is not None:
            destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n))
            logger.debug(f"Saving step screenshot to {destination}")
            with open(destination, 'wb') as f:
                f.write(screenshot)

    def save_step_html(self, step_n):
        """Save the current page HTML as ``step_<step_n>.html`` when a screenshot path is configured."""
        # Same guard as screenshot_step(): without it os.path.join(None, ...) raises TypeError
        if self.browser_steps_screenshot_path is None:
            return

        content = self.page.content()
        destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n))
        logger.debug(f"Saving step HTML to {destination}")
        with open(destination, 'w') as f:
            f.write(content)

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """Fetch ``url`` with the remote browser and store the results on ``self``.

        On success this sets ``headers``, ``status_code``, ``xpath_data``,
        ``instock_data``, ``content`` and ``screenshot``.

        :raises EmptyReply: no response object was returned, or the page content is empty.
        :raises PageUnloadable: the custom JS failed (other than by timing out) or the response had no status.
        :raises Non200ErrorCodeReceived: the status is not 200 and ``ignore_status_codes`` is false.
        :raises ScreenshotUnavailable: the final screenshot could not be taken.
        """
        from playwright.sync_api import sync_playwright
        import playwright._impl._errors
        from changedetectionio.content_fetchers import visualselector_xpath_selectors
        self.delete_browser_steps_screenshots()
        response = None

        with sync_playwright() as p:
            browser_type = getattr(p, self.browser_type)

            # Seemed to cause a connection Exception even tho I can see it connect
            # self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000)
            # 60,000 connection timeout only
            browser = browser_type.connect_over_cdp(self.browser_connection_url, timeout=60000)

            # SOCKS5 with authentication is not supported (yet)
            # https://github.com/microsoft/playwright/issues/10567

            # Set user agent to prevent Cloudflare from blocking the browser
            # Use the default one configured in the App.py model that's passed from fetch_site_status.py
            context = browser.new_context(
                accept_downloads=False,  # Should never be needed
                bypass_csp=True,  # This is needed to enable JavaScript execution on GitHub and others
                extra_http_headers=request_headers,
                ignore_https_errors=True,
                proxy=self.proxy,
                service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),  # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
                user_agent=manage_user_agent(headers=request_headers),
            )

            self.page = context.new_page()

            # Listen for all console events and handle errors
            self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))

            # Re-use as much code from browser steps as possible so its the same
            from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
            browsersteps_interface = steppable_browser_interface()
            browsersteps_interface.page = self.page

            response = browsersteps_interface.action_goto_url(value=url)

            # Check for a missing response BEFORE touching it, otherwise the
            # EmptyReply path below can never be reached (AttributeError on None)
            if response is None:
                context.close()
                browser.close()
                logger.debug("Content Fetcher > Response object was none")
                raise EmptyReply(url=url, status_code=None)

            self.headers = response.all_headers()

            try:
                if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
                    browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)
            except playwright._impl._errors.TimeoutError as e:
                # This can be ok, we will try to grab what we could retrieve.
                # The context and browser are deliberately left open: the code below still uses the page.
                logger.debug(f"Content Fetcher > Timeout when executing custom JS code, continuing {str(e)}")
            except Exception as e:
                logger.debug(f"Content Fetcher > Other exception when executing custom JS code {str(e)}")
                context.close()
                browser.close()
                raise PageUnloadable(url=url, status_code=None, message=str(e))

            extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
            self.page.wait_for_timeout(extra_wait * 1000)

            try:
                self.status_code = response.status
            except Exception as e:
                # https://github.com/dgtlmoon/changedetection.io/discussions/2122#discussioncomment-8241962
                logger.critical("Response from the browser/Playwright did not have a status_code! Response follows.")
                logger.critical(response)
                context.close()
                browser.close()
                raise PageUnloadable(url=url, status_code=None, message=str(e))

            if self.status_code != 200 and not ignore_status_codes:
                screenshot = self.page.screenshot(type='jpeg', full_page=True,
                                                  quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
                # Release the browser like every other error path, once the screenshot is captured
                context.close()
                browser.close()
                raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)

            if len(self.page.content().strip()) == 0:
                context.close()
                browser.close()
                logger.debug("Content Fetcher > Content was empty")
                raise EmptyReply(url=url, status_code=response.status)

            # Run Browser Steps here
            if self.browser_steps_get_valid_steps():
                self.iterate_browser_steps()

            self.page.wait_for_timeout(extra_wait * 1000)

            # So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
            if current_include_filters is not None:
                self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
            else:
                self.page.evaluate("var include_filters=''")

            self.xpath_data = self.page.evaluate(
                "async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}")
            self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}")

            self.content = self.page.content()
            # Bug 3 in Playwright screenshot handling
            # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
            # JPEG is better here because the screenshots can be very very large

            # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
            # which will significantly increase the IO size between the server and client, it's recommended to use the lowest
            # acceptable screenshot quality here
            try:
                # The actual screenshot - this always base64 and needs decoding! horrible! huge CPU usage
                self.screenshot = self.page.screenshot(type='jpeg',
                                                       full_page=True,
                                                       quality=int(os.getenv("SCREENSHOT_QUALITY", 72)),
                                                       )
            except Exception as e:
                # It's likely the screenshot was too long/big and something crashed
                raise ScreenshotUnavailable(url=url, status_code=self.status_code)
            finally:
                context.close()
                browser.close()
 | 
			
		||||
@@ -1,247 +0,0 @@
 | 
			
		||||
import asyncio
 | 
			
		||||
import json
 | 
			
		||||
import os
 | 
			
		||||
import websockets.exceptions
 | 
			
		||||
from urllib.parse import urlparse
 | 
			
		||||
 | 
			
		||||
from loguru import logger
 | 
			
		||||
 | 
			
		||||
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
 | 
			
		||||
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class fetcher(Fetcher):
    """Content fetcher that talks to a remote browser directly through pyppeteer.

    The browser endpoint is ``custom_browser_connection_url`` when given,
    otherwise the ``PLAYWRIGHT_DRIVER_URL`` environment variable. Browser steps
    are not run by this fetcher (see the notes in ``fetch_page``).
    """

    fetcher_description = "Puppeteer/direct {}/Javascript".format(
        os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
    )
    if os.getenv("PLAYWRIGHT_DRIVER_URL"):
        fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL"))

    browser_type = ''
    command_executor = ''

    proxy = None

    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        """Resolve the browser connection URL and, when given, the per-watch proxy.

        :param proxy_override: proxy URL; its credentials are kept in ``self.proxy`` and the
            server part is appended to the connection URL as ``--proxy-server=``.
        :param custom_browser_connection_url: browser endpoint to use instead of ``PLAYWRIGHT_DRIVER_URL``.
        """
        super().__init__()

        if custom_browser_connection_url:
            self.browser_connection_is_custom = True
            self.browser_connection_url = custom_browser_connection_url
        else:
            # Fallback to fetching from system
            # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value
            self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"')

        # allow per-watch proxy selection override
        # @todo check global too?
        if proxy_override:
            # Playwright needs separate username and password values
            parsed = urlparse(proxy_override)
            if parsed:
                self.proxy = {'username': parsed.username, 'password': parsed.password}
                # Add the proxy server chrome start option, the username and password never gets added here
                # (It always goes in via await self.page.authenticate(self.proxy))

                # @todo filter some injection attack?
                # check scheme when no scheme
                proxy_url = parsed.scheme + "://" if parsed.scheme else 'http://'
                r = "?" if not '?' in self.browser_connection_url else '&'
                port = ":" + str(parsed.port) if parsed.port else ''
                q = "?" + parsed.query if parsed.query else ''
                proxy_url += f"{parsed.hostname}{port}{parsed.path}{q}"
                self.browser_connection_url += f"{r}--proxy-server={proxy_url}"

    # screenshot_step() / save_step_html() are intentionally not overridden here:
    # browser steps are not yet used by this fetcher, we fallback to playwright when browsersteps is required

    async def fetch_page(self,
                         url,
                         timeout,
                         request_headers,
                         request_body,
                         request_method,
                         ignore_status_codes,
                         current_include_filters,
                         is_binary
                         ):
        """Connect to the browser, load ``url`` and store the results on ``self``.

        On success this sets ``headers``, ``status_code``, ``xpath_data``,
        ``instock_data``, ``content`` and (when it can be taken) ``screenshot``.

        :raises BrowserConnectError: the browser websocket could not be reached.
        :raises EmptyReply: no response object was returned, or the page content is empty.
        :raises PageUnloadable: the custom JS failed or the response had no status.
        :raises Non200ErrorCodeReceived: the status is not 200 and ``ignore_status_codes`` is false.
        """
        from changedetectionio.content_fetchers import visualselector_xpath_selectors
        self.delete_browser_steps_screenshots()
        extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay

        from pyppeteer import Pyppeteer
        pyppeteer_instance = Pyppeteer()

        # Connect directly using the specified browser_ws_endpoint
        # @todo timeout
        try:
            browser = await pyppeteer_instance.connect(browserWSEndpoint=self.browser_connection_url,
                                                       ignoreHTTPSErrors=True
                                                       )
        except websockets.exceptions.InvalidStatusCode as e:
            raise BrowserConnectError(msg=f"Error while trying to connect the browser, Code {e.status_code} (check your access)")
        except websockets.exceptions.InvalidURI:
            raise BrowserConnectError(msg=f"Error connecting to the browser, check your browser connection address (should be ws:// or wss://")
        except Exception as e:
            raise BrowserConnectError(msg=f"Error connecting to the browser {str(e)}")
        else:
            self.page = await browser.newPage()

        await self.page.setUserAgent(manage_user_agent(headers=request_headers, current_ua=await self.page.evaluate('navigator.userAgent')))

        await self.page.setBypassCSP(True)
        if request_headers:
            await self.page.setExtraHTTPHeaders(request_headers)

        # SOCKS5 with authentication is not supported (yet)
        # https://github.com/microsoft/playwright/issues/10567
        self.page.setDefaultNavigationTimeout(0)
        await self.page.setCacheEnabled(True)
        if self.proxy and self.proxy.get('username'):
            # Setting Proxy-Authentication header is deprecated, and doing so can trigger header change errors from Puppeteer
            # https://github.com/puppeteer/puppeteer/issues/676 ?
            # https://help.brightdata.com/hc/en-us/articles/12632549957649-Proxy-Manager-How-to-Guides#h_01HAKWR4Q0AFS8RZTNYWRDFJC2
            # https://cri.dev/posts/2020-03-30-How-to-solve-Puppeteer-Chrome-Error-ERR_INVALID_ARGUMENT/
            await self.page.authenticate(self.proxy)

        # Re-use as much code from browser steps as possible so its the same
        # (steppable_browser_interface is not yet used here, we fallback to playwright when browsersteps is required)

        response = await self.page.goto(url, waitUntil="load")

        if response is None:
            await self.page.close()
            await browser.close()
            logger.warning("Content Fetcher > Response object was none")
            raise EmptyReply(url=url, status_code=None)

        self.headers = response.headers

        try:
            if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
                await self.page.evaluate(self.webdriver_js_execute_code)
        except Exception as e:
            logger.warning("Got exception when running evaluate on custom JS code")
            logger.error(str(e))
            await self.page.close()
            await browser.close()
            # Any failure in the custom JS aborts the fetch
            raise PageUnloadable(url=url, status_code=None, message=str(e))

        try:
            self.status_code = response.status
        except Exception as e:
            # https://github.com/dgtlmoon/changedetection.io/discussions/2122#discussioncomment-8241962
            logger.critical("Response from the browser/Playwright did not have a status_code! Response follows.")
            logger.critical(response)
            await self.page.close()
            await browser.close()
            raise PageUnloadable(url=url, status_code=None, message=str(e))

        if self.status_code != 200 and not ignore_status_codes:
            screenshot = await self.page.screenshot(type_='jpeg',
                                                    fullPage=True,
                                                    quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
            # Release the page and browser like every other error path, once the screenshot is captured
            await self.page.close()
            await browser.close()
            raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)

        content = await self.page.content
        if len(content.strip()) == 0:
            await self.page.close()
            await browser.close()
            logger.error("Content Fetcher > Content was empty")
            raise EmptyReply(url=url, status_code=response.status)

        # Run Browser Steps here
        # @todo not yet supported, we switch to playwright in this case
        #            if self.browser_steps_get_valid_steps():
        #                self.iterate_browser_steps()

        await asyncio.sleep(1 + extra_wait)

        # So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
        # Setup the xPath/VisualSelector scraper
        if current_include_filters is not None:
            js = json.dumps(current_include_filters)
            await self.page.evaluate(f"var include_filters={js}")
        else:
            await self.page.evaluate("var include_filters=''")

        self.xpath_data = await self.page.evaluate(
            "async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}")
        self.instock_data = await self.page.evaluate("async () => {" + self.instock_data_js + "}")

        self.content = await self.page.content
        # Bug 3 in Playwright screenshot handling
        # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
        # JPEG is better here because the screenshots can be very very large

        # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
        # which will significantly increase the IO size between the server and client, it's recommended to use the lowest
        # acceptable screenshot quality here
        try:
            self.screenshot = await self.page.screenshot(type_='jpeg',
                                                         fullPage=True,
                                                         quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
        except Exception as e:
            logger.error("Error fetching screenshot")
            # // May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw'
            # // @ todo after text extract, we can place some overlay text with red background to say 'croppped'
            logger.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot')
            try:
                self.screenshot = await self.page.screenshot(type_='jpeg',
                                                             fullPage=False,
                                                             quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
            except Exception as e:
                # Best effort: the fetch still succeeds without a screenshot
                logger.error('ERROR: Failed to get viewport-only reduced screenshot :(')
        finally:
            # It's good to log here in the case that the browser crashes on shutting down but we still get the data we need
            logger.success(f"Fetching '{url}' complete, closing page")
            await self.page.close()
            logger.success(f"Fetching '{url}' complete, closing browser")
            await browser.close()
        logger.success(f"Fetching '{url}' complete, exiting puppeteer fetch.")

    async def main(self, **kwargs):
        """Coroutine entry point; forwards every keyword argument to ``fetch_page``."""
        await self.fetch_page(**kwargs)

    def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False,
            current_include_filters=None, is_binary=False):
        """Run ``fetch_page`` to completion on a fresh event loop, bounded by a processing timeout.

        The limit is ``PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS`` (default 180 seconds).

        :raises BrowserFetchTimedOut: the fetch did not finish within the limit.
        """
        # @todo make update_worker async which could run any of these content_fetchers within memory and time constraints
        # os.getenv() returns a string whenever the variable is set, so convert before using it as a timeout
        max_time = int(os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180))

        # This will work in 3.10 but not >= 3.11 because 3.11 wants tasks only
        try:
            asyncio.run(asyncio.wait_for(self.main(
                url=url,
                timeout=timeout,
                request_headers=request_headers,
                request_body=request_body,
                request_method=request_method,
                ignore_status_codes=ignore_status_codes,
                current_include_filters=current_include_filters,
                is_binary=is_binary
            ), timeout=max_time))
        except asyncio.TimeoutError:
            raise BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds.")
 | 
			
		||||
 | 
			
		||||
@@ -1,91 +0,0 @@
 | 
			
		||||
import hashlib
 | 
			
		||||
import os
 | 
			
		||||
 | 
			
		||||
import chardet
 | 
			
		||||
import requests
 | 
			
		||||
 | 
			
		||||
from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
 | 
			
		||||
from changedetectionio.content_fetchers.base import Fetcher
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# "html_requests" is listed as the default fetcher in store.py!
 | 
			
		||||
class fetcher(Fetcher):
    """Plain HTTP fetcher built on `requests`; no browser is involved.

    After a successful `run()` the instance exposes `headers`, `status_code`,
    `content` (decoded text, or an MD5 hex digest for binary responses) and
    `raw_content` (the undecoded response bytes).
    """

    fetcher_description = "Basic fast Plaintext/HTTP Client"

    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        """Remember the optional per-request proxy.

        `custom_browser_connection_url` is accepted but not used by this fetcher.
        """
        super().__init__()
        self.proxy_override = proxy_override
        # browser_connection_url is none because its always 'launched locally'

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """Perform the HTTP request and store the outcome on the instance.

        Raises:
            BrowserStepsInUnsupportedFetcher: browser steps are configured for this fetch.
            EmptyReply: the response body is empty.
            Non200ErrorCodeReceived: the status is not 200 and `ignore_status_codes` is false.
        """
        # Browser steps need a real browser, which this fetcher does not have
        if self.browser_steps_get_valid_steps():
            raise BrowserStepsInUnsupportedFetcher(url=url)

        # Make requests use a more modern looking user-agent.
        # Header names are compared case-insensitively; the last matching entry wins.
        supplied_agent = None
        for header_name, header_value in request_headers.items():
            if header_name.lower() == 'user-agent':
                supplied_agent = header_value
        if not supplied_agent:
            # Note: this writes into the dict the caller passed in
            request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT",
                                                      'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36')

        # Allows override the proxy on a per-request basis
        # https://requests.readthedocs.io/en/latest/user/advanced/#socks
        # Should also work with `socks5://user:pass@host:port` type syntax.
        if self.proxy_override:
            proxies = dict.fromkeys(('http', 'https', 'ftp'), self.proxy_override)
        else:
            proxies = {}
            if self.system_http_proxy:
                proxies['http'] = self.system_http_proxy
            if self.system_https_proxy:
                proxies['https'] = self.system_https_proxy

        r = requests.request(method=request_method,
                             data=request_body,
                             url=url,
                             headers=request_headers,
                             timeout=timeout,
                             proxies=proxies,
                             verify=False)

        # When the response does not declare a charset, let chardet decide instead of trusting
        # what `requests` guessed - some sites return utf-8 content without saying so.
        # https://github.com/psf/requests/issues/1604 good info about requests encoding detection
        # Skipped for binary content (PDF etc.) because detection takes a _long_ time there.
        if not is_binary:
            declared_type = r.headers.get('content-type')
            if not declared_type or 'charset=' not in declared_type:
                detected = chardet.detect(r.content)['encoding']
                if detected:
                    r.encoding = detected

        self.headers = r.headers

        body = r.content
        if not body:
            raise EmptyReply(url=url, status_code=r.status_code)

        # @todo test this
        # @todo maybe you really want to test zero-byte return pages?
        if not ignore_status_codes and r.status_code != 200:
            # maybe check with content works?
            raise Non200ErrorCodeReceived(url=url, status_code=r.status_code, page_html=r.text)

        self.status_code = r.status_code

        # Binary files just return their checksum until we add something smarter
        self.content = hashlib.md5(r.content).hexdigest() if is_binary else r.text

        self.raw_content = r.content
 | 
			
		||||
@@ -1,119 +0,0 @@
 | 
			
		||||
import os
 | 
			
		||||
import time
 | 
			
		||||
 | 
			
		||||
from loguru import logger
 | 
			
		||||
from changedetectionio.content_fetchers.base import Fetcher
 | 
			
		||||
 | 
			
		||||
class fetcher(Fetcher):
    """Fetcher that renders pages in a remote Chrome session through Selenium WebDriver.

    The WebDriver endpoint comes from the WEBDRIVER_URL environment variable unless a
    custom connection URL is passed to the constructor. After `run()` the instance
    exposes `content` (page source), `screenshot` (PNG bytes), `headers` (always empty)
    and `status_code` (always 200, the real status is not available here).
    """

    if os.getenv("WEBDRIVER_URL"):
        fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
    else:
        fetcher_description = "WebDriver Chrome/Javascript"

    # Configs for Proxy setup
    # In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
    selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
                                        'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
                                        'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
    proxy = None

    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        """Resolve the WebDriver endpoint and build the Selenium proxy configuration.

        Args:
            proxy_override: when not None, used as the HTTP proxy for this fetcher,
                replacing any value from the environment.
            custom_browser_connection_url: when truthy, used instead of WEBDRIVER_URL
                and `browser_connection_is_custom` is set to True.
        """
        super().__init__()
        from selenium.webdriver.common.proxy import Proxy as SeleniumProxy

        # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value
        if not custom_browser_connection_url:
            self.browser_connection_url = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')
        else:
            self.browser_connection_is_custom = True
            self.browser_connection_url = custom_browser_connection_url

        # If any proxy settings are enabled, then we should setup the proxy object.
        # Keys in proxy_args are the bare Selenium names (no "webdriver_" prefix).
        proxy_args = {}
        for k in self.selenium_proxy_settings_mappings:
            v = os.getenv('webdriver_' + k, False)
            if v:
                proxy_args[k] = v.strip('"')

        # Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy,
        # but only when the webdriver_* variable did not already provide a value.
        if not proxy_args.get('httpProxy') and self.system_http_proxy:
            proxy_args['httpProxy'] = self.system_http_proxy
        if not proxy_args.get('sslProxy') and self.system_https_proxy:
            proxy_args['sslProxy'] = self.system_https_proxy

        # Allows override the proxy on a per-request basis
        if proxy_override is not None:
            proxy_args['httpProxy'] = proxy_override

        if proxy_args:
            self.proxy = SeleniumProxy(raw=proxy_args)

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """Open `url` in a new remote Chrome session and capture page source and screenshot.

        Only `url` is used; the remaining parameters are accepted for signature
        compatibility with the other fetchers. The session is left open on success
        (call `quit()` to close it). If the initial navigation raises
        WebDriverException the session is closed and the exception re-raised.
        """

        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options as ChromeOptions
        from selenium.common.exceptions import WebDriverException
        # request_body, request_method unused for now, until some magic in the future happens.

        options = ChromeOptions()
        if self.proxy:
            options.proxy = self.proxy

        self.driver = webdriver.Remote(
            command_executor=self.browser_connection_url,
            options=options)

        try:
            self.driver.get(url)
        except WebDriverException:
            # Be sure we close the session window
            self.quit()
            raise

        # Seconds to wait for content; read once and reused for every wait below
        ready_delay = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))

        self.driver.set_window_size(1280, 1024)
        self.driver.implicitly_wait(ready_delay)

        if self.webdriver_js_execute_code is not None:
            self.driver.execute_script(self.webdriver_js_execute_code)
            # Selenium doesn't automatically wait for actions as good as Playwright, so wait again
            self.driver.implicitly_wait(ready_delay)

        # @todo - how to check this? is it possible?
        self.status_code = 200
        # @todo somehow we should try to get this working for WebDriver
        # raise EmptyReply(url=url, status_code=r.status_code)

        # @todo - dom wait loaded?
        time.sleep(ready_delay + self.render_extract_delay)
        self.content = self.driver.page_source
        self.headers = {}

        self.screenshot = self.driver.get_screenshot_as_png()

    # Does the connection to the webdriver work? run a test connection.
    def is_ready(self):
        """Open and immediately close a session against the configured endpoint.

        Returns True when the session could be created; any exception raised while
        connecting propagates to the caller.
        """
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options as ChromeOptions

        # Use the same endpoint attribute as run(); this class never sets `command_executor`
        self.driver = webdriver.Remote(
            command_executor=self.browser_connection_url,
            options=ChromeOptions())

        # driver.quit() seems to cause better exceptions
        self.quit()
        return True

    def quit(self):
        """Close the browser session if one exists; shutdown errors are logged, not raised."""
        # Tolerate the `driver` attribute not having been set on this instance
        driver = getattr(self, 'driver', None)
        if driver:
            try:
                driver.quit()
            except Exception as e:
                logger.debug(f"Content Fetcher > Exception in chrome shutdown/quit {str(e)}")
 | 
			
		||||
@@ -1,19 +1,25 @@
 | 
			
		||||
#!/usr/bin/python3
 | 
			
		||||
 | 
			
		||||
from changedetectionio import queuedWatchMetaData
 | 
			
		||||
from copy import deepcopy
 | 
			
		||||
from distutils.util import strtobool
 | 
			
		||||
from feedgen.feed import FeedGenerator
 | 
			
		||||
from flask_compress import Compress as FlaskCompress
 | 
			
		||||
from flask_login import current_user
 | 
			
		||||
from flask_restful import abort, Api
 | 
			
		||||
from flask_wtf import CSRFProtect
 | 
			
		||||
from functools import wraps
 | 
			
		||||
from threading import Event
 | 
			
		||||
import datetime
 | 
			
		||||
import flask_login
 | 
			
		||||
from loguru import logger
 | 
			
		||||
import os
 | 
			
		||||
import pytz
 | 
			
		||||
import queue
 | 
			
		||||
import threading
 | 
			
		||||
import time
 | 
			
		||||
from copy import deepcopy
 | 
			
		||||
from changedetectionio.strtobool import strtobool
 | 
			
		||||
from functools import wraps
 | 
			
		||||
from threading import Event
 | 
			
		||||
 | 
			
		||||
import flask_login
 | 
			
		||||
import pytz
 | 
			
		||||
import timeago
 | 
			
		||||
from feedgen.feed import FeedGenerator
 | 
			
		||||
 | 
			
		||||
from flask import (
 | 
			
		||||
    Flask,
 | 
			
		||||
    abort,
 | 
			
		||||
@@ -26,16 +32,10 @@ from flask import (
 | 
			
		||||
    session,
 | 
			
		||||
    url_for,
 | 
			
		||||
)
 | 
			
		||||
from flask_compress import Compress as FlaskCompress
 | 
			
		||||
from flask_login import current_user
 | 
			
		||||
 | 
			
		||||
from flask_paginate import Pagination, get_page_parameter
 | 
			
		||||
from flask_restful import abort, Api
 | 
			
		||||
from flask_cors import CORS
 | 
			
		||||
from flask_wtf import CSRFProtect
 | 
			
		||||
from loguru import logger
 | 
			
		||||
 | 
			
		||||
from changedetectionio import html_tools, __version__
 | 
			
		||||
from changedetectionio import queuedWatchMetaData
 | 
			
		||||
from changedetectionio.api import api_v1
 | 
			
		||||
 | 
			
		||||
datastore = None
 | 
			
		||||
@@ -54,9 +54,6 @@ app = Flask(__name__,
 | 
			
		||||
            static_folder="static",
 | 
			
		||||
            template_folder="templates")
 | 
			
		||||
 | 
			
		||||
# Enable CORS, especially useful for the Chrome extension to operate from anywhere
 | 
			
		||||
CORS(app)
 | 
			
		||||
 | 
			
		||||
# Super handy for compressing large BrowserSteps responses and others
 | 
			
		||||
FlaskCompress(app)
 | 
			
		||||
 | 
			
		||||
@@ -408,21 +405,17 @@ def changedetection_app(config=None, datastore_o=None):
 | 
			
		||||
        global datastore
 | 
			
		||||
        from changedetectionio import forms
 | 
			
		||||
 | 
			
		||||
        active_tag_req = request.args.get('tag', '').lower().strip()
 | 
			
		||||
        active_tag_uuid = active_tag = None
 | 
			
		||||
        limit_tag = request.args.get('tag', '').lower().strip()
 | 
			
		||||
 | 
			
		||||
        # Be sure limit_tag is a uuid
 | 
			
		||||
        if active_tag_req:
 | 
			
		||||
            for uuid, tag in datastore.data['settings']['application'].get('tags', {}).items():
 | 
			
		||||
                if active_tag_req == tag.get('title', '').lower().strip() or active_tag_req == uuid:
 | 
			
		||||
                    active_tag = tag
 | 
			
		||||
                    active_tag_uuid = uuid
 | 
			
		||||
                    break
 | 
			
		||||
        for uuid, tag in datastore.data['settings']['application'].get('tags', {}).items():
 | 
			
		||||
            if limit_tag == tag.get('title', '').lower().strip():
 | 
			
		||||
                limit_tag = uuid
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        # Redirect for the old rss path which used the /?rss=true
 | 
			
		||||
        if request.args.get('rss'):
 | 
			
		||||
            return redirect(url_for('rss', tag=active_tag_uuid))
 | 
			
		||||
            return redirect(url_for('rss', tag=limit_tag))
 | 
			
		||||
 | 
			
		||||
        op = request.args.get('op')
 | 
			
		||||
        if op:
 | 
			
		||||
@@ -433,7 +426,7 @@ def changedetection_app(config=None, datastore_o=None):
 | 
			
		||||
                datastore.data['watching'][uuid].toggle_mute()
 | 
			
		||||
 | 
			
		||||
            datastore.needs_write = True
 | 
			
		||||
            return redirect(url_for('index', tag = active_tag_uuid))
 | 
			
		||||
            return redirect(url_for('index', tag = limit_tag))
 | 
			
		||||
 | 
			
		||||
        # Sort by last_changed and add the uuid which is usually the key..
 | 
			
		||||
        sorted_watches = []
 | 
			
		||||
@@ -444,7 +437,7 @@ def changedetection_app(config=None, datastore_o=None):
 | 
			
		||||
            if with_errors and not watch.get('last_error'):
 | 
			
		||||
                continue
 | 
			
		||||
 | 
			
		||||
            if active_tag_uuid and not active_tag_uuid in watch['tags']:
 | 
			
		||||
            if limit_tag and not limit_tag in watch['tags']:
 | 
			
		||||
                    continue
 | 
			
		||||
            if watch.get('last_error'):
 | 
			
		||||
                errored_count += 1
 | 
			
		||||
@@ -463,12 +456,11 @@ def changedetection_app(config=None, datastore_o=None):
 | 
			
		||||
                                total=total_count,
 | 
			
		||||
                                per_page=datastore.data['settings']['application'].get('pager_size', 50), css_framework="semantic")
 | 
			
		||||
 | 
			
		||||
        sorted_tags = sorted(datastore.data['settings']['application'].get('tags').items(), key=lambda x: x[1]['title'])
 | 
			
		||||
 | 
			
		||||
        output = render_template(
 | 
			
		||||
            "watch-overview.html",
 | 
			
		||||
                                 # Don't link to hosting when we're on the hosting environment
 | 
			
		||||
                                 active_tag=active_tag,
 | 
			
		||||
                                 active_tag_uuid=active_tag_uuid,
 | 
			
		||||
                                 active_tag=limit_tag,
 | 
			
		||||
                                 app_rss_token=datastore.data['settings']['application']['rss_access_token'],
 | 
			
		||||
                                 datastore=datastore,
 | 
			
		||||
                                 errored_count=errored_count,
 | 
			
		||||
@@ -483,7 +475,7 @@ def changedetection_app(config=None, datastore_o=None):
 | 
			
		||||
                                 sort_attribute=request.args.get('sort') if request.args.get('sort') else request.cookies.get('sort'),
 | 
			
		||||
                                 sort_order=request.args.get('order') if request.args.get('order') else request.cookies.get('order'),
 | 
			
		||||
                                 system_default_fetcher=datastore.data['settings']['application'].get('fetch_backend'),
 | 
			
		||||
                                 tags=sorted_tags,
 | 
			
		||||
                                 tags=datastore.data['settings']['application'].get('tags'),
 | 
			
		||||
                                 watches=sorted_watches
 | 
			
		||||
                                 )
 | 
			
		||||
 | 
			
		||||
@@ -516,38 +508,21 @@ def changedetection_app(config=None, datastore_o=None):
 | 
			
		||||
 | 
			
		||||
        watch = datastore.data['watching'].get(watch_uuid) if watch_uuid else None
 | 
			
		||||
 | 
			
		||||
        notification_urls = request.form['notification_urls'].strip().splitlines()
 | 
			
		||||
        # validate URLS
 | 
			
		||||
        if not len(request.form['notification_urls'].strip()):
 | 
			
		||||
            return make_response({'error': 'No Notification URLs set'}, 400)
 | 
			
		||||
 | 
			
		||||
        if not notification_urls:
 | 
			
		||||
            logger.debug("Test notification - Trying by group/tag in the edit form if available")
 | 
			
		||||
            # On an edit page, we should also fire off to the tags if they have notifications
 | 
			
		||||
            if request.form.get('tags') and request.form['tags'].strip():
 | 
			
		||||
                for k in request.form['tags'].split(','):
 | 
			
		||||
                    tag = datastore.tag_exists_by_name(k.strip())
 | 
			
		||||
                    notification_urls = tag.get('notifications_urls') if tag and tag.get('notifications_urls') else None
 | 
			
		||||
 | 
			
		||||
        is_global_settings_form = request.args.get('mode', '') == 'global-settings'
 | 
			
		||||
        is_group_settings_form = request.args.get('mode', '') == 'group-settings'
 | 
			
		||||
        if not notification_urls and not is_global_settings_form and not is_group_settings_form:
 | 
			
		||||
            # In the global settings, use only what is typed currently in the text box
 | 
			
		||||
            logger.debug("Test notification - Trying by global system settings notifications")
 | 
			
		||||
            if datastore.data['settings']['application'].get('notification_urls'):
 | 
			
		||||
                notification_urls = datastore.data['settings']['application']['notification_urls']
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        if not notification_urls:
 | 
			
		||||
            return 'No Notification URLs set/found'
 | 
			
		||||
 | 
			
		||||
        for n_url in notification_urls:
 | 
			
		||||
            if len(n_url.strip()):
 | 
			
		||||
                if not apobj.add(n_url):
 | 
			
		||||
                    return f'Error - {n_url} is not a valid AppRise URL.'
 | 
			
		||||
        for server_url in request.form['notification_urls'].splitlines():
 | 
			
		||||
            if len(server_url.strip()):
 | 
			
		||||
                if not apobj.add(server_url):
 | 
			
		||||
                    message = '{} is not a valid AppRise URL.'.format(server_url)
 | 
			
		||||
                    return make_response({'error': message}, 400)
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            # use the same as when it is triggered, but then override it with the form test values
 | 
			
		||||
            n_object = {
 | 
			
		||||
                'watch_url': request.form['window_url'],
 | 
			
		||||
                'notification_urls': notification_urls
 | 
			
		||||
                'notification_urls': request.form['notification_urls'].splitlines()
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            # Only use if present, if not set in n_object it should use the default system value
 | 
			
		||||
@@ -566,7 +541,7 @@ def changedetection_app(config=None, datastore_o=None):
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            return make_response({'error': str(e)}, 400)
 | 
			
		||||
 | 
			
		||||
        return 'OK - Sent test notifications'
 | 
			
		||||
        return 'OK'
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    @app.route("/clear_history/<string:uuid>", methods=['GET'])
 | 
			
		||||
@@ -603,12 +578,6 @@ def changedetection_app(config=None, datastore_o=None):
 | 
			
		||||
        output = render_template("clear_all_history.html")
 | 
			
		||||
        return output
 | 
			
		||||
 | 
			
		||||
    def _watch_has_tag_options_set(watch):
        """Return True when any tag attached to `watch` defines `include_filters` or `subtractive_selectors`.

        Falls through (returning None) when no attached tag has such options.
        This should be fixed better so that Tag is some proper Model, a tag is just a Watch also.
        """
        known_tags = datastore.data['settings']['application'].get('tags', {})
        for tag_uuid, tag in known_tags.items():
            # Only tags actually attached to this watch are of interest
            if tag_uuid not in watch.get('tags', []):
                continue
            if tag.get('include_filters') or tag.get('subtractive_selectors'):
                return True
 | 
			
		||||
 | 
			
		||||
    @app.route("/edit/<string:uuid>", methods=['GET', 'POST'])
 | 
			
		||||
    @login_optionally_required
 | 
			
		||||
    # https://stackoverflow.com/questions/42984453/wtforms-populate-form-with-data-if-data-exists
 | 
			
		||||
@@ -779,7 +748,6 @@ def changedetection_app(config=None, datastore_o=None):
 | 
			
		||||
                                     has_default_notification_urls=True if len(datastore.data['settings']['application']['notification_urls']) else False,
 | 
			
		||||
                                     has_empty_checktime=using_default_check_time,
 | 
			
		||||
                                     has_extra_headers_file=len(datastore.get_all_headers_in_textfile_for_watch(uuid=uuid)) > 0,
 | 
			
		||||
                                     has_special_tag_options=_watch_has_tag_options_set(watch=watch),
 | 
			
		||||
                                     is_html_webdriver=is_html_webdriver,
 | 
			
		||||
                                     jq_support=jq_support,
 | 
			
		||||
                                     playwright_enabled=os.getenv('PLAYWRIGHT_DRIVER_URL', False),
 | 
			
		||||
@@ -795,7 +763,7 @@ def changedetection_app(config=None, datastore_o=None):
 | 
			
		||||
    @app.route("/settings", methods=['GET', "POST"])
 | 
			
		||||
    @login_optionally_required
 | 
			
		||||
    def settings_page():
 | 
			
		||||
        from changedetectionio import forms
 | 
			
		||||
        from changedetectionio import content_fetcher, forms
 | 
			
		||||
 | 
			
		||||
        default = deepcopy(datastore.data['settings'])
 | 
			
		||||
        if datastore.proxy_list is not None:
 | 
			
		||||
@@ -1303,8 +1271,9 @@ def changedetection_app(config=None, datastore_o=None):
 | 
			
		||||
 | 
			
		||||
        url = request.form.get('url').strip()
 | 
			
		||||
        if datastore.url_exists(url):
 | 
			
		||||
            flash(f'Warning, URL {url} already exists', "notice")
 | 
			
		||||
            
 | 
			
		||||
            flash('The URL {} already exists'.format(url), "error")
 | 
			
		||||
            return redirect(url_for('index'))
 | 
			
		||||
 | 
			
		||||
        add_paused = request.form.get('edit_and_watch_submit_button') != None
 | 
			
		||||
        processor = request.form.get('processor', 'text_json_diff')
 | 
			
		||||
        new_uuid = datastore.add_watch(url=url, tag=request.form.get('tags').strip(), extras={'paused': add_paused, 'processor': processor})
 | 
			
		||||
@@ -1454,13 +1423,6 @@ def changedetection_app(config=None, datastore_o=None):
 | 
			
		||||
                    update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
 | 
			
		||||
            flash("{} watches queued for rechecking".format(len(uuids)))
 | 
			
		||||
 | 
			
		||||
        elif (op == 'clear-errors'):
 | 
			
		||||
            for uuid in uuids:
 | 
			
		||||
                uuid = uuid.strip()
 | 
			
		||||
                if datastore.data['watching'].get(uuid):
 | 
			
		||||
                    datastore.data['watching'][uuid]["last_error"] = False
 | 
			
		||||
            flash(f"{len(uuids)} watches errors cleared")
 | 
			
		||||
 | 
			
		||||
        elif (op == 'clear-history'):
 | 
			
		||||
            for uuid in uuids:
 | 
			
		||||
                uuid = uuid.strip()
 | 
			
		||||
 
 | 
			
		||||
@@ -1,6 +1,6 @@
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
from changedetectionio.strtobool import strtobool
 | 
			
		||||
from distutils.util import strtobool
 | 
			
		||||
 | 
			
		||||
from wtforms import (
 | 
			
		||||
    BooleanField,
 | 
			
		||||
@@ -27,7 +27,7 @@ from validators.url import url as url_validator
 | 
			
		||||
# each select <option data-enabled="enabled-0-0"
 | 
			
		||||
from changedetectionio.blueprint.browser_steps.browser_steps import browser_step_ui_config
 | 
			
		||||
 | 
			
		||||
from changedetectionio import html_tools, content_fetchers
 | 
			
		||||
from changedetectionio import content_fetcher, html_tools
 | 
			
		||||
 | 
			
		||||
from changedetectionio.notification import (
 | 
			
		||||
    valid_notification_formats,
 | 
			
		||||
@@ -167,31 +167,33 @@ class ValidateContentFetcherIsReady(object):
 | 
			
		||||
        self.message = message
 | 
			
		||||
 | 
			
		||||
    def __call__(self, form, field):
 | 
			
		||||
        import urllib3.exceptions
 | 
			
		||||
        from changedetectionio import content_fetcher
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
# AttributeError: module 'changedetectionio.content_fetcher' has no attribute 'extra_browser_unlocked<>ASDF213r123r'
 | 
			
		||||
        # Better would be a radiohandler that keeps a reference to each class
 | 
			
		||||
        # if field.data is not None and field.data != 'system':
 | 
			
		||||
        #     klass = getattr(content_fetcher, field.data)
 | 
			
		||||
        #     some_object = klass()
 | 
			
		||||
        #     try:
 | 
			
		||||
        #         ready = some_object.is_ready()
 | 
			
		||||
        #
 | 
			
		||||
        #     except urllib3.exceptions.MaxRetryError as e:
 | 
			
		||||
        #         driver_url = some_object.command_executor
 | 
			
		||||
        #         message = field.gettext('Content fetcher \'%s\' did not respond.' % (field.data))
 | 
			
		||||
        #         message += '<br>' + field.gettext(
 | 
			
		||||
        #             'Be sure that the selenium/webdriver runner is running and accessible via network from this container/host.')
 | 
			
		||||
        #         message += '<br>' + field.gettext('Did you follow the instructions in the wiki?')
 | 
			
		||||
        #         message += '<br><br>' + field.gettext('WebDriver Host: %s' % (driver_url))
 | 
			
		||||
        #         message += '<br><a href="https://github.com/dgtlmoon/changedetection.io/wiki/Fetching-pages-with-WebDriver">Go here for more information</a>'
 | 
			
		||||
        #         message += '<br>'+field.gettext('Content fetcher did not respond properly, unable to use it.\n %s' % (str(e)))
 | 
			
		||||
        #
 | 
			
		||||
        #         raise ValidationError(message)
 | 
			
		||||
        #
 | 
			
		||||
        #     except Exception as e:
 | 
			
		||||
        #         message = field.gettext('Content fetcher \'%s\' did not respond properly, unable to use it.\n %s')
 | 
			
		||||
        #         raise ValidationError(message % (field.data, e))
 | 
			
		||||
        if field.data is not None and field.data != 'system':
 | 
			
		||||
            klass = getattr(content_fetcher, field.data)
 | 
			
		||||
            some_object = klass()
 | 
			
		||||
            try:
 | 
			
		||||
                ready = some_object.is_ready()
 | 
			
		||||
 | 
			
		||||
            except urllib3.exceptions.MaxRetryError as e:
 | 
			
		||||
                driver_url = some_object.command_executor
 | 
			
		||||
                message = field.gettext('Content fetcher \'%s\' did not respond.' % (field.data))
 | 
			
		||||
                message += '<br>' + field.gettext(
 | 
			
		||||
                    'Be sure that the selenium/webdriver runner is running and accessible via network from this container/host.')
 | 
			
		||||
                message += '<br>' + field.gettext('Did you follow the instructions in the wiki?')
 | 
			
		||||
                message += '<br><br>' + field.gettext('WebDriver Host: %s' % (driver_url))
 | 
			
		||||
                message += '<br><a href="https://github.com/dgtlmoon/changedetection.io/wiki/Fetching-pages-with-WebDriver">Go here for more information</a>'
 | 
			
		||||
                message += '<br>'+field.gettext('Content fetcher did not respond properly, unable to use it.\n %s' % (str(e)))
 | 
			
		||||
 | 
			
		||||
                raise ValidationError(message)
 | 
			
		||||
 | 
			
		||||
            except Exception as e:
 | 
			
		||||
                message = field.gettext('Content fetcher \'%s\' did not respond properly, unable to use it.\n %s')
 | 
			
		||||
                raise ValidationError(message % (field.data, e))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ValidateNotificationBodyAndTitleWhenURLisSet(object):
 | 
			
		||||
@@ -419,7 +421,7 @@ class commonSettingsForm(Form):
 | 
			
		||||
    notification_title = StringField('Notification Title', default='ChangeDetection.io Notification - {{ watch_url }}', validators=[validators.Optional(), ValidateJinja2Template()])
 | 
			
		||||
    notification_body = TextAreaField('Notification Body', default='{{ watch_url }} had a change.', validators=[validators.Optional(), ValidateJinja2Template()])
 | 
			
		||||
    notification_format = SelectField('Notification format', choices=valid_notification_formats.keys())
 | 
			
		||||
    fetch_backend = RadioField(u'Fetch Method', choices=content_fetchers.available_fetchers(), validators=[ValidateContentFetcherIsReady()])
 | 
			
		||||
    fetch_backend = RadioField(u'Fetch Method', choices=content_fetcher.available_fetchers(), validators=[ValidateContentFetcherIsReady()])
 | 
			
		||||
    extract_title_as_title = BooleanField('Extract <title> from document and use as watch title', default=False)
 | 
			
		||||
    webdriver_delay = IntegerField('Wait seconds before extracting text', validators=[validators.Optional(), validators.NumberRange(min=1,
 | 
			
		||||
                                                                                                                                    message="Should contain one or more seconds")])
 | 
			
		||||
@@ -550,7 +552,7 @@ class globalSettingsApplicationForm(commonSettingsForm):
 | 
			
		||||
                           render_kw={"placeholder": os.getenv('BASE_URL', 'Not set')}
 | 
			
		||||
                           )
 | 
			
		||||
    empty_pages_are_a_change =  BooleanField('Treat empty pages as a change?', default=False)
 | 
			
		||||
    fetch_backend = RadioField('Fetch Method', default="html_requests", choices=content_fetchers.available_fetchers(), validators=[ValidateContentFetcherIsReady()])
 | 
			
		||||
    fetch_backend = RadioField('Fetch Method', default="html_requests", choices=content_fetcher.available_fetchers(), validators=[ValidateContentFetcherIsReady()])
 | 
			
		||||
    global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
 | 
			
		||||
    global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
 | 
			
		||||
    ignore_whitespace = BooleanField('Ignore whitespace')
 | 
			
		||||
 
 | 
			
		||||
@@ -169,14 +169,14 @@ def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=Fals
 | 
			
		||||
        # And where the matched result doesn't include something that will cause Inscriptis to add a newline
 | 
			
		||||
        # (This way each 'match' reliably has a new-line in the diff)
 | 
			
		||||
        # Divs are converted to 4 whitespaces by inscriptis
 | 
			
		||||
        if append_pretty_line_formatting and len(html_block) and (not hasattr(element, 'tag') or not element.tag in (['br', 'hr', 'div', 'p'])):
 | 
			
		||||
        if append_pretty_line_formatting and len(html_block) and (not hasattr( element, 'tag' ) or not element.tag in (['br', 'hr', 'div', 'p'])):
 | 
			
		||||
            html_block += TEXT_FILTER_LIST_LINE_SUFFIX
 | 
			
		||||
 | 
			
		||||
        # Some kind of text, UTF-8 or other
 | 
			
		||||
        if isinstance(element, (str, bytes)):
 | 
			
		||||
            html_block += element
 | 
			
		||||
        if type(element) == etree._ElementStringResult:
 | 
			
		||||
            html_block += str(element)
 | 
			
		||||
        elif type(element) == etree._ElementUnicodeResult:
 | 
			
		||||
            html_block += str(element)
 | 
			
		||||
        else:
 | 
			
		||||
            # Return the HTML which will get parsed as text
 | 
			
		||||
            html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
 | 
			
		||||
 | 
			
		||||
    return html_block
 | 
			
		||||
 
 | 
			
		||||
@@ -57,7 +57,7 @@ class import_url_list(Importer):
 | 
			
		||||
 | 
			
		||||
            # Flask wtform validators wont work with basic auth, use validators package
 | 
			
		||||
            # Up to 5000 per batch so we dont flood the server
 | 
			
		||||
            # @todo validators.url will fail when you add your own IP etc
 | 
			
		||||
            # @todo validators.url failed on local hostnames (such as referring to ourself when using browserless)
 | 
			
		||||
            if len(url) and 'http' in url.lower() and good < 5000:
 | 
			
		||||
                extras = None
 | 
			
		||||
                if processor:
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,4 @@
 | 
			
		||||
from changedetectionio.strtobool import strtobool
 | 
			
		||||
from distutils.util import strtobool
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
import time
 | 
			
		||||
@@ -362,7 +362,6 @@ class model(dict):
 | 
			
		||||
        # @todo bump static cache of the last timestamp so we dont need to examine the file to set a proper ''viewed'' status
 | 
			
		||||
        return snapshot_fname
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    @property
 | 
			
		||||
    def has_empty_checktime(self):
 | 
			
		||||
        # using all() + dictionary comprehension
 | 
			
		||||
 
 | 
			
		||||
@@ -2,8 +2,9 @@ from abc import abstractmethod
 | 
			
		||||
import os
 | 
			
		||||
import hashlib
 | 
			
		||||
import re
 | 
			
		||||
from changedetectionio import content_fetcher
 | 
			
		||||
from copy import deepcopy
 | 
			
		||||
from changedetectionio.strtobool import strtobool
 | 
			
		||||
from distutils.util import strtobool
 | 
			
		||||
from loguru import logger
 | 
			
		||||
 | 
			
		||||
class difference_detection_processor():
 | 
			
		||||
@@ -49,7 +50,7 @@ class difference_detection_processor():
 | 
			
		||||
            connection = list(
 | 
			
		||||
                filter(lambda s: (s['browser_name'] == key), self.datastore.data['settings']['requests'].get('extra_browsers', [])))
 | 
			
		||||
            if connection:
 | 
			
		||||
                prefer_fetch_backend = 'html_webdriver'
 | 
			
		||||
                prefer_fetch_backend = 'base_html_playwright'
 | 
			
		||||
                custom_browser_connection_url = connection[0].get('browser_connection_url')
 | 
			
		||||
 | 
			
		||||
        # PDF should be html_requests because playwright will serve it up (so far) in a embedded page
 | 
			
		||||
@@ -59,28 +60,17 @@ class difference_detection_processor():
 | 
			
		||||
           prefer_fetch_backend = "html_requests"
 | 
			
		||||
 | 
			
		||||
        # Grab the right kind of 'fetcher', (playwright, requests, etc)
 | 
			
		||||
        from changedetectionio import content_fetchers
 | 
			
		||||
        if hasattr(content_fetchers, prefer_fetch_backend):
 | 
			
		||||
            # @todo TEMPORARY HACK - SWITCH BACK TO PLAYWRIGHT FOR BROWSERSTEPS
 | 
			
		||||
            if prefer_fetch_backend == 'html_webdriver' and self.watch.has_browser_steps:
 | 
			
		||||
                # This is never supported in selenium anyway
 | 
			
		||||
                logger.warning("Using playwright fetcher override for possible puppeteer request in browsersteps, because puppetteer:browser steps is incomplete.")
 | 
			
		||||
                from changedetectionio.content_fetchers.playwright import fetcher as playwright_fetcher
 | 
			
		||||
                fetcher_obj = playwright_fetcher
 | 
			
		||||
            else:
 | 
			
		||||
                fetcher_obj = getattr(content_fetchers, prefer_fetch_backend)
 | 
			
		||||
        if hasattr(content_fetcher, prefer_fetch_backend):
 | 
			
		||||
            fetcher_obj = getattr(content_fetcher, prefer_fetch_backend)
 | 
			
		||||
        else:
 | 
			
		||||
            # What it referenced doesnt exist, Just use a default
 | 
			
		||||
            fetcher_obj = getattr(content_fetchers, "html_requests")
 | 
			
		||||
            # If the klass doesnt exist, just use a default
 | 
			
		||||
            fetcher_obj = getattr(content_fetcher, "html_requests")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        proxy_url = None
 | 
			
		||||
        if preferred_proxy_id:
 | 
			
		||||
            # Custom browser endpoints should NOT have a proxy added
 | 
			
		||||
            if not prefer_fetch_backend.startswith('extra_browser_'):
 | 
			
		||||
                proxy_url = self.datastore.proxy_list.get(preferred_proxy_id).get('url')
 | 
			
		||||
                logger.debug(f"Selected proxy key '{preferred_proxy_id}' as proxy URL '{proxy_url}' for {url}")
 | 
			
		||||
            else:
 | 
			
		||||
                logger.debug(f"Skipping adding proxy data when custom Browser endpoint is specified. ")
 | 
			
		||||
            proxy_url = self.datastore.proxy_list.get(preferred_proxy_id).get('url')
 | 
			
		||||
            logger.debug(f"Selected proxy key '{preferred_proxy_id}' as proxy URL '{proxy_url}' for {url}")
 | 
			
		||||
 | 
			
		||||
        # Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need.
 | 
			
		||||
        # When browser_connection_url is None, it method should default to working out whats the best defaults (os env vars etc)
 | 
			
		||||
 
 | 
			
		||||
@@ -8,9 +8,8 @@ import urllib3
 | 
			
		||||
 | 
			
		||||
from . import difference_detection_processor
 | 
			
		||||
from ..html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text
 | 
			
		||||
from changedetectionio import html_tools, content_fetchers
 | 
			
		||||
from changedetectionio import content_fetcher, html_tools
 | 
			
		||||
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
 | 
			
		||||
import changedetectionio.content_fetchers
 | 
			
		||||
from copy import deepcopy
 | 
			
		||||
from loguru import logger
 | 
			
		||||
 | 
			
		||||
@@ -61,7 +60,7 @@ class perform_site_check(difference_detection_processor):
 | 
			
		||||
        update_obj['previous_md5_before_filters'] = hashlib.md5(self.fetcher.content.encode('utf-8')).hexdigest()
 | 
			
		||||
        if skip_when_checksum_same:
 | 
			
		||||
            if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
 | 
			
		||||
                raise content_fetchers.exceptions.checksumFromPreviousCheckWasTheSame()
 | 
			
		||||
                raise content_fetcher.checksumFromPreviousCheckWasTheSame()
 | 
			
		||||
 | 
			
		||||
        # Fetching complete, now filters
 | 
			
		||||
 | 
			
		||||
@@ -119,7 +118,7 @@ class perform_site_check(difference_detection_processor):
 | 
			
		||||
        include_filters_from_tags = self.datastore.get_tag_overrides_for_watch(uuid=uuid, attr='include_filters')
 | 
			
		||||
 | 
			
		||||
        # 1845 - remove duplicated filters in both group and watch include filter
 | 
			
		||||
        include_filters_rule = list(dict.fromkeys(watch.get('include_filters', []) + include_filters_from_tags))
 | 
			
		||||
        include_filters_rule = list({*watch.get('include_filters', []), *include_filters_from_tags})
 | 
			
		||||
 | 
			
		||||
        subtractive_selectors = [*self.datastore.get_tag_overrides_for_watch(uuid=uuid, attr='subtractive_selectors'),
 | 
			
		||||
                                 *watch.get("subtractive_selectors", []),
 | 
			
		||||
@@ -244,7 +243,7 @@ class perform_site_check(difference_detection_processor):
 | 
			
		||||
        # Treat pages with no renderable text content as a change? No by default
 | 
			
		||||
        empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
 | 
			
		||||
        if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
 | 
			
		||||
            raise content_fetchers.exceptions.ReplyWithContentButNoText(url=url,
 | 
			
		||||
            raise content_fetcher.ReplyWithContentButNoText(url=url,
 | 
			
		||||
                                                            status_code=self.fetcher.get_last_status_code(),
 | 
			
		||||
                                                            screenshot=screenshot,
 | 
			
		||||
                                                            has_filters=has_filter_rule,
 | 
			
		||||
 
 | 
			
		||||
@@ -146,7 +146,7 @@ module.exports = async ({page, context}) => {
 | 
			
		||||
    var xpath_data;
 | 
			
		||||
    var instock_data;
 | 
			
		||||
    try {
 | 
			
		||||
        // Not sure the best way here, in the future this should be a new package added to npm then run in evaluatedCode
 | 
			
		||||
        // Not sure the best way here, in the future this should be a new package added to npm then run in browserless
 | 
			
		||||
        // (Once the old playwright is removed)
 | 
			
		||||
        xpath_data = await page.evaluate((include_filters) => {%xpath_scrape_code%}, include_filters);
 | 
			
		||||
        instock_data = await page.evaluate(() => {%instock_scrape_code%});
 | 
			
		||||
@@ -10,15 +10,12 @@ function isItemInStock() {
 | 
			
		||||
    const outOfStockTexts = [
 | 
			
		||||
        ' أخبرني عندما يتوفر',
 | 
			
		||||
        '0 in stock',
 | 
			
		||||
        'actuellement indisponible',
 | 
			
		||||
        'agotado',
 | 
			
		||||
        'article épuisé',
 | 
			
		||||
        'artikel zurzeit vergriffen',
 | 
			
		||||
        'as soon as stock is available',
 | 
			
		||||
        'ausverkauft', // sold out
 | 
			
		||||
        'available for back order',
 | 
			
		||||
        'awaiting stock',
 | 
			
		||||
        'back in stock soon',
 | 
			
		||||
        'back-order or out of stock',
 | 
			
		||||
        'backordered',
 | 
			
		||||
        'benachrichtigt mich', // notify me
 | 
			
		||||
@@ -27,7 +24,6 @@ function isItemInStock() {
 | 
			
		||||
        'coming soon',
 | 
			
		||||
        'currently have any tickets for this',
 | 
			
		||||
        'currently unavailable',
 | 
			
		||||
        'dieser artikel ist bald wieder verfügbar',
 | 
			
		||||
        'dostępne wkrótce',
 | 
			
		||||
        'en rupture de stock',
 | 
			
		||||
        'ist derzeit nicht auf lager',
 | 
			
		||||
@@ -46,9 +42,9 @@ function isItemInStock() {
 | 
			
		||||
        'no tickets available',
 | 
			
		||||
        'not available',
 | 
			
		||||
        'not currently available',
 | 
			
		||||
        'not in stock',
 | 
			
		||||
        'not in stock',        
 | 
			
		||||
        'notify me when available',
 | 
			
		||||
        'notify when available',
 | 
			
		||||
        'notify when available',            
 | 
			
		||||
        'não estamos a aceitar encomendas',
 | 
			
		||||
        'out of stock',
 | 
			
		||||
        'out-of-stock',
 | 
			
		||||
@@ -58,26 +54,18 @@ function isItemInStock() {
 | 
			
		||||
        'sold-out',
 | 
			
		||||
        'temporarily out of stock',
 | 
			
		||||
        'temporarily unavailable',
 | 
			
		||||
        'there were no search results for',
 | 
			
		||||
        'this item is currently unavailable',
 | 
			
		||||
        'tickets unavailable',
 | 
			
		||||
        'tijdelijk uitverkocht',
 | 
			
		||||
        'unavailable tickets',
 | 
			
		||||
        'vorbestellung ist bald möglich',
 | 
			
		||||
        'we couldn\'t find any products that match',
 | 
			
		||||
        'we do not currently have an estimate of when this product will be back in stock.',
 | 
			
		||||
        'we don\'t know when or if this item will be back in stock.',
 | 
			
		||||
        'we were not able to find a match',
 | 
			
		||||
        'zur zeit nicht an lager',
 | 
			
		||||
        '品切れ',
 | 
			
		||||
        '已售',
 | 
			
		||||
        '已售完',
 | 
			
		||||
        '품절'
 | 
			
		||||
    ];
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);
 | 
			
		||||
 | 
			
		||||
    function getElementBaseText(element) {
 | 
			
		||||
        // .textContent can include text from children which may give the wrong results
 | 
			
		||||
        // scan only immediate TEXT_NODEs, which will be a child of the element
 | 
			
		||||
@@ -88,69 +76,29 @@ function isItemInStock() {
 | 
			
		||||
        return text.toLowerCase().trim();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const negateOutOfStockRegex = new RegExp('^([0-9] in stock|add to cart|in stock)', 'ig');
 | 
			
		||||
    const negateOutOfStockRegex = new RegExp('([0-9] in stock|add to cart)', 'ig');
 | 
			
		||||
 | 
			
		||||
    // The out-of-stock or in-stock-text is generally always above-the-fold
 | 
			
		||||
    // and often below-the-fold is a list of related products that may or may not contain trigger text
 | 
			
		||||
    // so it's good to filter to just the 'above the fold' elements
 | 
			
		||||
    // and it should be atleast 100px from the top to ignore items in the toolbar, sometimes menu items like "Coming soon" exist
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
// @todo - if it's SVG or IMG, go into image diff mode
 | 
			
		||||
// %ELEMENTS% replaced at injection time because different interfaces use it with different settings
 | 
			
		||||
 | 
			
		||||
    console.log("Scanning %ELEMENTS%");
 | 
			
		||||
 | 
			
		||||
    function collectVisibleElements(parent, visibleElements) {
 | 
			
		||||
        if (!parent) return; // Base case: if parent is null or undefined, return
 | 
			
		||||
 | 
			
		||||
        // Add the parent itself to the visible elements array if it's of the specified types
 | 
			
		||||
        visibleElements.push(parent);
 | 
			
		||||
 | 
			
		||||
        // Iterate over the parent's children
 | 
			
		||||
        const children = parent.children;
 | 
			
		||||
        for (let i = 0; i < children.length; i++) {
 | 
			
		||||
            const child = children[i];
 | 
			
		||||
            if (
 | 
			
		||||
                child.nodeType === Node.ELEMENT_NODE &&
 | 
			
		||||
                window.getComputedStyle(child).display !== 'none' &&
 | 
			
		||||
                window.getComputedStyle(child).visibility !== 'hidden' &&
 | 
			
		||||
                child.offsetWidth >= 0 &&
 | 
			
		||||
                child.offsetHeight >= 0 &&
 | 
			
		||||
                window.getComputedStyle(child).contentVisibility !== 'hidden'
 | 
			
		||||
            ) {
 | 
			
		||||
                // If the child is an element and is visible, recursively collect visible elements
 | 
			
		||||
                collectVisibleElements(child, visibleElements);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const elementsToScan = [];
 | 
			
		||||
    collectVisibleElements(document.body, elementsToScan);
 | 
			
		||||
    const elementsToScan = Array.from(document.getElementsByTagName('*')).filter(element => element.getBoundingClientRect().top + window.scrollY <= vh && element.getBoundingClientRect().top + window.scrollY >= 100);
 | 
			
		||||
 | 
			
		||||
    var elementText = "";
 | 
			
		||||
 | 
			
		||||
    // REGEXS THAT REALLY MEAN IT'S IN STOCK
 | 
			
		||||
    for (let i = elementsToScan.length - 1; i >= 0; i--) {
 | 
			
		||||
        const element = elementsToScan[i];
 | 
			
		||||
 | 
			
		||||
        // outside the 'fold' or some weird text in the heading area
 | 
			
		||||
        // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
 | 
			
		||||
        if (element.getBoundingClientRect().top + window.scrollY >= vh || element.getBoundingClientRect().top + window.scrollY <= 100) {
 | 
			
		||||
            continue
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        elementText = "";
 | 
			
		||||
        if (element.tagName.toLowerCase() === "input") {
 | 
			
		||||
            elementText = element.value.toLowerCase().trim();
 | 
			
		||||
            elementText = element.value.toLowerCase();
 | 
			
		||||
        } else {
 | 
			
		||||
            elementText = getElementBaseText(element);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if (elementText.length) {
 | 
			
		||||
            // try which ones could mean its in stock
 | 
			
		||||
            if (negateOutOfStockRegex.test(elementText) && !elementText.includes('(0 products)')) {
 | 
			
		||||
                console.log(`Negating/overriding 'Out of Stock' back to "Possibly in stock" found "${elementText}"`)
 | 
			
		||||
            if (negateOutOfStockRegex.test(elementText)) {
 | 
			
		||||
                return 'Possibly in stock';
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
@@ -159,34 +107,28 @@ function isItemInStock() {
 | 
			
		||||
    // OTHER STUFF THAT COULD BE THAT IT'S OUT OF STOCK
 | 
			
		||||
    for (let i = elementsToScan.length - 1; i >= 0; i--) {
 | 
			
		||||
        const element = elementsToScan[i];
 | 
			
		||||
        // outside the 'fold' or some weird text in the heading area
 | 
			
		||||
        // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
 | 
			
		||||
        if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) {
 | 
			
		||||
            continue
 | 
			
		||||
        }
 | 
			
		||||
        elementText = "";
 | 
			
		||||
        if (element.tagName.toLowerCase() === "input") {
 | 
			
		||||
            elementText = element.value.toLowerCase().trim();
 | 
			
		||||
        } else {
 | 
			
		||||
            elementText = getElementBaseText(element);
 | 
			
		||||
        }
 | 
			
		||||
        if (element.offsetWidth > 0 || element.offsetHeight > 0 || element.getClientRects().length > 0) {
 | 
			
		||||
            elementText = "";
 | 
			
		||||
            if (element.tagName.toLowerCase() === "input") {
 | 
			
		||||
                elementText = element.value.toLowerCase();
 | 
			
		||||
            } else {
 | 
			
		||||
                elementText = getElementBaseText(element);
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
        if (elementText.length) {
 | 
			
		||||
            // and these mean its out of stock
 | 
			
		||||
            for (const outOfStockText of outOfStockTexts) {
 | 
			
		||||
                if (elementText.includes(outOfStockText)) {
 | 
			
		||||
                    console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`)
 | 
			
		||||
                    return outOfStockText; // item is out of stock
 | 
			
		||||
            if (elementText.length) {
 | 
			
		||||
                // and these mean its out of stock
 | 
			
		||||
                for (const outOfStockText of outOfStockTexts) {
 | 
			
		||||
                    if (elementText.includes(outOfStockText)) {
 | 
			
		||||
                        return outOfStockText; // item is out of stock
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    console.log(`Returning 'Possibly in stock' - cant' find any useful matching text`)
 | 
			
		||||
    return 'Possibly in stock'; // possibly in stock, cant decide otherwise.
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// returns the element text that makes it think it's out of stock
 | 
			
		||||
return isItemInStock().trim()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -16,23 +16,24 @@ try {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
// Include the getXpath script directly, easier than fetching
 | 
			
		||||
function getxpath(e) {
 | 
			
		||||
    var n = e;
 | 
			
		||||
    if (n && n.id) return '//*[@id="' + n.id + '"]';
 | 
			
		||||
    for (var o = []; n && Node.ELEMENT_NODE === n.nodeType;) {
 | 
			
		||||
        for (var i = 0, r = !1, d = n.previousSibling; d;) d.nodeType !== Node.DOCUMENT_TYPE_NODE && d.nodeName === n.nodeName && i++, d = d.previousSibling;
 | 
			
		||||
        for (d = n.nextSibling; d;) {
 | 
			
		||||
            if (d.nodeName === n.nodeName) {
 | 
			
		||||
                r = !0;
 | 
			
		||||
                break
 | 
			
		||||
        var n = e;
 | 
			
		||||
        if (n && n.id) return '//*[@id="' + n.id + '"]';
 | 
			
		||||
        for (var o = []; n && Node.ELEMENT_NODE === n.nodeType;) {
 | 
			
		||||
            for (var i = 0, r = !1, d = n.previousSibling; d;) d.nodeType !== Node.DOCUMENT_TYPE_NODE && d.nodeName === n.nodeName && i++, d = d.previousSibling;
 | 
			
		||||
            for (d = n.nextSibling; d;) {
 | 
			
		||||
                if (d.nodeName === n.nodeName) {
 | 
			
		||||
                    r = !0;
 | 
			
		||||
                    break
 | 
			
		||||
                }
 | 
			
		||||
                d = d.nextSibling
 | 
			
		||||
            }
 | 
			
		||||
            d = d.nextSibling
 | 
			
		||||
            o.push((n.prefix ? n.prefix + ":" : "") + n.localName + (i || r ? "[" + (i + 1) + "]" : "")), n = n.parentNode
 | 
			
		||||
        }
 | 
			
		||||
        o.push((n.prefix ? n.prefix + ":" : "") + n.localName + (i || r ? "[" + (i + 1) + "]" : "")), n = n.parentNode
 | 
			
		||||
        return o.length ? "/" + o.reverse().join("/") : ""
 | 
			
		||||
    }
 | 
			
		||||
    return o.length ? "/" + o.reverse().join("/") : ""
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
const findUpTag = (el) => {
 | 
			
		||||
    let r = el
 | 
			
		||||
@@ -58,14 +59,14 @@ const findUpTag = (el) => {
 | 
			
		||||
 | 
			
		||||
    // Strategy 2: Keep going up until we hit an ID tag, imagine it's like  #list-widget div h4
 | 
			
		||||
    while (r.parentNode) {
 | 
			
		||||
        if (depth === 5) {
 | 
			
		||||
        if (depth == 5) {
 | 
			
		||||
            break;
 | 
			
		||||
        }
 | 
			
		||||
        if ('' !== r.id) {
 | 
			
		||||
            chained_css.unshift("#" + CSS.escape(r.id));
 | 
			
		||||
            final_selector = chained_css.join(' > ');
 | 
			
		||||
            // Be sure theres only one, some sites have multiples of the same ID tag :-(
 | 
			
		||||
            if (window.document.querySelectorAll(final_selector).length === 1) {
 | 
			
		||||
            if (window.document.querySelectorAll(final_selector).length == 1) {
 | 
			
		||||
                return final_selector;
 | 
			
		||||
            }
 | 
			
		||||
            return null;
 | 
			
		||||
@@ -81,60 +82,30 @@ const findUpTag = (el) => {
 | 
			
		||||
 | 
			
		||||
// @todo - if it's SVG or IMG, go into image diff mode
 | 
			
		||||
// %ELEMENTS% replaced at injection time because different interfaces use it with different settings
 | 
			
		||||
 | 
			
		||||
var elements = window.document.querySelectorAll("%ELEMENTS%");
 | 
			
		||||
var size_pos = [];
 | 
			
		||||
// after page fetch, inject this JS
 | 
			
		||||
// build a map of all elements and their positions (maybe that only include text?)
 | 
			
		||||
var bbox;
 | 
			
		||||
console.log("Scanning %ELEMENTS%");
 | 
			
		||||
for (var i = 0; i < elements.length; i++) {
 | 
			
		||||
    bbox = elements[i].getBoundingClientRect();
 | 
			
		||||
 | 
			
		||||
function collectVisibleElements(parent, visibleElements) {
 | 
			
		||||
    if (!parent) return; // Base case: if parent is null or undefined, return
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    // Add the parent itself to the visible elements array if it's of the specified types
 | 
			
		||||
    const tagName = parent.tagName.toLowerCase();
 | 
			
		||||
    if ("%ELEMENTS%".split(',').includes(tagName)) {
 | 
			
		||||
        visibleElements.push(parent);
 | 
			
		||||
    // Exclude items that are not interactable or visible
 | 
			
		||||
    if(elements[i].style.opacity === "0") {
 | 
			
		||||
        continue
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Iterate over the parent's children
 | 
			
		||||
    const children = parent.children;
 | 
			
		||||
    for (let i = 0; i < children.length; i++) {
 | 
			
		||||
        const child = children[i];
 | 
			
		||||
        if (
 | 
			
		||||
            child.nodeType === Node.ELEMENT_NODE &&
 | 
			
		||||
            window.getComputedStyle(child).display !== 'none' &&
 | 
			
		||||
            window.getComputedStyle(child).visibility !== 'hidden' &&
 | 
			
		||||
            child.offsetWidth >= 0 &&
 | 
			
		||||
            child.offsetHeight >= 0 &&
 | 
			
		||||
            window.getComputedStyle(child).contentVisibility !== 'hidden'
 | 
			
		||||
        ) {
 | 
			
		||||
            // If the child is an element and is visible, recursively collect visible elements
 | 
			
		||||
            collectVisibleElements(child, visibleElements);
 | 
			
		||||
        }
 | 
			
		||||
    if(elements[i].style.display === "none" || elements[i].style.pointerEvents === "none" ) {
 | 
			
		||||
        continue
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Create an array to hold the visible elements
 | 
			
		||||
const visibleElementsArray = [];
 | 
			
		||||
 | 
			
		||||
// Call collectVisibleElements with the starting parent element
 | 
			
		||||
collectVisibleElements(document.body, visibleElementsArray);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
visibleElementsArray.forEach(function (element) {
 | 
			
		||||
 | 
			
		||||
    bbox = element.getBoundingClientRect();
 | 
			
		||||
 | 
			
		||||
    // Skip really small ones, and where width or height ==0
 | 
			
		||||
    if (bbox['width'] * bbox['height'] < 10) {
 | 
			
		||||
        return
 | 
			
		||||
    if (bbox['width'] * bbox['height'] < 100) {
 | 
			
		||||
        continue;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Don't include elements that are offset from canvas
 | 
			
		||||
    if (bbox['top'] + scroll_y < 0 || bbox['left'] < 0) {
 | 
			
		||||
        return
 | 
			
		||||
    if (bbox['top']+scroll_y < 0 || bbox['left'] < 0) {
 | 
			
		||||
        continue;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes
 | 
			
		||||
@@ -143,41 +114,46 @@ visibleElementsArray.forEach(function (element) {
 | 
			
		||||
 | 
			
		||||
    // 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us.
 | 
			
		||||
    xpath_result = false;
 | 
			
		||||
 | 
			
		||||
    try {
 | 
			
		||||
        var d = findUpTag(element);
 | 
			
		||||
        var d = findUpTag(elements[i]);
 | 
			
		||||
        if (d) {
 | 
			
		||||
            xpath_result = d;
 | 
			
		||||
        }
 | 
			
		||||
    } catch (e) {
 | 
			
		||||
        console.log(e);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // You could swap it and default to getXpath and then try the smarter one
 | 
			
		||||
    // default back to the less intelligent one
 | 
			
		||||
    if (!xpath_result) {
 | 
			
		||||
        try {
 | 
			
		||||
            // I've seen on FB and eBay that this doesnt work
 | 
			
		||||
            // ReferenceError: getXPath is not defined at eval (eval at evaluate (:152:29), <anonymous>:67:20) at UtilityScript.evaluate (<anonymous>:159:18) at UtilityScript.<anonymous> (<anonymous>:1:44)
 | 
			
		||||
            xpath_result = getxpath(element);
 | 
			
		||||
            xpath_result = getxpath(elements[i]);
 | 
			
		||||
        } catch (e) {
 | 
			
		||||
            console.log(e);
 | 
			
		||||
            return
 | 
			
		||||
            continue;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (window.getComputedStyle(elements[i]).visibility === "hidden") {
 | 
			
		||||
        continue;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // @todo Possible to ONLY list where it's clickable to save JSON xfer size
 | 
			
		||||
    size_pos.push({
 | 
			
		||||
        xpath: xpath_result,
 | 
			
		||||
        width: Math.round(bbox['width']),
 | 
			
		||||
        height: Math.round(bbox['height']),
 | 
			
		||||
        left: Math.floor(bbox['left']),
 | 
			
		||||
        top: Math.floor(bbox['top']) + scroll_y,
 | 
			
		||||
        tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
 | 
			
		||||
        tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
 | 
			
		||||
        isClickable: window.getComputedStyle(element).cursor == "pointer"
 | 
			
		||||
        top: Math.floor(bbox['top'])+scroll_y,
 | 
			
		||||
        tagName: (elements[i].tagName) ? elements[i].tagName.toLowerCase() : '',
 | 
			
		||||
        tagtype: (elements[i].tagName == 'INPUT' && elements[i].type) ? elements[i].type.toLowerCase() : '',
 | 
			
		||||
        isClickable: (elements[i].onclick) || window.getComputedStyle(elements[i]).cursor == "pointer"
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
});
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Inject the current one set in the include_filters, which may be a CSS rule
 | 
			
		||||
// used for displaying the current one in VisualSelector, where it's not one we generated.
 | 
			
		||||
@@ -204,7 +180,7 @@ if (include_filters.length) {
 | 
			
		||||
            }
 | 
			
		||||
        } catch (e) {
 | 
			
		||||
            // Maybe catch DOMException and alert?
 | 
			
		||||
            console.log("xpath_element_scraper: Exception selecting element from filter " + f);
 | 
			
		||||
            console.log("xpath_element_scraper: Exception selecting element from filter "+f);
 | 
			
		||||
            console.log(e);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
@@ -234,8 +210,8 @@ if (include_filters.length) {
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if (!q) {
 | 
			
		||||
        
 | 
			
		||||
        if(!q) {
 | 
			
		||||
            console.log("xpath_element_scraper: filter element " + f + " was not found");
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
@@ -245,7 +221,7 @@ if (include_filters.length) {
 | 
			
		||||
                width: parseInt(bbox['width']),
 | 
			
		||||
                height: parseInt(bbox['height']),
 | 
			
		||||
                left: parseInt(bbox['left']),
 | 
			
		||||
                top: parseInt(bbox['top']) + scroll_y
 | 
			
		||||
                top: parseInt(bbox['top'])+scroll_y
 | 
			
		||||
            });
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
@@ -253,7 +229,7 @@ if (include_filters.length) {
 | 
			
		||||
 | 
			
		||||
// Sort the elements so we find the smallest one first, in other words, we find the smallest one matching in that area
 | 
			
		||||
// so that we don't select the wrapping element by mistake and become unable to select what we want
 | 
			
		||||
size_pos.sort((a, b) => (a.width * a.height > b.width * b.height) ? 1 : -1)
 | 
			
		||||
size_pos.sort((a, b) => (a.width*a.height > b.width*b.height) ? 1 : -1)
 | 
			
		||||
 | 
			
		||||
// Window.width required for proper scaling in the frontend
 | 
			
		||||
return {'size_pos': size_pos, 'browser_width': window.innerWidth};
 | 
			
		||||
@@ -2,22 +2,20 @@
 | 
			
		||||
 | 
			
		||||
# run some tests and look if the 'custom-browser-search-string=1' connect string appeared in the correct containers
 | 
			
		||||
 | 
			
		||||
# @todo do it again but with the puppeteer one
 | 
			
		||||
 | 
			
		||||
# enable debug
 | 
			
		||||
set -x
 | 
			
		||||
 | 
			
		||||
# An extra browser is configured, but we never chose to use it, so it should NOT show in the logs
 | 
			
		||||
docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/custom_browser_url/test_custom_browser_url.py::test_request_not_via_custom_browser_url'
 | 
			
		||||
docker logs sockpuppetbrowser-custom-url &>log-custom.txt
 | 
			
		||||
grep 'custom-browser-search-string=1' log-custom.txt
 | 
			
		||||
docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/custom_browser_url/test_custom_browser_url.py::test_request_not_via_custom_browser_url'
 | 
			
		||||
docker logs browserless-custom-url &>log.txt
 | 
			
		||||
grep 'custom-browser-search-string=1' log.txt
 | 
			
		||||
if [ $? -ne 1 ]
 | 
			
		||||
then
 | 
			
		||||
  echo "Saw a request in 'sockpuppetbrowser-custom-url' container with 'custom-browser-search-string=1' when I should not - log-custom.txt"
 | 
			
		||||
  echo "Saw a request in 'browserless-custom-url' container with 'custom-browser-search-string=1' when I should not"
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
docker logs sockpuppetbrowser &>log.txt
 | 
			
		||||
docker logs browserless &>log.txt
 | 
			
		||||
grep 'custom-browser-search-string=1' log.txt
 | 
			
		||||
if [ $? -ne 1 ]
 | 
			
		||||
then
 | 
			
		||||
@@ -26,16 +24,16 @@ then
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
# Special connect string should appear in the custom-url container, but not in the 'default' one
 | 
			
		||||
docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/custom_browser_url/test_custom_browser_url.py::test_request_via_custom_browser_url'
 | 
			
		||||
docker logs sockpuppetbrowser-custom-url &>log-custom.txt
 | 
			
		||||
grep 'custom-browser-search-string=1' log-custom.txt
 | 
			
		||||
docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/custom_browser_url/test_custom_browser_url.py::test_request_via_custom_browser_url'
 | 
			
		||||
docker logs browserless-custom-url &>log.txt
 | 
			
		||||
grep 'custom-browser-search-string=1' log.txt
 | 
			
		||||
if [ $? -ne 0 ]
 | 
			
		||||
then
 | 
			
		||||
  echo "Did not see request in 'sockpuppetbrowser-custom-url' container with 'custom-browser-search-string=1' when I should - log-custom.txt"
 | 
			
		||||
  echo "Did not see request in 'browserless-custom-url' container with 'custom-browser-search-string=1' when I should"
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
docker logs sockpuppetbrowser &>log.txt
 | 
			
		||||
docker logs browserless &>log.txt
 | 
			
		||||
grep 'custom-browser-search-string=1' log.txt
 | 
			
		||||
if [ $? -ne 1 ]
 | 
			
		||||
then
 | 
			
		||||
 
 | 
			
		||||
@@ -10,7 +10,41 @@ set -x
 | 
			
		||||
docker run --network changedet-network -d --name squid-one --hostname squid-one --rm -v `pwd`/tests/proxy_list/squid.conf:/etc/squid/conf.d/debian.conf ubuntu/squid:4.13-21.10_edge
 | 
			
		||||
docker run --network changedet-network -d --name squid-two --hostname squid-two --rm -v `pwd`/tests/proxy_list/squid.conf:/etc/squid/conf.d/debian.conf ubuntu/squid:4.13-21.10_edge
 | 
			
		||||
 | 
			
		||||
# Used for configuring a custom proxy URL via the UI - with username+password auth
 | 
			
		||||
# SOCKS5 related - start simple Socks5 proxy server
 | 
			
		||||
# SOCKSTEST=xyz should show in the logs of this service to confirm it fetched
 | 
			
		||||
docker run --network changedet-network -d --hostname socks5proxy --name socks5proxy -p 1080:1080 -e PROXY_USER=proxy_user123 -e PROXY_PASSWORD=proxy_pass123 serjs/go-socks5-proxy
 | 
			
		||||
docker run --network changedet-network -d --hostname socks5proxy-noauth -p 1081:1080 --name socks5proxy-noauth  serjs/go-socks5-proxy
 | 
			
		||||
 | 
			
		||||
echo "---------------------------------- SOCKS5 -------------------"
 | 
			
		||||
# SOCKS5 related - test from proxies.json
 | 
			
		||||
docker run --network changedet-network \
 | 
			
		||||
  -v `pwd`/tests/proxy_socks5/proxies.json-example:/app/changedetectionio/test-datastore/proxies.json \
 | 
			
		||||
  --rm \
 | 
			
		||||
  -e "SOCKSTEST=proxiesjson" \
 | 
			
		||||
  test-changedetectionio \
 | 
			
		||||
  bash -c 'cd changedetectionio && pytest tests/proxy_socks5/test_socks5_proxy_sources.py'
 | 
			
		||||
 | 
			
		||||
# SOCKS5 related - by manually entering in UI
 | 
			
		||||
docker run --network changedet-network \
 | 
			
		||||
  --rm \
 | 
			
		||||
  -e "SOCKSTEST=manual" \
 | 
			
		||||
  test-changedetectionio \
 | 
			
		||||
  bash -c 'cd changedetectionio && pytest tests/proxy_socks5/test_socks5_proxy.py'
 | 
			
		||||
 | 
			
		||||
# SOCKS5 related - test from proxies.json via playwright - NOTE: Playwright doesn't support authenticating proxies
 | 
			
		||||
docker run --network changedet-network \
 | 
			
		||||
  -e "SOCKSTEST=manual-playwright" \
 | 
			
		||||
  -v `pwd`/tests/proxy_socks5/proxies.json-example-noauth:/app/changedetectionio/test-datastore/proxies.json \
 | 
			
		||||
  -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" \
 | 
			
		||||
  --rm \
 | 
			
		||||
  test-changedetectionio \
 | 
			
		||||
  bash -c 'cd changedetectionio && pytest tests/proxy_socks5/test_socks5_proxy_sources.py'
 | 
			
		||||
 | 
			
		||||
echo "socks5 server logs"
 | 
			
		||||
docker logs socks5proxy
 | 
			
		||||
echo "----------------------------------"
 | 
			
		||||
 | 
			
		||||
# Used for configuring a custom proxy URL via the UI
 | 
			
		||||
docker run --network changedet-network -d \
 | 
			
		||||
  --name squid-custom \
 | 
			
		||||
  --hostname squid-custom \
 | 
			
		||||
@@ -26,17 +60,15 @@ docker run --network changedet-network \
 | 
			
		||||
  test-changedetectionio \
 | 
			
		||||
  bash -c 'cd changedetectionio && pytest tests/proxy_list/test_multiple_proxy.py'
 | 
			
		||||
 | 
			
		||||
set +e
 | 
			
		||||
echo "- Looking for chosen.changedetection.io request in squid-one - it should NOT be here"
 | 
			
		||||
 | 
			
		||||
## Should be a request in the default "first" squid
 | 
			
		||||
docker logs squid-one 2>/dev/null|grep chosen.changedetection.io
 | 
			
		||||
if [ $? -ne 1 ]
 | 
			
		||||
if [ $? -ne 0 ]
 | 
			
		||||
then
 | 
			
		||||
  echo "Saw a request to chosen.changedetection.io in the squid logs (while checking preferred proxy - squid one) WHEN I SHOULD NOT"
 | 
			
		||||
  echo "Did not see a request to chosen.changedetection.io in the squid logs (while checking preferred proxy - squid one)"
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
set -e
 | 
			
		||||
echo "- Looking for chosen.changedetection.io request in squid-two"
 | 
			
		||||
# And one in the 'second' squid (user selects this as preferred)
 | 
			
		||||
docker logs squid-two 2>/dev/null|grep chosen.changedetection.io
 | 
			
		||||
if [ $? -ne 0 ]
 | 
			
		||||
@@ -45,6 +77,7 @@ then
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Test the UI configurable proxies
 | 
			
		||||
docker run --network changedet-network \
 | 
			
		||||
  test-changedetectionio \
 | 
			
		||||
@@ -52,7 +85,6 @@ docker run --network changedet-network \
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Should see a request for one.changedetection.io in there
 | 
			
		||||
echo "- Looking for .changedetection.io request in squid-custom"
 | 
			
		||||
docker logs squid-custom 2>/dev/null|grep "TCP_TUNNEL.200.*changedetection.io"
 | 
			
		||||
if [ $? -ne 0 ]
 | 
			
		||||
then
 | 
			
		||||
@@ -69,7 +101,7 @@ docker run --network changedet-network \
 | 
			
		||||
set +e
 | 
			
		||||
# Check request was never seen in any container
 | 
			
		||||
for c in $(echo "squid-one squid-two squid-custom"); do
 | 
			
		||||
  echo ....Checking $c
 | 
			
		||||
  echo Checking $c
 | 
			
		||||
  docker logs $c &> $c.txt
 | 
			
		||||
  grep noproxy $c.txt
 | 
			
		||||
  if [ $? -ne 1 ]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,43 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# exit when any command fails
 | 
			
		||||
set -e
 | 
			
		||||
# enable debug
 | 
			
		||||
set -x
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# SOCKS5 related - start simple Socks5 proxy server
 | 
			
		||||
# SOCKSTEST=xyz should show in the logs of this service to confirm it fetched
 | 
			
		||||
docker run --network changedet-network -d --hostname socks5proxy --rm  --name socks5proxy -p 1080:1080 -e PROXY_USER=proxy_user123 -e PROXY_PASSWORD=proxy_pass123 serjs/go-socks5-proxy
 | 
			
		||||
docker run --network changedet-network -d --hostname socks5proxy-noauth --rm  -p 1081:1080 --name socks5proxy-noauth  serjs/go-socks5-proxy
 | 
			
		||||
 | 
			
		||||
echo "---------------------------------- SOCKS5 -------------------"
 | 
			
		||||
# SOCKS5 related - test from proxies.json
 | 
			
		||||
docker run --network changedet-network \
 | 
			
		||||
  -v `pwd`/tests/proxy_socks5/proxies.json-example:/app/changedetectionio/test-datastore/proxies.json \
 | 
			
		||||
  --rm \
 | 
			
		||||
  -e "SOCKSTEST=proxiesjson" \
 | 
			
		||||
  test-changedetectionio \
 | 
			
		||||
  bash -c 'cd changedetectionio && pytest tests/proxy_socks5/test_socks5_proxy_sources.py'
 | 
			
		||||
 | 
			
		||||
# SOCKS5 related - by manually entering in UI
 | 
			
		||||
docker run --network changedet-network \
 | 
			
		||||
  --rm \
 | 
			
		||||
  -e "SOCKSTEST=manual" \
 | 
			
		||||
  test-changedetectionio \
 | 
			
		||||
  bash -c 'cd changedetectionio && pytest tests/proxy_socks5/test_socks5_proxy.py'
 | 
			
		||||
 | 
			
		||||
# SOCKS5 related - test from proxies.json via playwright - NOTE: Playwright doesn't support authenticating proxies
 | 
			
		||||
docker run --network changedet-network \
 | 
			
		||||
  -e "SOCKSTEST=manual-playwright" \
 | 
			
		||||
  -v `pwd`/tests/proxy_socks5/proxies.json-example-noauth:/app/changedetectionio/test-datastore/proxies.json \
 | 
			
		||||
  -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" \
 | 
			
		||||
  --rm \
 | 
			
		||||
  test-changedetectionio \
 | 
			
		||||
  bash -c 'cd changedetectionio && pytest tests/proxy_socks5/test_socks5_proxy_sources.py'
 | 
			
		||||
 | 
			
		||||
echo "socks5 server logs"
 | 
			
		||||
docker logs socks5proxy
 | 
			
		||||
echo "----------------------------------"
 | 
			
		||||
 | 
			
		||||
docker kill socks5proxy socks5proxy-noauth
 | 
			
		||||
@@ -10,7 +10,7 @@ $(document).ready(function () {
 | 
			
		||||
        }
 | 
			
		||||
    })
 | 
			
		||||
    var browsersteps_session_id;
 | 
			
		||||
    var browser_interface_seconds_remaining = 0;
 | 
			
		||||
    var browserless_seconds_remaining = 0;
 | 
			
		||||
    var apply_buttons_disabled = false;
 | 
			
		||||
    var include_text_elements = $("#include_text_elements");
 | 
			
		||||
    var xpath_data = false;
 | 
			
		||||
@@ -49,7 +49,7 @@ $(document).ready(function () {
 | 
			
		||||
        $('#browsersteps-img').removeAttr('src');
 | 
			
		||||
        $("#browsersteps-click-start").show();
 | 
			
		||||
        $("#browsersteps-selector-wrapper .spinner").hide();
 | 
			
		||||
        browser_interface_seconds_remaining = 0;
 | 
			
		||||
        browserless_seconds_remaining = 0;
 | 
			
		||||
        browsersteps_session_id = false;
 | 
			
		||||
        apply_buttons_disabled = false;
 | 
			
		||||
        ctx.clearRect(0, 0, c.width, c.height);
 | 
			
		||||
@@ -61,12 +61,12 @@ $(document).ready(function () {
 | 
			
		||||
        $('#browser_steps >li:first-child').css('opacity', '0.5');
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Show seconds remaining until the browser interface needs to restart the session
 | 
			
		||||
    // Show seconds remaining until playwright/browserless needs to restart the session
 | 
			
		||||
    // (See comment at the top of changedetectionio/blueprint/browser_steps/__init__.py )
 | 
			
		||||
    setInterval(() => {
 | 
			
		||||
        if (browser_interface_seconds_remaining >= 1) {
 | 
			
		||||
            document.getElementById('browser-seconds-remaining').innerText = browser_interface_seconds_remaining + " seconds remaining in session";
 | 
			
		||||
            browser_interface_seconds_remaining -= 1;
 | 
			
		||||
        if (browserless_seconds_remaining >= 1) {
 | 
			
		||||
            document.getElementById('browserless-seconds-remaining').innerText = browserless_seconds_remaining + " seconds remaining in session";
 | 
			
		||||
            browserless_seconds_remaining -= 1;
 | 
			
		||||
        }
 | 
			
		||||
    }, "1000")
 | 
			
		||||
 | 
			
		||||
@@ -160,12 +160,6 @@ $(document).ready(function () {
 | 
			
		||||
                    e.offsetX > item.left * y_scale && e.offsetX < item.left * y_scale + item.width * y_scale
 | 
			
		||||
 | 
			
		||||
                ) {
 | 
			
		||||
                    // Ignore really large ones, because we are scraping 'div' also from xpath_element_scraper but
 | 
			
		||||
                    // that div or whatever could be some wrapper and would generally make you select the whole page
 | 
			
		||||
                    if (item.width > 800 && item.height > 400) {
 | 
			
		||||
                        return
 | 
			
		||||
                    }
 | 
			
		||||
 | 
			
		||||
                    // There could be many elements here, record them all and then we'll find out which is the most 'useful'
 | 
			
		||||
                    // (input, textarea, button, A etc)
 | 
			
		||||
                    if (item.width < xpath_data['browser_width']) {
 | 
			
		||||
@@ -267,7 +261,7 @@ $(document).ready(function () {
 | 
			
		||||
            // This should trigger 'Goto site'
 | 
			
		||||
            console.log("Got startup response, requesting Goto-Site (first) step fake click");
 | 
			
		||||
            $('#browser_steps >li:first-child .apply').click();
 | 
			
		||||
            browser_interface_seconds_remaining = 500;
 | 
			
		||||
            browserless_seconds_remaining = 500;
 | 
			
		||||
            set_first_gotosite_disabled();
 | 
			
		||||
        }).fail(function (data) {
 | 
			
		||||
            console.log(data);
 | 
			
		||||
 
 | 
			
		||||
@@ -28,11 +28,15 @@ $(document).ready(function() {
 | 
			
		||||
      notification_format: $('#notification_format').val(),
 | 
			
		||||
      notification_title: $('#notification_title').val(),
 | 
			
		||||
      notification_urls: $('.notification-urls').val(),
 | 
			
		||||
      tags: $('#tags').val(),
 | 
			
		||||
      window_url: window.location.href,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    if (!data['notification_urls'].length) {
 | 
			
		||||
      alert("Notification URL list is empty, cannot send test.")
 | 
			
		||||
      return;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    $.ajax({
 | 
			
		||||
      type: "POST",
 | 
			
		||||
      url: notification_base_url,
 | 
			
		||||
@@ -45,7 +49,7 @@ $(document).ready(function() {
 | 
			
		||||
      }
 | 
			
		||||
    }).done(function(data){
 | 
			
		||||
      console.log(data);
 | 
			
		||||
      alert(data);
 | 
			
		||||
      alert('Sent');
 | 
			
		||||
    }).fail(function(data){
 | 
			
		||||
      console.log(data);
 | 
			
		||||
      alert('There was an error communicating with the server.');
 | 
			
		||||
 
 | 
			
		||||
@@ -1096,16 +1096,3 @@ ul {
 | 
			
		||||
  white-space: nowrap;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#chrome-extension-link {
 | 
			
		||||
  img {
 | 
			
		||||
    height: 21px;
 | 
			
		||||
    padding: 2px;
 | 
			
		||||
    vertical-align: middle;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  padding: 9px;
 | 
			
		||||
  border: 1px solid var(--color-grey-800);
 | 
			
		||||
  border-radius: 10px;
 | 
			
		||||
  vertical-align: middle;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1180,13 +1180,3 @@ ul {
 | 
			
		||||
  .restock-label.not-in-stock {
 | 
			
		||||
    background-color: var(--color-background-button-cancel);
 | 
			
		||||
    color: #777; }
 | 
			
		||||
 | 
			
		||||
#chrome-extension-link {
 | 
			
		||||
  padding: 9px;
 | 
			
		||||
  border: 1px solid var(--color-grey-800);
 | 
			
		||||
  border-radius: 10px;
 | 
			
		||||
  vertical-align: middle; }
 | 
			
		||||
  #chrome-extension-link img {
 | 
			
		||||
    height: 21px;
 | 
			
		||||
    padding: 2px;
 | 
			
		||||
    vertical-align: middle; }
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,4 @@
 | 
			
		||||
from changedetectionio.strtobool import strtobool
 | 
			
		||||
from distutils.util import strtobool
 | 
			
		||||
 | 
			
		||||
from flask import (
 | 
			
		||||
    flash
 | 
			
		||||
@@ -657,10 +657,7 @@ class ChangeDetectionStore:
 | 
			
		||||
        return res
 | 
			
		||||
 | 
			
		||||
    def tag_exists_by_name(self, tag_name):
 | 
			
		||||
        # Check if any tag dictionary has a 'title' attribute matching the provided tag_name
 | 
			
		||||
        tags = self.__data['settings']['application']['tags'].values()
 | 
			
		||||
        return next((v for v in tags if v.get('title', '').lower() == tag_name.lower()),
 | 
			
		||||
                    None)
 | 
			
		||||
        return any(v.get('title', '').lower() == tag_name.lower() for k, v in self.__data['settings']['application']['tags'].items())
 | 
			
		||||
 | 
			
		||||
    def get_updates_available(self):
 | 
			
		||||
        import inspect
 | 
			
		||||
 
 | 
			
		||||
@@ -1,23 +0,0 @@
 | 
			
		||||
# Local replacement for distutils.util.strtobool, because distutils was removed in Python 3.12
 | 
			
		||||
 | 
			
		||||
_MAP = {
 | 
			
		||||
    'y': True,
 | 
			
		||||
    'yes': True,
 | 
			
		||||
    't': True,
 | 
			
		||||
    'true': True,
 | 
			
		||||
    'on': True,
 | 
			
		||||
    '1': True,
 | 
			
		||||
    'n': False,
 | 
			
		||||
    'no': False,
 | 
			
		||||
    'f': False,
 | 
			
		||||
    'false': False,
 | 
			
		||||
    'off': False,
 | 
			
		||||
    '0': False
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def strtobool(value):
    """Convert a truthy/falsy value to a bool via the module-level ``_MAP`` table.

    ``value`` is passed through ``str()`` and lowercased before the lookup, so
    matching is case-insensitive. No whitespace stripping is performed.

    Returns:
        The bool that ``_MAP`` holds for the lowercased string form of ``value``.

    Raises:
        ValueError: if the lowercased string form of ``value`` is not a key
            of ``_MAP``.
    """
 | 
			
		||||
    try:
 | 
			
		||||
        # str() means non-string inputs are looked up by their string form.
        return _MAP[str(value).lower()]
 | 
			
		||||
    except KeyError:
 | 
			
		||||
        # Report an unknown key as ValueError, quoting the original value.
        raise ValueError('"{}" is not a valid bool value'.format(value))
 | 
			
		||||
@@ -147,19 +147,7 @@
 | 
			
		||||
    <section class="content">
 | 
			
		||||
        <div id="overlay">
 | 
			
		||||
            <div class="content">
 | 
			
		||||
                <h4>Try our Chrome extension</h4>
 | 
			
		||||
                <p>
 | 
			
		||||
                    <a id="chrome-extension-link"
 | 
			
		||||
                       title="Try our new Chrome Extension!"
 | 
			
		||||
                       href="https://chromewebstore.google.com/detail/changedetectionio-website/kefcfmgmlhmankjmnbijimhofdjekbop">
 | 
			
		||||
                        <img src="{{url_for('static_content', group='images', filename='Google-Chrome-icon.png')}}">
 | 
			
		||||
                        Chrome Webstore
 | 
			
		||||
                    </a>
 | 
			
		||||
                </p>
 | 
			
		||||
 | 
			
		||||
                Easily add the current web-page from your browser directly into your changedetection.io tool, more great features coming soon!
 | 
			
		||||
 | 
			
		||||
                <h4>Changedetection.io needs your support!</h4>
 | 
			
		||||
                <strong>changedetection.io needs your support!</strong><br>
 | 
			
		||||
                <p>
 | 
			
		||||
                    You can help us by supporting changedetection.io on these platforms;
 | 
			
		||||
                </p>
 | 
			
		||||
 
 | 
			
		||||
@@ -7,8 +7,7 @@
 | 
			
		||||
<script>
 | 
			
		||||
    const browser_steps_available_screenshots=JSON.parse('{{ watch.get_browsersteps_available_screenshots|tojson }}');
 | 
			
		||||
    const browser_steps_config=JSON.parse('{{ browser_steps_config|tojson }}');
 | 
			
		||||
    <!-- Should be _external so that firefox and others load it more reliably -->
 | 
			
		||||
    const browser_steps_fetch_screenshot_image_url="{{url_for('browser_steps.browser_steps_fetch_screenshot_image', uuid=uuid, _external=True)}}";
 | 
			
		||||
    const browser_steps_fetch_screenshot_image_url="{{url_for('browser_steps.browser_steps_fetch_screenshot_image', uuid=uuid)}}";
 | 
			
		||||
    const browser_steps_last_error_step={{ watch.browser_steps_last_error_step|tojson }};
 | 
			
		||||
    const browser_steps_start_url="{{url_for('browser_steps.browsersteps_start_session', uuid=uuid)}}";
 | 
			
		||||
    const browser_steps_sync_url="{{url_for('browser_steps.browsersteps_ui_update', uuid=uuid)}}";
 | 
			
		||||
@@ -32,7 +31,6 @@
 | 
			
		||||
<script src="{{url_for('static_content', group='js', filename='browser-steps.js')}}" defer></script>
 | 
			
		||||
{% endif %}
 | 
			
		||||
 | 
			
		||||
{% set has_tag_filters_extra="WARNING: Watch has tag/groups set with special filters\n" if has_special_tag_options else '' %}
 | 
			
		||||
<script src="{{url_for('static_content', group='js', filename='recheck-proxy.js')}}" defer></script>
 | 
			
		||||
 | 
			
		||||
<div class="edit-form monospaced-textarea">
 | 
			
		||||
@@ -230,7 +228,7 @@ User-Agent: wonderbra 1.0") }}
 | 
			
		||||
                                </div>
 | 
			
		||||
                            </div>
 | 
			
		||||
                            <div id="browser-steps-fieldlist" style="padding-left: 1em;  width: 350px; font-size: 80%;" >
 | 
			
		||||
                                <span id="browser-seconds-remaining">Loading</span> <span style="font-size: 80%;"> (<a target=_new href="https://github.com/dgtlmoon/changedetection.io/pull/478/files#diff-1a79d924d1840c485238e66772391268a89c95b781d69091384cf1ea1ac146c9R4">?</a>) </span>
 | 
			
		||||
                                <span id="browserless-seconds-remaining">Loading</span> <span style="font-size: 80%;"> (<a target=_new href="https://github.com/dgtlmoon/changedetection.io/pull/478/files#diff-1a79d924d1840c485238e66772391268a89c95b781d69091384cf1ea1ac146c9R4">?</a>) </span>
 | 
			
		||||
                                {{ render_field(form.browser_steps) }}
 | 
			
		||||
                            </div>
 | 
			
		||||
                        </div>
 | 
			
		||||
@@ -282,7 +280,7 @@ User-Agent: wonderbra 1.0") }}
 | 
			
		||||
                    <div class="pure-control-group">
 | 
			
		||||
                        {% set field = render_field(form.include_filters,
 | 
			
		||||
                            rows=5,
 | 
			
		||||
                            placeholder=has_tag_filters_extra+"#example
 | 
			
		||||
                            placeholder="#example
 | 
			
		||||
xpath://body/div/span[contains(@class, 'example-class')]",
 | 
			
		||||
                            class="m-d")
 | 
			
		||||
                        %}
 | 
			
		||||
@@ -318,14 +316,13 @@ xpath://body/div/span[contains(@class, 'example-class')]",
 | 
			
		||||
                </span>
 | 
			
		||||
                    </div>
 | 
			
		||||
                <fieldset class="pure-control-group">
 | 
			
		||||
                    {{ render_field(form.subtractive_selectors, rows=5, placeholder=has_tag_filters_extra+"header
 | 
			
		||||
                    {{ render_field(form.subtractive_selectors, rows=5, placeholder="header
 | 
			
		||||
footer
 | 
			
		||||
nav
 | 
			
		||||
.stockticker") }}
 | 
			
		||||
                    <span class="pure-form-message-inline">
 | 
			
		||||
                        <ul>
 | 
			
		||||
                          <li> Remove HTML element(s) by CSS selector before text conversion. </li>
 | 
			
		||||
                          <li> Don't paste HTML here, use only CSS selectors </li>
 | 
			
		||||
                          <li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li>
 | 
			
		||||
                        </ul>
 | 
			
		||||
                      </span>
 | 
			
		||||
@@ -439,7 +436,7 @@ Unavailable") }}
 | 
			
		||||
                    <div class="pure-control-group">
 | 
			
		||||
                        {% if visualselector_enabled %}
 | 
			
		||||
                            <span class="pure-form-message-inline">
 | 
			
		||||
                                The Visual Selector tool lets you select the <i>text</i> elements that will be used for the change detection ‐ after the <i>Browser Steps</i> has completed, this tool is a helper to manage filters in the  "CSS/JSONPath/JQ/XPath Filters" box of the <a href="#filters-and-triggers">Filters & Triggers</a> tab.
 | 
			
		||||
                                The Visual Selector tool lets you select the <i>text</i> elements that will be used for the change detection ‐ after the <i>Browser Steps</i> has completed.<br><br>
 | 
			
		||||
                            </span>
 | 
			
		||||
 | 
			
		||||
                            <div id="selector-header">
 | 
			
		||||
 
 | 
			
		||||
@@ -107,7 +107,7 @@
 | 
			
		||||
                                    <option value="" style="color: #aaa"> -- none --</option>
 | 
			
		||||
                                    <option value="url">URL</option>
 | 
			
		||||
                                    <option value="title">Title</option>
 | 
			
		||||
                                    <option value="include_filters">CSS/xPath filter</option>
 | 
			
		||||
                                    <option value="include_filter">CSS/xPath filter</option>
 | 
			
		||||
                                    <option value="tag">Group / Tag name(s)</option>
 | 
			
		||||
                                    <option value="interval_minutes">Recheck time (minutes)</option>
 | 
			
		||||
                                </select></td>
 | 
			
		||||
 
 | 
			
		||||
@@ -4,7 +4,7 @@
 | 
			
		||||
{% from '_helpers.jinja' import render_field, render_checkbox_field, render_button %}
 | 
			
		||||
{% from '_common_fields.jinja' import render_common_settings_form %}
 | 
			
		||||
<script>
 | 
			
		||||
    const notification_base_url="{{url_for('ajax_callback_send_notification_test', mode="global-settings")}}";
 | 
			
		||||
    const notification_base_url="{{url_for('ajax_callback_send_notification_test', watch_uuid=uuid)}}";
 | 
			
		||||
{% if emailprefix %}
 | 
			
		||||
    const email_notification_prefix=JSON.parse('{{emailprefix|tojson}}');
 | 
			
		||||
{% endif %}
 | 
			
		||||
@@ -168,12 +168,12 @@ nav
 | 
			
		||||
           </div>
 | 
			
		||||
 | 
			
		||||
            <div class="tab-pane-inner" id="api">
 | 
			
		||||
                <h4>API Access</h4>
 | 
			
		||||
 | 
			
		||||
                <p>Drive your changedetection.io via API, More about <a href="https://github.com/dgtlmoon/changedetection.io/wiki/API-Reference">API access here</a></p>
 | 
			
		||||
 | 
			
		||||
                <div class="pure-control-group">
 | 
			
		||||
                    {{ render_checkbox_field(form.application.form.api_access_token_enabled) }}
 | 
			
		||||
                    <div class="pure-form-message-inline">Restrict API access limit by using <code>x-api-key</code> header - required for the Chrome Extension to work</div><br>
 | 
			
		||||
                    <div class="pure-form-message-inline">Restrict API access limit by using <code>x-api-key</code> header</div><br>
 | 
			
		||||
                    <div class="pure-form-message-inline"><br>API Key <span id="api-key">{{api_key}}</span>
 | 
			
		||||
                        <span style="display:none;" id="api-key-copy" >copy</span>
 | 
			
		||||
                    </div>
 | 
			
		||||
@@ -181,20 +181,6 @@ nav
 | 
			
		||||
                <div class="pure-control-group">
 | 
			
		||||
                    <a href="{{url_for('settings_reset_api_key')}}" class="pure-button button-small button-cancel">Regenerate API key</a>
 | 
			
		||||
                </div>
 | 
			
		||||
                <div class="pure-control-group">
 | 
			
		||||
                    <h4>Chrome Extension</h4>
 | 
			
		||||
                    <p>Easily add any web-page to your changedetection.io installation from within Chrome.</p>
 | 
			
		||||
                    <strong>Step 1</strong> Install the extension, <strong>Step 2</strong> Navigate to this page,
 | 
			
		||||
                    <strong>Step 3</strong> Open the extension from the toolbar and click "<i>Sync API Access</i>"
 | 
			
		||||
                    <p>
 | 
			
		||||
                        <a id="chrome-extension-link"
 | 
			
		||||
                           title="Try our new Chrome Extension!"
 | 
			
		||||
                           href="https://chromewebstore.google.com/detail/changedetectionio-website/kefcfmgmlhmankjmnbijimhofdjekbop">
 | 
			
		||||
                            <img src="{{ url_for('static_content', group='images', filename='Google-Chrome-icon.png') }}">
 | 
			
		||||
                            Chrome Webstore
 | 
			
		||||
                        </a>
 | 
			
		||||
                    </p>
 | 
			
		||||
                </div>
 | 
			
		||||
            </div>
 | 
			
		||||
            <div class="tab-pane-inner" id="proxies">
 | 
			
		||||
                <div id="recommended-proxy">
 | 
			
		||||
 
 | 
			
		||||
@@ -1,6 +1,6 @@
 | 
			
		||||
{% extends 'base.html' %}
 | 
			
		||||
{% block content %}
 | 
			
		||||
{% from '_helpers.jinja' import render_simple_field, render_field, render_nolabel_field, sort_by_title %}
 | 
			
		||||
{% from '_helpers.jinja' import render_simple_field, render_field, render_nolabel_field %}
 | 
			
		||||
<script src="{{url_for('static_content', group='js', filename='jquery-3.6.0.min.js')}}"></script>
 | 
			
		||||
<script src="{{url_for('static_content', group='js', filename='watch-overview.js')}}" defer></script>
 | 
			
		||||
 | 
			
		||||
@@ -13,7 +13,7 @@
 | 
			
		||||
            <div id="watch-add-wrapper-zone">
 | 
			
		||||
 | 
			
		||||
                    {{ render_nolabel_field(form.url, placeholder="https://...", required=true) }}
 | 
			
		||||
                    {{ render_nolabel_field(form.tags, value=active_tag.title if active_tag else '', placeholder="watch label / tag") }}
 | 
			
		||||
                    {{ render_nolabel_field(form.tags, value=tags[active_tag].title if active_tag else '', placeholder="watch label / tag") }}
 | 
			
		||||
                    {{ render_nolabel_field(form.watch_submit_button, title="Watch this URL!" ) }}
 | 
			
		||||
                    {{ render_nolabel_field(form.edit_and_watch_submit_button, title="Edit first then Watch") }}
 | 
			
		||||
            </div>
 | 
			
		||||
@@ -37,7 +37,6 @@
 | 
			
		||||
        <button class="pure-button button-secondary button-xsmall" name="op" value="assign-tag" id="checkbox-assign-tag">Tag</button>
 | 
			
		||||
        <button class="pure-button button-secondary button-xsmall" name="op" value="mark-viewed">Mark viewed</button>
 | 
			
		||||
        <button class="pure-button button-secondary button-xsmall" name="op" value="notification-default">Use default notification</button>
 | 
			
		||||
        <button class="pure-button button-secondary button-xsmall" name="op" value="clear-errors">Clear errors</button>
 | 
			
		||||
        <button class="pure-button button-secondary button-xsmall" style="background: #dd4242;" name="op" value="clear-history">Clear/reset history</button>
 | 
			
		||||
        <button class="pure-button button-secondary button-xsmall" style="background: #dd4242;" name="op" value="delete">Delete</button>
 | 
			
		||||
    </div>
 | 
			
		||||
@@ -47,13 +46,11 @@
 | 
			
		||||
    {% if search_q %}<div id="search-result-info">Searching "<strong><i>{{search_q}}</i></strong>"</div>{% endif %}
 | 
			
		||||
    <div>
 | 
			
		||||
        <a href="{{url_for('index')}}" class="pure-button button-tag {{'active' if not active_tag }}">All</a>
 | 
			
		||||
 | 
			
		||||
    <!-- tag list -->
 | 
			
		||||
    {% for uuid, tag in tags %}
 | 
			
		||||
        {% if tag != "" %}
 | 
			
		||||
            <a href="{{url_for('index', tag=uuid) }}" class="pure-button button-tag {{'active' if active_tag_uuid == uuid }}">{{ tag.title }}</a>
 | 
			
		||||
        {% endif %}
 | 
			
		||||
    {% endfor %}
 | 
			
		||||
        {% for uuid, tag in tags.items() %}
 | 
			
		||||
            {% if tag != "" %}
 | 
			
		||||
                <a href="{{url_for('index', tag=uuid) }}" class="pure-button button-tag {{'active' if active_tag == uuid }}">{{ tag.title }}</a>
 | 
			
		||||
            {% endif %}
 | 
			
		||||
        {% endfor %}
 | 
			
		||||
    </div>
 | 
			
		||||
 | 
			
		||||
    {% set sort_order = sort_order or 'asc' %}
 | 
			
		||||
@@ -200,8 +197,8 @@
 | 
			
		||||
            </li>
 | 
			
		||||
            {% endif %}
 | 
			
		||||
            <li>
 | 
			
		||||
               <a href="{{ url_for('form_watch_checknow', tag=active_tag_uuid, with_errors=request.args.get('with_errors',0)) }}" class="pure-button button-tag ">Recheck
 | 
			
		||||
                all {% if active_tag_uuid %} in "{{active_tag.title}}"{%endif%}</a>
 | 
			
		||||
               <a href="{{ url_for('form_watch_checknow', tag=active_tag, with_errors=request.args.get('with_errors',0)) }}" class="pure-button button-tag ">Recheck
 | 
			
		||||
                all {% if active_tag%} in "{{tags[active_tag].title}}"{%endif%}</a>
 | 
			
		||||
            </li>
 | 
			
		||||
            <li>
 | 
			
		||||
                <a href="{{ url_for('rss', tag=active_tag , token=app_rss_token)}}"><img alt="RSS Feed" id="feed-icon" src="{{url_for('static_content', group='images', filename='Generic_Feed-icon.svg')}}" height="15"></a>
 | 
			
		||||
 
 | 
			
		||||
@@ -7,11 +7,10 @@ from ..util import live_server_setup, wait_for_all_checks
 | 
			
		||||
def do_test(client, live_server, make_test_use_extra_browser=False):
 | 
			
		||||
 | 
			
		||||
    # Grep for this string in the logs?
 | 
			
		||||
    test_url = f"https://changedetection.io/ci-test.html?non-custom-default=true"
 | 
			
		||||
    # "non-custom-default" should not appear in the custom browser connection
 | 
			
		||||
    test_url = f"https://changedetection.io/ci-test.html"
 | 
			
		||||
    custom_browser_name = 'custom browser URL'
 | 
			
		||||
 | 
			
		||||
    # needs to be set and something like 'ws://127.0.0.1:3000'
 | 
			
		||||
    # needs to be set and something like 'ws://127.0.0.1:3000?stealth=1&--disable-web-security=true'
 | 
			
		||||
    assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
 | 
			
		||||
 | 
			
		||||
    #####################
 | 
			
		||||
@@ -20,7 +19,9 @@ def do_test(client, live_server, make_test_use_extra_browser=False):
 | 
			
		||||
        data={"application-empty_pages_are_a_change": "",
 | 
			
		||||
              "requests-time_between_check-minutes": 180,
 | 
			
		||||
              'application-fetch_backend': "html_webdriver",
 | 
			
		||||
              'requests-extra_browsers-0-browser_connection_url': 'ws://sockpuppetbrowser-custom-url:3000',
 | 
			
		||||
              # browserless-custom-url is setup in  .github/workflows/test-only.yml
 | 
			
		||||
              # the test script run_custom_browser_url_test.sh will look for 'custom-browser-search-string' in the container logs
 | 
			
		||||
              'requests-extra_browsers-0-browser_connection_url': 'ws://browserless-custom-url:3000?stealth=1&--disable-web-security=true&custom-browser-search-string=1',
 | 
			
		||||
              'requests-extra_browsers-0-browser_name': custom_browser_name
 | 
			
		||||
              },
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
@@ -50,8 +51,7 @@ def do_test(client, live_server, make_test_use_extra_browser=False):
 | 
			
		||||
        res = client.post(
 | 
			
		||||
            url_for("edit_page", uuid="first"),
 | 
			
		||||
            data={
 | 
			
		||||
                # 'run_customer_browser_url_tests.sh' will search for this string to know if we hit the right browser container or not
 | 
			
		||||
                  "url": f"https://changedetection.io/ci-test.html?custom-browser-search-string=1",
 | 
			
		||||
                  "url": test_url,
 | 
			
		||||
                  "tags": "",
 | 
			
		||||
                  "headers": "",
 | 
			
		||||
                  'fetch_backend': f"extra_browser_{custom_browser_name}",
 | 
			
		||||
 
 | 
			
		||||
@@ -1,56 +0,0 @@
 | 
			
		||||
import os
 | 
			
		||||
from flask import url_for
 | 
			
		||||
from ..util import live_server_setup, wait_for_all_checks, extract_UUID_from_client
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_execute_custom_js(client, live_server):
 | 
			
		||||
 | 
			
		||||
    live_server_setup(live_server)
 | 
			
		||||
    assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
 | 
			
		||||
 | 
			
		||||
    test_url = url_for('test_interactive_html_endpoint', _external=True)
 | 
			
		||||
    test_url = test_url.replace('localhost.localdomain', 'cdio')
 | 
			
		||||
    test_url = test_url.replace('localhost', 'cdio')
 | 
			
		||||
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("form_quick_watch_add"),
 | 
			
		||||
        data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    assert b"Watch added in Paused state, saving will unpause" in res.data
 | 
			
		||||
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("edit_page", uuid="first", unpause_on_save=1),
 | 
			
		||||
        data={
 | 
			
		||||
            "url": test_url,
 | 
			
		||||
            "tags": "",
 | 
			
		||||
            'fetch_backend': "html_webdriver",
 | 
			
		||||
            'webdriver_js_execute_code': 'document.querySelector("button[name=test-button]").click();',
 | 
			
		||||
            'headers': "testheader: yes\buser-agent: MyCustomAgent",
 | 
			
		||||
        },
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"unpaused" in res.data
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
 | 
			
		||||
    uuid = extract_UUID_from_client(client)
 | 
			
		||||
    assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n >= 1, "Watch history had atleast 1 (everything fetched OK)"
 | 
			
		||||
 | 
			
		||||
    assert b"This text should be removed" not in res.data
 | 
			
		||||
 | 
			
		||||
    # Check HTML conversion detected and workd
 | 
			
		||||
    res = client.get(
 | 
			
		||||
        url_for("preview_page", uuid=uuid),
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"This text should be removed" not in res.data
 | 
			
		||||
    assert b"I smell JavaScript because the button was pressed" in res.data
 | 
			
		||||
 | 
			
		||||
    assert b"testheader: yes" in res.data
 | 
			
		||||
    assert b"user-agent: mycustomagent" in res.data
 | 
			
		||||
 | 
			
		||||
    client.get(
 | 
			
		||||
        url_for("form_delete", uuid="all"),
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
@@ -1,6 +1,6 @@
 | 
			
		||||
#!/usr/bin/python3
 | 
			
		||||
 | 
			
		||||
import os
 | 
			
		||||
import time
 | 
			
		||||
from flask import url_for
 | 
			
		||||
from ..util import live_server_setup, wait_for_all_checks
 | 
			
		||||
 | 
			
		||||
@@ -9,20 +9,22 @@ def test_preferred_proxy(client, live_server):
 | 
			
		||||
    live_server_setup(live_server)
 | 
			
		||||
    url = "http://chosen.changedetection.io"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("form_quick_watch_add"),
 | 
			
		||||
        data={"url": url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
 | 
			
		||||
        url_for("import_page"),
 | 
			
		||||
        # Because a URL wont show in squid/proxy logs due it being SSLed
 | 
			
		||||
        # Use plain HTTP or a specific domain-name here
 | 
			
		||||
        data={"urls": url},
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"Watch added in Paused state, saving will unpause" in res.data
 | 
			
		||||
 | 
			
		||||
    assert b"1 Imported" in res.data
 | 
			
		||||
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("edit_page", uuid="first", unpause_on_save=1),
 | 
			
		||||
        url_for("edit_page", uuid="first"),
 | 
			
		||||
        data={
 | 
			
		||||
                "include_filters": "",
 | 
			
		||||
                "fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests',
 | 
			
		||||
                "fetch_backend": "html_requests",
 | 
			
		||||
                "headers": "",
 | 
			
		||||
                "proxy": "proxy-two",
 | 
			
		||||
                "tags": "",
 | 
			
		||||
@@ -30,6 +32,6 @@ def test_preferred_proxy(client, live_server):
 | 
			
		||||
              },
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"unpaused" in res.data
 | 
			
		||||
    assert b"Updated watch." in res.data
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
    # Now the request should appear in the second-squid logs
 | 
			
		||||
 
 | 
			
		||||
@@ -3,7 +3,6 @@
 | 
			
		||||
import time
 | 
			
		||||
from flask import url_for
 | 
			
		||||
from ..util import live_server_setup, wait_for_all_checks
 | 
			
		||||
import os
 | 
			
		||||
 | 
			
		||||
# just make a request, we will grep in the docker logs to see it actually got called
 | 
			
		||||
def test_select_custom(client, live_server):
 | 
			
		||||
@@ -15,7 +14,7 @@ def test_select_custom(client, live_server):
 | 
			
		||||
        data={
 | 
			
		||||
            "requests-time_between_check-minutes": 180,
 | 
			
		||||
            "application-ignore_whitespace": "y",
 | 
			
		||||
            "application-fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests',
 | 
			
		||||
            "application-fetch_backend": "html_requests",
 | 
			
		||||
            "requests-extra_proxies-0-proxy_name": "custom-test-proxy",
 | 
			
		||||
            # test:awesome is set in tests/proxy_list/squid-passwords.txt
 | 
			
		||||
            "requests-extra_proxies-0-proxy_url": "http://test:awesome@squid-custom:3128",
 | 
			
		||||
 
 | 
			
		||||
@@ -95,7 +95,7 @@ def test_restock_detection(client, live_server):
 | 
			
		||||
 | 
			
		||||
    # We should have a notification
 | 
			
		||||
    time.sleep(2)
 | 
			
		||||
    assert os.path.isfile("test-datastore/notification.txt"), "Notification received"
 | 
			
		||||
    assert os.path.isfile("test-datastore/notification.txt")
 | 
			
		||||
    os.unlink("test-datastore/notification.txt")
 | 
			
		||||
 | 
			
		||||
    # Default behaviour is to only fire notification when it goes OUT OF STOCK -> IN STOCK
 | 
			
		||||
@@ -103,9 +103,4 @@ def test_restock_detection(client, live_server):
 | 
			
		||||
    set_original_response()
 | 
			
		||||
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
    assert not os.path.isfile("test-datastore/notification.txt"), "No notification should have fired when it went OUT OF STOCK by default"
 | 
			
		||||
 | 
			
		||||
    # BUT we should see that it correctly shows "not in stock"
 | 
			
		||||
    res = client.get(url_for("index"))
 | 
			
		||||
    assert b'not-in-stock' in res.data, "Correctly showing NOT IN STOCK in the list after it changed from IN STOCK"
 | 
			
		||||
 | 
			
		||||
    assert not os.path.isfile("test-datastore/notification.txt")
 | 
			
		||||
 
 | 
			
		||||
@@ -29,7 +29,7 @@ def test_check_basic_change_detection_functionality(client, live_server):
 | 
			
		||||
 | 
			
		||||
    assert b"1 Imported" in res.data
 | 
			
		||||
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
    time.sleep(sleep_time_for_fetch_thread)
 | 
			
		||||
 | 
			
		||||
    # Do this a few times.. ensures we dont accidently set the status
 | 
			
		||||
    for n in range(3):
 | 
			
		||||
 
 | 
			
		||||
@@ -3,7 +3,7 @@
 | 
			
		||||
import time
 | 
			
		||||
 | 
			
		||||
from flask import url_for
 | 
			
		||||
from .util import live_server_setup, wait_for_all_checks
 | 
			
		||||
from . util import live_server_setup
 | 
			
		||||
 | 
			
		||||
from ..html_tools import *
 | 
			
		||||
 | 
			
		||||
@@ -30,7 +30,7 @@ def _runner_test_http_errors(client, live_server, http_code, expected_text):
 | 
			
		||||
    assert b"1 Imported" in res.data
 | 
			
		||||
 | 
			
		||||
    # Give the thread time to pick it up
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
    time.sleep(2)
 | 
			
		||||
 | 
			
		||||
    res = client.get(url_for("index"))
 | 
			
		||||
    # no change
 | 
			
		||||
@@ -57,7 +57,7 @@ def _runner_test_http_errors(client, live_server, http_code, expected_text):
 | 
			
		||||
def test_http_error_handler(client, live_server):
 | 
			
		||||
    _runner_test_http_errors(client, live_server, 403, 'Access denied')
 | 
			
		||||
    _runner_test_http_errors(client, live_server, 404, 'Page not found')
 | 
			
		||||
    _runner_test_http_errors(client, live_server, 500, '(Internal server error) received')
 | 
			
		||||
    _runner_test_http_errors(client, live_server, 500, '(Internal server Error) received')
 | 
			
		||||
    _runner_test_http_errors(client, live_server, 400, 'Error - Request returned a HTTP error code 400')
 | 
			
		||||
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
 | 
			
		||||
    assert b'Deleted' in res.data
 | 
			
		||||
@@ -76,7 +76,7 @@ def test_DNS_errors(client, live_server):
 | 
			
		||||
    assert b"1 Imported" in res.data
 | 
			
		||||
 | 
			
		||||
    # Give the thread time to pick it up
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
    time.sleep(3)
 | 
			
		||||
 | 
			
		||||
    res = client.get(url_for("index"))
 | 
			
		||||
    found_name_resolution_error = b"Temporary failure in name resolution" in res.data or b"Name or service not known" in res.data
 | 
			
		||||
@@ -104,7 +104,7 @@ def test_low_level_errors_clear_correctly(client, live_server):
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"1 Imported" in res.data
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
    time.sleep(2)
 | 
			
		||||
 | 
			
		||||
    # We should see the DNS error
 | 
			
		||||
    res = client.get(url_for("index"))
 | 
			
		||||
@@ -121,7 +121,7 @@ def test_low_level_errors_clear_correctly(client, live_server):
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    # Now the error should be gone
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
    time.sleep(2)
 | 
			
		||||
    res = client.get(url_for("index"))
 | 
			
		||||
    found_name_resolution_error = b"Temporary failure in name resolution" in res.data or b"Name or service not known" in res.data
 | 
			
		||||
    assert not found_name_resolution_error
 | 
			
		||||
 
 | 
			
		||||
@@ -100,12 +100,6 @@ def test_setup_group_tag(client, live_server):
 | 
			
		||||
    assert b'Should be only this' in res.data
 | 
			
		||||
    assert b'And never this' not in res.data
 | 
			
		||||
 | 
			
		||||
    res = client.get(
 | 
			
		||||
        url_for("edit_page", uuid="first"),
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    # 2307 the UI notice should appear in the placeholder
 | 
			
		||||
    assert b'WARNING: Watch has tag/groups set with special filters' in res.data
 | 
			
		||||
 | 
			
		||||
    # RSS Group tag filter
 | 
			
		||||
    # An extra one that should be excluded
 | 
			
		||||
@@ -327,154 +321,3 @@ def test_clone_tag_on_quickwatchform_add(client, live_server):
 | 
			
		||||
 | 
			
		||||
    res = client.get(url_for("tags.delete_all"), follow_redirects=True)
 | 
			
		||||
    assert b'All tags deleted' in res.data
 | 
			
		||||
 | 
			
		||||
def test_order_of_filters_tag_filter_and_watch_filter(client, live_server):
 | 
			
		||||
 | 
			
		||||
    # Add a tag with some config, import a tag and it should roughly work
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("tags.form_tag_add"),
 | 
			
		||||
        data={"name": "test-tag-keep-order"},
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"Tag added" in res.data
 | 
			
		||||
    assert b"test-tag-keep-order" in res.data
 | 
			
		||||
    tag_filters = [
 | 
			
		||||
            '#only-this', # duplicated filters
 | 
			
		||||
            '#only-this',
 | 
			
		||||
            '#only-this',
 | 
			
		||||
            '#only-this',
 | 
			
		||||
            ]
 | 
			
		||||
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("tags.form_tag_edit_submit", uuid="first"),
 | 
			
		||||
        data={"name": "test-tag-keep-order",
 | 
			
		||||
              "include_filters": '\n'.join(tag_filters) },
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"Updated" in res.data
 | 
			
		||||
    tag_uuid = get_UUID_for_tag_name(client, name="test-tag-keep-order")
 | 
			
		||||
    res = client.get(
 | 
			
		||||
        url_for("tags.form_tag_edit", uuid="first")
 | 
			
		||||
    )
 | 
			
		||||
    assert b"#only-this" in res.data
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    d = """<html>
 | 
			
		||||
       <body>
 | 
			
		||||
     Some initial text<br>
 | 
			
		||||
     <p id="only-this">And 1 this</p>
 | 
			
		||||
     <br>
 | 
			
		||||
     <p id="not-this">And 2 this</p>
 | 
			
		||||
     <p id="">And 3 this</p><!--/html/body/p[3]/-->
 | 
			
		||||
     <p id="">And 4 this</p><!--/html/body/p[4]/-->
 | 
			
		||||
     <p id="">And 5 this</p><!--/html/body/p[5]/-->
 | 
			
		||||
     <p id="">And 6 this</p><!--/html/body/p[6]/-->
 | 
			
		||||
     <p id="">And 7 this</p><!--/html/body/p[7]/-->
 | 
			
		||||
     <p id="">And 8 this</p><!--/html/body/p[8]/-->
 | 
			
		||||
     <p id="">And 9 this</p><!--/html/body/p[9]/-->
 | 
			
		||||
     <p id="">And 10 this</p><!--/html/body/p[10]/-->
 | 
			
		||||
     <p id="">And 11 this</p><!--/html/body/p[11]/-->
 | 
			
		||||
     <p id="">And 12 this</p><!--/html/body/p[12]/-->
 | 
			
		||||
     <p id="">And 13 this</p><!--/html/body/p[13]/-->
 | 
			
		||||
     <p id="">And 14 this</p><!--/html/body/p[14]/-->
 | 
			
		||||
     <p id="not-this">And 15 this</p><!--/html/body/p[15]/-->
 | 
			
		||||
     </body>
 | 
			
		||||
     </html>
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    with open("test-datastore/endpoint-content.txt", "w") as f:
 | 
			
		||||
        f.write(d)
 | 
			
		||||
 | 
			
		||||
    test_url = url_for('test_endpoint', _external=True)
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("import_page"),
 | 
			
		||||
        data={"urls": test_url},
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"1 Imported" in res.data
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
 | 
			
		||||
    filters = [
 | 
			
		||||
            '/html/body/p[3]',
 | 
			
		||||
            '/html/body/p[4]',
 | 
			
		||||
            '/html/body/p[5]',
 | 
			
		||||
            '/html/body/p[6]',
 | 
			
		||||
            '/html/body/p[7]',
 | 
			
		||||
            '/html/body/p[8]',
 | 
			
		||||
            '/html/body/p[9]',
 | 
			
		||||
            '/html/body/p[10]',
 | 
			
		||||
            '/html/body/p[11]',
 | 
			
		||||
            '/html/body/p[12]',
 | 
			
		||||
            '/html/body/p[13]', # duplicated tags
 | 
			
		||||
            '/html/body/p[13]',
 | 
			
		||||
            '/html/body/p[13]',
 | 
			
		||||
            '/html/body/p[13]',
 | 
			
		||||
            '/html/body/p[13]',
 | 
			
		||||
            '/html/body/p[14]',
 | 
			
		||||
            ]
 | 
			
		||||
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("edit_page", uuid="first"),
 | 
			
		||||
        data={"include_filters": '\n'.join(filters),
 | 
			
		||||
            "url": test_url,
 | 
			
		||||
            "tags": "test-tag-keep-order",
 | 
			
		||||
            "headers": "",
 | 
			
		||||
            'fetch_backend': "html_requests"},
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"Updated watch." in res.data
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
 | 
			
		||||
    res = client.get(
 | 
			
		||||
        url_for("preview_page", uuid="first"),
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    assert b"And 1 this" in res.data  # test-tag-keep-order
 | 
			
		||||
 | 
			
		||||
    a_tag_filter_check = b'And 1 this' #'#only-this' of tag_filters
 | 
			
		||||
    # check there is no duplication of tag_filters
 | 
			
		||||
    assert res.data.count(a_tag_filter_check) == 1, f"duplicated filters didn't removed {res.data.count(a_tag_filter_check)} of {a_tag_filter_check} in {res.data=}"
 | 
			
		||||
 | 
			
		||||
    a_filter_check = b"And 13 this" # '/html/body/p[13]'
 | 
			
		||||
    # check there is no duplication of filters
 | 
			
		||||
    assert res.data.count(a_filter_check) == 1, f"duplicated filters didn't removed. {res.data.count(a_filter_check)} of {a_filter_check} in {res.data=}"
 | 
			
		||||
 | 
			
		||||
    a_filter_check_not_include = b"And 2 this" # '/html/body/p[2]'
 | 
			
		||||
    assert a_filter_check_not_include not in res.data
 | 
			
		||||
 | 
			
		||||
    checklist = [
 | 
			
		||||
            b"And 3 this",
 | 
			
		||||
            b"And 4 this",
 | 
			
		||||
            b"And 5 this",
 | 
			
		||||
            b"And 6 this",
 | 
			
		||||
            b"And 7 this",
 | 
			
		||||
            b"And 8 this",
 | 
			
		||||
            b"And 9 this",
 | 
			
		||||
            b"And 10 this",
 | 
			
		||||
            b"And 11 this",
 | 
			
		||||
            b"And 12 this",
 | 
			
		||||
            b"And 13 this",
 | 
			
		||||
            b"And 14 this",
 | 
			
		||||
            b"And 1 this", # result of filter from tag.
 | 
			
		||||
            ]
 | 
			
		||||
    # check whether everything a user requested is there
 | 
			
		||||
    for test in checklist:
 | 
			
		||||
        assert test in res.data
 | 
			
		||||
 | 
			
		||||
    # check whether everything a user requested is in order of filters.
 | 
			
		||||
    n = 0
 | 
			
		||||
    for test in checklist:
 | 
			
		||||
        t_index = res.data[n:].find(test)
 | 
			
		||||
        # if the text is not searched, return -1.
 | 
			
		||||
        assert t_index >= 0, f"""failed because {test=} not in {res.data[n:]=}
 | 
			
		||||
#####################
 | 
			
		||||
Looks like some feature changed the order of result of filters.
 | 
			
		||||
#####################
 | 
			
		||||
the {test} appeared before. {test in res.data[:n]=}
 | 
			
		||||
{res.data[:n]=}
 | 
			
		||||
        """
 | 
			
		||||
        n += t_index + len(test)
 | 
			
		||||
 | 
			
		||||
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
 | 
			
		||||
    assert b'Deleted' in res.data
 | 
			
		||||
 
 | 
			
		||||
@@ -456,7 +456,7 @@ def test_ignore_json_order(client, live_server):
 | 
			
		||||
 | 
			
		||||
def test_correct_header_detect(client, live_server):
 | 
			
		||||
    # Like in https://github.com/dgtlmoon/changedetection.io/pull/1593
 | 
			
		||||
    # Specify extra html that JSON is sometimes wrapped in - when using SockpuppetBrowser / Puppeteer / Playwrightetc
 | 
			
		||||
    # Specify extra html that JSON is sometimes wrapped in - when using Browserless/Puppeteer etc
 | 
			
		||||
    with open("test-datastore/endpoint-content.txt", "w") as f:
 | 
			
		||||
        f.write('<html><body>{"hello" : 123, "world": 123}')
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -29,8 +29,7 @@ def test_fetch_pdf(client, live_server):
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    # PDF header should not be there (it was converted to text)
 | 
			
		||||
    assert b'PDF' not in res.data[:10]
 | 
			
		||||
    assert b'PDF-1.5' not in res.data
 | 
			
		||||
    assert b'hello world' in res.data
 | 
			
		||||
 | 
			
		||||
    # So we know if the file changes in other ways
 | 
			
		||||
 
 | 
			
		||||
@@ -14,7 +14,7 @@ def test_headers_in_request(client, live_server):
 | 
			
		||||
    # Add our URL to the import page
 | 
			
		||||
    test_url = url_for('test_headers', _external=True)
 | 
			
		||||
    if os.getenv('PLAYWRIGHT_DRIVER_URL'):
 | 
			
		||||
        # Because its no longer calling back to localhost but from the browser container, set in test-only.yml
 | 
			
		||||
        # Because its no longer calling back to localhost but from browserless, set in test-only.yml
 | 
			
		||||
        test_url = test_url.replace('localhost', 'changedet')
 | 
			
		||||
 | 
			
		||||
    # Add the test URL twice, we will check
 | 
			
		||||
@@ -89,7 +89,7 @@ def test_body_in_request(client, live_server):
 | 
			
		||||
    # Add our URL to the import page
 | 
			
		||||
    test_url = url_for('test_body', _external=True)
 | 
			
		||||
    if os.getenv('PLAYWRIGHT_DRIVER_URL'):
 | 
			
		||||
        # Because its no longer calling back to localhost but from the browser container, set in test-only.yml
 | 
			
		||||
        # Because its no longer calling back to localhost but from browserless, set in test-only.yml
 | 
			
		||||
        test_url = test_url.replace('localhost', 'cdio')
 | 
			
		||||
 | 
			
		||||
    res = client.post(
 | 
			
		||||
@@ -181,7 +181,7 @@ def test_method_in_request(client, live_server):
 | 
			
		||||
    # Add our URL to the import page
 | 
			
		||||
    test_url = url_for('test_method', _external=True)
 | 
			
		||||
    if os.getenv('PLAYWRIGHT_DRIVER_URL'):
 | 
			
		||||
        # Because its no longer calling back to localhost but from the browser container, set in test-only.yml
 | 
			
		||||
        # Because its no longer calling back to localhost but from browserless, set in test-only.yml
 | 
			
		||||
        test_url = test_url.replace('localhost', 'cdio')
 | 
			
		||||
 | 
			
		||||
    # Add the test URL twice, we will check
 | 
			
		||||
@@ -258,7 +258,7 @@ def test_headers_textfile_in_request(client, live_server):
 | 
			
		||||
    # Add our URL to the import page
 | 
			
		||||
    test_url = url_for('test_headers', _external=True)
 | 
			
		||||
    if os.getenv('PLAYWRIGHT_DRIVER_URL'):
 | 
			
		||||
        # Because its no longer calling back to localhost but from the browser container, set in test-only.yml
 | 
			
		||||
        # Because its no longer calling back to localhost but from browserless, set in test-only.yml
 | 
			
		||||
        test_url = test_url.replace('localhost', 'cdio')
 | 
			
		||||
 | 
			
		||||
    print ("TEST URL IS ",test_url)
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,4 @@
 | 
			
		||||
# -*- coding: utf-8 -*-
 | 
			
		||||
#!/usr/bin/python3
 | 
			
		||||
 | 
			
		||||
import time
 | 
			
		||||
from flask import url_for
 | 
			
		||||
@@ -255,69 +255,6 @@ def test_xpath23_prefix_validation(client, live_server):
 | 
			
		||||
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
 | 
			
		||||
    assert b'Deleted' in res.data
 | 
			
		||||
 | 
			
		||||
def test_xpath1_lxml(client, live_server):
 | 
			
		||||
    #live_server_setup(live_server)
 | 
			
		||||
 | 
			
		||||
    d = '''<?xml version="1.0" encoding="UTF-8"?>
 | 
			
		||||
    <rss xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
 | 
			
		||||
    	<channel>
 | 
			
		||||
    		<title>rpilocator.com</title>
 | 
			
		||||
    		<link>https://rpilocator.com</link>
 | 
			
		||||
    		<description>Find Raspberry Pi Computers in Stock</description>
 | 
			
		||||
    		<lastBuildDate>Thu, 19 May 2022 23:27:30 GMT</lastBuildDate>
 | 
			
		||||
    		<image>
 | 
			
		||||
    			<url>https://rpilocator.com/favicon.png</url>
 | 
			
		||||
    			<title>rpilocator.com</title>
 | 
			
		||||
    			<link>https://rpilocator.com/</link>
 | 
			
		||||
    			<width>32</width>
 | 
			
		||||
    			<height>32</height>
 | 
			
		||||
    		</image>
 | 
			
		||||
    		<item>
 | 
			
		||||
    			<title>Stock Alert (UK): RPi CM4</title>
 | 
			
		||||
    			<foo>something else unrelated</foo>
 | 
			
		||||
    		</item>
 | 
			
		||||
    		<item>
 | 
			
		||||
    			<title>Stock Alert (UK): Big monitorěěěě</title>
 | 
			
		||||
    			<foo>something else unrelated</foo>
 | 
			
		||||
    		</item>		
 | 
			
		||||
    	</channel>
 | 
			
		||||
    </rss>'''.encode('utf-8')
 | 
			
		||||
 | 
			
		||||
    with open("test-datastore/endpoint-content.txt", "wb") as f:
 | 
			
		||||
        f.write(d)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    test_url = url_for('test_endpoint', _external=True)
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("import_page"),
 | 
			
		||||
        data={"urls": test_url},
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"1 Imported" in res.data
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("edit_page", uuid="first"),
 | 
			
		||||
        data={"include_filters": "xpath1://title/text()", "url": test_url, "tags": "", "headers": "",
 | 
			
		||||
              'fetch_backend': "html_requests"},
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    ##### #2312
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
    res = client.get(url_for("index"))
 | 
			
		||||
    assert b'_ElementStringResult' not in res.data # tested with 5.1.1 when it was removed and 5.1.0
 | 
			
		||||
    assert b'Exception' not in res.data
 | 
			
		||||
    res = client.get(
 | 
			
		||||
        url_for("preview_page", uuid="first"),
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    assert b"rpilocator.com" in res.data  # in selector
 | 
			
		||||
    assert "Stock Alert (UK): Big monitorěěěě".encode('utf-8') in res.data  # not in selector
 | 
			
		||||
 | 
			
		||||
    #####
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_xpath1_validation(client, live_server):
 | 
			
		||||
    # Add our URL to the import page
 | 
			
		||||
 
 | 
			
		||||
@@ -242,28 +242,5 @@ def live_server_setup(live_server):
 | 
			
		||||
            resp.headers['Content-Type'] = 'application/pdf'
 | 
			
		||||
            return resp
 | 
			
		||||
 | 
			
		||||
    @live_server.app.route('/test-interactive-html-endpoint')
 | 
			
		||||
    def test_interactive_html_endpoint():
 | 
			
		||||
        header_text=""
 | 
			
		||||
        for k,v in request.headers.items():
 | 
			
		||||
            header_text += f"{k}: {v}<br>"
 | 
			
		||||
 | 
			
		||||
        resp = make_response(f"""
 | 
			
		||||
        <html>
 | 
			
		||||
          <body>
 | 
			
		||||
          Primitive JS check for <pre>changedetectionio/tests/visualselector/test_fetch_data.py</pre>
 | 
			
		||||
            <p id="remove">This text should be removed</p>
 | 
			
		||||
              <form onsubmit="event.preventDefault();">
 | 
			
		||||
            <!-- obfuscated text so that we dont accidentally get a false positive due to conversion of the source :) --->
 | 
			
		||||
                <button name="test-button" onclick="getElementById('remove').remove();getElementById('some-content').innerHTML = atob('SSBzbWVsbCBKYXZhU2NyaXB0IGJlY2F1c2UgdGhlIGJ1dHRvbiB3YXMgcHJlc3NlZCE=')">Click here</button>
 | 
			
		||||
                <div id=some-content></div>
 | 
			
		||||
                <pre>
 | 
			
		||||
                {header_text.lower()}
 | 
			
		||||
                </pre>
 | 
			
		||||
              </body>
 | 
			
		||||
         </html>""", 200)
 | 
			
		||||
        resp.headers['Content-Type'] = 'text/html'
 | 
			
		||||
        return resp
 | 
			
		||||
 | 
			
		||||
    live_server.start()
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,5 +1,6 @@
 | 
			
		||||
#!/usr/bin/python3
 | 
			
		||||
 | 
			
		||||
import time
 | 
			
		||||
import os
 | 
			
		||||
from flask import url_for
 | 
			
		||||
from ..util import live_server_setup, wait_for_all_checks, extract_UUID_from_client
 | 
			
		||||
@@ -7,19 +8,15 @@ from ..util import live_server_setup, wait_for_all_checks, extract_UUID_from_cli
 | 
			
		||||
def test_setup(client, live_server):
 | 
			
		||||
    live_server_setup(live_server)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Add a site in paused mode, add an invalid filter, we should still have visual selector data ready
 | 
			
		||||
def test_visual_selector_content_ready(client, live_server):
 | 
			
		||||
 | 
			
		||||
    import os
 | 
			
		||||
    import json
 | 
			
		||||
 | 
			
		||||
    assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
 | 
			
		||||
 | 
			
		||||
    # Add our URL to the import page, because the docker container (playwright/selenium) wont be able to connect to our usual test url
 | 
			
		||||
    test_url = url_for('test_interactive_html_endpoint', _external=True)
 | 
			
		||||
    test_url = test_url.replace('localhost.localdomain', 'cdio')
 | 
			
		||||
    test_url = test_url.replace('localhost', 'cdio')
 | 
			
		||||
    test_url = "https://changedetection.io/ci-test/test-runjs.html"
 | 
			
		||||
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("form_quick_watch_add"),
 | 
			
		||||
@@ -27,31 +24,28 @@ def test_visual_selector_content_ready(client, live_server):
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"Watch added in Paused state, saving will unpause" in res.data
 | 
			
		||||
    uuid = extract_UUID_from_client(client)
 | 
			
		||||
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("edit_page", uuid=uuid, unpause_on_save=1),
 | 
			
		||||
        url_for("edit_page", uuid="first", unpause_on_save=1),
 | 
			
		||||
        data={
 | 
			
		||||
            "url": test_url,
 | 
			
		||||
            "tags": "",
 | 
			
		||||
            # For now, cookies doesnt work in headers because it must be a full cookiejar object
 | 
			
		||||
            'headers': "testheader: yes\buser-agent: MyCustomAgent",
 | 
			
		||||
            'fetch_backend': "html_webdriver",
 | 
			
		||||
              "url": test_url,
 | 
			
		||||
              "tags": "",
 | 
			
		||||
              "headers": "",
 | 
			
		||||
              'fetch_backend': "html_webdriver",
 | 
			
		||||
              'webdriver_js_execute_code': 'document.querySelector("button[name=test-button]").click();'
 | 
			
		||||
        },
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"unpaused" in res.data
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
    uuid = extract_UUID_from_client(client)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n >= 1, "Watch history had atleast 1 (everything fetched OK)"
 | 
			
		||||
 | 
			
		||||
    # Check the JS execute code before extract worked
 | 
			
		||||
    res = client.get(
 | 
			
		||||
        url_for("preview_page", uuid=uuid),
 | 
			
		||||
        url_for("preview_page", uuid="first"),
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"testheader: yes" in res.data
 | 
			
		||||
    assert b"user-agent: mycustomagent" in res.data
 | 
			
		||||
 | 
			
		||||
    assert b'I smell JavaScript' in res.data
 | 
			
		||||
 | 
			
		||||
    assert os.path.isfile(os.path.join('test-datastore', uuid, 'last-screenshot.png')), "last-screenshot.png should exist"
 | 
			
		||||
    assert os.path.isfile(os.path.join('test-datastore', uuid, 'elements.json')), "xpath elements.json data should exist"
 | 
			
		||||
@@ -81,33 +75,30 @@ def test_visual_selector_content_ready(client, live_server):
 | 
			
		||||
 | 
			
		||||
def test_basic_browserstep(client, live_server):
 | 
			
		||||
 | 
			
		||||
    #live_server_setup(live_server)
 | 
			
		||||
    assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
 | 
			
		||||
    #live_server_setup(live_server)
 | 
			
		||||
 | 
			
		||||
    test_url = url_for('test_interactive_html_endpoint', _external=True)
 | 
			
		||||
    test_url = test_url.replace('localhost.localdomain', 'cdio')
 | 
			
		||||
    test_url = test_url.replace('localhost', 'cdio')
 | 
			
		||||
    # Add our URL to the import page, because the docker container (playwright/selenium) wont be able to connect to our usual test url
 | 
			
		||||
    test_url = "https://changedetection.io/ci-test/test-runjs.html"
 | 
			
		||||
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("form_quick_watch_add"),
 | 
			
		||||
        data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    assert b"Watch added in Paused state, saving will unpause" in res.data
 | 
			
		||||
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("edit_page", uuid="first", unpause_on_save=1),
 | 
			
		||||
        data={
 | 
			
		||||
            "url": test_url,
 | 
			
		||||
            "tags": "",
 | 
			
		||||
            'fetch_backend': "html_webdriver",
 | 
			
		||||
            'browser_steps-0-operation': 'Goto site',
 | 
			
		||||
            'browser_steps-1-operation': 'Click element',
 | 
			
		||||
            'browser_steps-1-selector': 'button[name=test-button]',
 | 
			
		||||
            'browser_steps-1-optional_value': '',
 | 
			
		||||
            # For now, cookies doesnt work in headers because it must be a full cookiejar object
 | 
			
		||||
            'headers': "testheader: yes\buser-agent: MyCustomAgent",
 | 
			
		||||
              "url": test_url,
 | 
			
		||||
              "tags": "",
 | 
			
		||||
              "headers": "",
 | 
			
		||||
              'fetch_backend': "html_webdriver",
 | 
			
		||||
              'browser_steps-0-operation': 'Goto site',
 | 
			
		||||
              'browser_steps-1-operation': 'Click element',
 | 
			
		||||
              'browser_steps-1-selector': 'button[name=test-button]',
 | 
			
		||||
              'browser_steps-1-optional_value': ''
 | 
			
		||||
        },
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
@@ -115,9 +106,6 @@ def test_basic_browserstep(client, live_server):
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
 | 
			
		||||
    uuid = extract_UUID_from_client(client)
 | 
			
		||||
    assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n >= 1, "Watch history had atleast 1 (everything fetched OK)"
 | 
			
		||||
 | 
			
		||||
    assert b"This text should be removed" not in res.data
 | 
			
		||||
 | 
			
		||||
    # Check HTML conversion detected and workd
 | 
			
		||||
    res = client.get(
 | 
			
		||||
@@ -127,19 +115,13 @@ def test_basic_browserstep(client, live_server):
 | 
			
		||||
    assert b"This text should be removed" not in res.data
 | 
			
		||||
    assert b"I smell JavaScript because the button was pressed" in res.data
 | 
			
		||||
 | 
			
		||||
    assert b"testheader: yes" in res.data
 | 
			
		||||
    assert b"user-agent: mycustomagent" in res.data
 | 
			
		||||
 | 
			
		||||
    four_o_four_url =  url_for('test_endpoint', status_code=404, _external=True)
 | 
			
		||||
    four_o_four_url = four_o_four_url.replace('localhost.localdomain', 'cdio')
 | 
			
		||||
    four_o_four_url = four_o_four_url.replace('localhost', 'cdio')
 | 
			
		||||
 | 
			
		||||
    # now test for 404 errors
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("edit_page", uuid=uuid, unpause_on_save=1),
 | 
			
		||||
        data={
 | 
			
		||||
              "url": four_o_four_url,
 | 
			
		||||
              "url": "https://changedetection.io/404",
 | 
			
		||||
              "tags": "",
 | 
			
		||||
              "headers": "",
 | 
			
		||||
              'fetch_backend': "html_webdriver",
 | 
			
		||||
              'browser_steps-0-operation': 'Goto site',
 | 
			
		||||
              'browser_steps-1-operation': 'Click element',
 | 
			
		||||
 
 | 
			
		||||
@@ -2,8 +2,8 @@ import os
 | 
			
		||||
import threading
 | 
			
		||||
import queue
 | 
			
		||||
import time
 | 
			
		||||
from . import content_fetchers
 | 
			
		||||
from changedetectionio import html_tools
 | 
			
		||||
 | 
			
		||||
from changedetectionio import content_fetcher, html_tools
 | 
			
		||||
from .processors.text_json_diff import FilterNotFoundInResponse
 | 
			
		||||
from .processors.restock_diff import UnableToExtractRestockData
 | 
			
		||||
 | 
			
		||||
@@ -290,7 +290,7 @@ class update_worker(threading.Thread):
 | 
			
		||||
                        logger.critical(f"File permission error updating file, watch: {uuid}")
 | 
			
		||||
                        logger.critical(str(e))
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
                    except content_fetchers.exceptions.ReplyWithContentButNoText as e:
 | 
			
		||||
                    except content_fetcher.ReplyWithContentButNoText as e:
 | 
			
		||||
                        # Totally fine, it's by choice - just continue on, nothing more to care about
 | 
			
		||||
                        # Page had elements/content but no renderable text
 | 
			
		||||
                        # Backend (not filters) gave zero output
 | 
			
		||||
@@ -312,15 +312,13 @@ class update_worker(threading.Thread):
 | 
			
		||||
                            self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot)
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
 | 
			
		||||
                    except content_fetchers.exceptions.Non200ErrorCodeReceived as e:
 | 
			
		||||
                    except content_fetcher.Non200ErrorCodeReceived as e:
 | 
			
		||||
                        if e.status_code == 403:
 | 
			
		||||
                            err_text = "Error - 403 (Access denied) received"
 | 
			
		||||
                        elif e.status_code == 404:
 | 
			
		||||
                            err_text = "Error - 404 (Page not found) received"
 | 
			
		||||
                        elif e.status_code == 407:
 | 
			
		||||
                            err_text = "Error - 407 (Proxy authentication required) received, did you need a username and password for the proxy?"
 | 
			
		||||
                        elif e.status_code == 500:
 | 
			
		||||
                            err_text = "Error - 500 (Internal server error) received from the web site"
 | 
			
		||||
                            err_text = "Error - 500 (Internal server Error) received"
 | 
			
		||||
                        else:
 | 
			
		||||
                            err_text = "Error - Request returned a HTTP error code {}".format(str(e.status_code))
 | 
			
		||||
 | 
			
		||||
@@ -358,24 +356,13 @@ class update_worker(threading.Thread):
 | 
			
		||||
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
 | 
			
		||||
                    except content_fetchers.exceptions.checksumFromPreviousCheckWasTheSame as e:
 | 
			
		||||
                    except content_fetcher.checksumFromPreviousCheckWasTheSame as e:
 | 
			
		||||
                        # Yes fine, so nothing todo, don't continue to process.
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
                        changed_detected = False
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': False})
 | 
			
		||||
                    except content_fetchers.exceptions.BrowserConnectError as e:
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid,
 | 
			
		||||
                                                    update_obj={'last_error': e.msg
 | 
			
		||||
                                                                }
 | 
			
		||||
                                                    )
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
                    except content_fetchers.exceptions.BrowserFetchTimedOut as e:
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid,
 | 
			
		||||
                                                    update_obj={'last_error': e.msg
 | 
			
		||||
                                                                }
 | 
			
		||||
                                                    )
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
                    except content_fetchers.exceptions.BrowserStepsStepException as e:
 | 
			
		||||
 | 
			
		||||
                    except content_fetcher.BrowserStepsStepException as e:
 | 
			
		||||
 | 
			
		||||
                        if not self.datastore.data['watching'].get(uuid):
 | 
			
		||||
                            continue
 | 
			
		||||
@@ -417,25 +404,25 @@ class update_worker(threading.Thread):
 | 
			
		||||
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
 | 
			
		||||
                    except content_fetchers.exceptions.EmptyReply as e:
 | 
			
		||||
                    except content_fetcher.EmptyReply as e:
 | 
			
		||||
                        # Some kind of custom to-str handler in the exception handler that does this?
 | 
			
		||||
                        err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
 | 
			
		||||
                                                                           'last_check_status': e.status_code})
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
                    except content_fetchers.exceptions.ScreenshotUnavailable as e:
 | 
			
		||||
                        err_text = "Screenshot unavailable, page did not render fully in the expected time or page was too long - try increasing 'Wait seconds before extracting text'"
 | 
			
		||||
                    except content_fetcher.ScreenshotUnavailable as e:
 | 
			
		||||
                        err_text = "Screenshot unavailable, page did not render fully in the expected time - try increasing 'Wait seconds before extracting text'"
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
 | 
			
		||||
                                                                           'last_check_status': e.status_code})
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
                    except content_fetchers.exceptions.JSActionExceptions as e:
 | 
			
		||||
                    except content_fetcher.JSActionExceptions as e:
 | 
			
		||||
                        err_text = "Error running JS Actions - Page request - "+e.message
 | 
			
		||||
                        if e.screenshot:
 | 
			
		||||
                            self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot, as_error=True)
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
 | 
			
		||||
                                                                           'last_check_status': e.status_code})
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
                    except content_fetchers.exceptions.PageUnloadable as e:
 | 
			
		||||
                    except content_fetcher.PageUnloadable as e:
 | 
			
		||||
                        err_text = "Page request from server didnt respond correctly"
 | 
			
		||||
                        if e.message:
 | 
			
		||||
                            err_text = "{} - {}".format(err_text, e.message)
 | 
			
		||||
@@ -447,7 +434,7 @@ class update_worker(threading.Thread):
 | 
			
		||||
                                                                           'last_check_status': e.status_code,
 | 
			
		||||
                                                                           'has_ldjson_price_data': None})
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
                    except content_fetchers.exceptions.BrowserStepsInUnsupportedFetcher as e:
 | 
			
		||||
                    except content_fetcher.BrowserStepsInUnsupportedFetcher as e:
 | 
			
		||||
                        err_text = "This watch has Browser Steps configured and so it cannot run with the 'Basic fast Plaintext/HTTP Client', either remove the Browser Steps or select a Chrome fetcher."
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
@@ -462,7 +449,7 @@ class update_worker(threading.Thread):
 | 
			
		||||
                    except Exception as e:
 | 
			
		||||
                        logger.error(f"Exception reached processing watch UUID: {uuid}")
 | 
			
		||||
                        logger.error(str(e))
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Exception: " + str(e)})
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
 | 
			
		||||
                        # Other serious error
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
#                        import traceback
 | 
			
		||||
 
 | 
			
		||||
@@ -30,7 +30,7 @@ services:
 | 
			
		||||
  #             https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
 | 
			
		||||
  #
 | 
			
		||||
  #       Alternative Playwright URL, do not use "'s or 's!
 | 
			
		||||
  #      - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000
 | 
			
		||||
  #      - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000/?stealth=1&--disable-web-security=true
 | 
			
		||||
  #
 | 
			
		||||
  #       Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
 | 
			
		||||
  #
 | 
			
		||||
@@ -71,23 +71,32 @@ services:
 | 
			
		||||
#            condition: service_started
 | 
			
		||||
 | 
			
		||||
     # Used for fetching pages via Playwright+Chrome where you need Javascript support.
 | 
			
		||||
     # Note: Playwright/browserless not supported on ARM type devices (rPi etc)
 | 
			
		||||
     # RECOMMENDED FOR FETCHING PAGES WITH CHROME
 | 
			
		||||
#    playwright-chrome:
 | 
			
		||||
#        hostname: playwright-chrome
 | 
			
		||||
#        image: dgtlmoon/sockpuppetbrowser:latest
 | 
			
		||||
#        cap_add:
 | 
			
		||||
#            - SYS_ADMIN
 | 
			
		||||
## SYS_ADMIN might be too much, but it can be needed on your platform https://github.com/puppeteer/puppeteer/blob/main/docs/troubleshooting.md#running-puppeteer-on-gitlabci
 | 
			
		||||
#        image: browserless/chrome:1.60-chrome-stable
 | 
			
		||||
#        restart: unless-stopped
 | 
			
		||||
#        environment:
 | 
			
		||||
#            - SCREEN_WIDTH=1920
 | 
			
		||||
#            - SCREEN_HEIGHT=1024
 | 
			
		||||
#            - SCREEN_DEPTH=16
 | 
			
		||||
#            - MAX_CONCURRENT_CHROME_PROCESSES=10
 | 
			
		||||
#            - ENABLE_DEBUGGER=false
 | 
			
		||||
#            - PREBOOT_CHROME=true
 | 
			
		||||
#            - CONNECTION_TIMEOUT=300000
 | 
			
		||||
#            - MAX_CONCURRENT_SESSIONS=10
 | 
			
		||||
#            - CHROME_REFRESH_TIME=600000
 | 
			
		||||
#            - DEFAULT_BLOCK_ADS=true
 | 
			
		||||
#            - DEFAULT_STEALTH=true
 | 
			
		||||
#
 | 
			
		||||
#             Ignore HTTPS errors, like for self-signed certs
 | 
			
		||||
#            - DEFAULT_IGNORE_HTTPS_ERRORS=true
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
     # Used for fetching pages via Playwright+Chrome where you need Javascript support.
 | 
			
		||||
     # Note: Works well but is deprecated, does not fetch full page screenshots (doesnt work with Visual Selector)
 | 
			
		||||
     #       Does not report status codes (200, 404, 403) and other issues
 | 
			
		||||
     # More information about the advantages of playwright/browserless https://www.browserless.io/blog/2023/12/13/migrating-selenium-to-playwright/
 | 
			
		||||
#    browser-chrome:
 | 
			
		||||
#        hostname: browser-chrome
 | 
			
		||||
#        image: selenium/standalone-chrome:4
 | 
			
		||||
 
 | 
			
		||||
										
											Binary file not shown.
										
									
								
							| 
		 Before Width: | Height: | Size: 125 KiB  | 
@@ -1,7 +1,4 @@
 | 
			
		||||
# Used by Pyppeteer
 | 
			
		||||
pyee
 | 
			
		||||
 | 
			
		||||
eventlet==0.33.3 # related to dnspython fixes
 | 
			
		||||
eventlet>=0.33.3 # related to dnspython fixes
 | 
			
		||||
feedgen~=0.9
 | 
			
		||||
flask-compress
 | 
			
		||||
# 0.6.3 included compatibility fix for werkzeug 3.x (2.x had deprecation of url handlers)
 | 
			
		||||
@@ -9,7 +6,6 @@ flask-login>=0.6.3
 | 
			
		||||
flask-paginate
 | 
			
		||||
flask_expects_json~=1.7
 | 
			
		||||
flask_restful
 | 
			
		||||
flask_cors # For the Chrome extension to operate
 | 
			
		||||
flask_wtf~=1.2
 | 
			
		||||
flask~=2.3
 | 
			
		||||
inscriptis~=2.2
 | 
			
		||||
@@ -23,25 +19,21 @@ validators~=0.21
 | 
			
		||||
brotli~=1.0
 | 
			
		||||
requests[socks]
 | 
			
		||||
 | 
			
		||||
urllib3==1.26.18
 | 
			
		||||
urllib3>1.26
 | 
			
		||||
chardet>2.3.0
 | 
			
		||||
 | 
			
		||||
wtforms~=3.0
 | 
			
		||||
jsonpath-ng~=1.5.3
 | 
			
		||||
 | 
			
		||||
# Pinned: module 'eventlet.green.select' has no attribute 'epoll'
 | 
			
		||||
# https://github.com/eventlet/eventlet/issues/805#issuecomment-1640463482
 | 
			
		||||
dnspython==2.3.0 # related to eventlet fixes
 | 
			
		||||
dnspython~=2.4 # related to eventlet fixes
 | 
			
		||||
 | 
			
		||||
# jq not available on Windows so must be installed manually
 | 
			
		||||
 | 
			
		||||
# Notification library
 | 
			
		||||
apprise~=1.7.4
 | 
			
		||||
apprise~=1.7.1
 | 
			
		||||
 | 
			
		||||
# apprise mqtt https://github.com/dgtlmoon/changedetection.io/issues/315
 | 
			
		||||
# and 2.0.0 https://github.com/dgtlmoon/changedetection.io/issues/2241 not yet compatible
 | 
			
		||||
# use v1.x due to https://github.com/eclipse/paho.mqtt.python/issues/814
 | 
			
		||||
paho-mqtt < 2.0.0
 | 
			
		||||
paho-mqtt
 | 
			
		||||
 | 
			
		||||
# This mainly affects some ARM builds, which unlike the other builds ignores "ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1"
 | 
			
		||||
# so without this pinning, the newer versions on ARM will forcefully try to build rust, which results in "rust compiler not found"
 | 
			
		||||
@@ -52,7 +44,7 @@ cryptography~=3.4
 | 
			
		||||
beautifulsoup4
 | 
			
		||||
 | 
			
		||||
# XPath filtering, lxml is required by bs4 anyway, but put it here to be safe.
 | 
			
		||||
lxml >=4.8.0,<6
 | 
			
		||||
lxml
 | 
			
		||||
 | 
			
		||||
# XPath 2.0-3.1 support - 4.2.0 broke something?
 | 
			
		||||
elementpath==4.1.5
 | 
			
		||||
@@ -74,9 +66,6 @@ jq~=1.3; python_version >= "3.8" and sys_platform == "linux"
 | 
			
		||||
pillow
 | 
			
		||||
# playwright is installed at Dockerfile build time because it's not available on all platforms
 | 
			
		||||
 | 
			
		||||
# experimental release
 | 
			
		||||
pyppeteer-ng==2.0.0rc5
 | 
			
		||||
 | 
			
		||||
# Include pytest, so if theres a support issue we can ask them to run these tests on their setup
 | 
			
		||||
pytest ~=7.2
 | 
			
		||||
pytest-flask ~=1.2
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user