best to not let it process this

update test
Updating inscriptis library, removing fixes from 2.2
2025-11-29 21:03:21 +00:00 · 2024-02-02 09:52:44 +01:00 · 2024-02-02 09:45:09 +01:00 · 2024-02-02 09:28:24 +01:00
22 changed files with 231 additions and 173 deletions
--- a/.github/workflows/test-only.yml
+++ b/.github/workflows/test-only.yml
@@ -28,12 +28,12 @@ jobs:
          
          docker network create changedet-network
          
-          # Selenium and sockpuppetbrowser
+          # Selenium+browserless
          docker run --network changedet-network -d --hostname selenium  -p 4444:4444 --rm --shm-size="2g"  selenium/standalone-chrome:4
-          docker run --network changedet-network -d --cap-add=SYS_ADMIN --name sockpuppetbrowser --hostname sockpuppetbrowser --rm -p 3000:3000 dgtlmoon/sockpuppetbrowser:latest
+          docker run --network changedet-network -d --name browserless --hostname browserless -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm  -p 3000:3000  --shm-size="2g"  browserless/chrome:1.60-chrome-stable
          
          # For accessing custom browser tests
-          docker run --network changedet-network -d --cap-add=SYS_ADMIN --name sockpuppetbrowser-custom-url --hostname sockpuppetbrowser-custom-url --rm dgtlmoon/sockpuppetbrowser:latest
+          docker run --network changedet-network -d --name browserless-custom-url --hostname browserless-custom-url -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm --shm-size="2g"  browserless/chrome:1.60-chrome-stable

      - name: Build changedetection.io container for testing
        run: |         
@@ -47,12 +47,6 @@ jobs:
          # Debug SMTP server/echo message back server
          docker run --network changedet-network -d -p 11025:11025 -p 11080:11080  --hostname mailserver test-changedetectionio  bash -c 'python changedetectionio/tests/smtp/smtp-test-server.py' 

-      - name: Show docker container state and other debug info
-        run: |
-          set -x
-          echo "Running processes in docker..."
-          docker ps
-
      - name: Test built container with Pytest (generally as requests/plaintext fetching)
        run: |
          # Unit tests
@@ -69,33 +63,43 @@ jobs:

      - name: Specific tests in built container for Selenium
        run: |
+          
          # Selenium fetch
          docker run --rm -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py'

-      - name: Specific tests in built container for Playwright and SocketPuppetBrowser
-        run: |
-          # Playwright via Sockpuppetbrowser fetch
-          docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
+      - name: Specific tests in built container for Playwright
+        run: |         
+          # Playwright/Browserless fetch
+          docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'

      - name: Specific tests in built container for headers and requests checks with Playwright
-        run: |       
-          # Settings headers playwright tests - Call back in from Sockpuppetbrowser, check headers
-          docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py'
+        run: |                  
+          # Settings headers playwright tests - Call back in from Browserless, check headers
+          docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000?dumpio=true" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py'

      - name: Specific tests in built container for headers and requests checks with Selenium
-        run: |
+        run: |                  
          docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py'

+      - name: Specific tests in built container with Playwright as Puppeteer experimental fetcher
+        run: |                  
+          docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000?dumpio=true" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py'          
+
      - name: Test built container restock detection via Playwright
        run: |                            
-          # restock detection via playwright - added name=changedet here so that playwright and sockpuppetbrowser can connect to it
-          docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py'
+          # restock detection via playwright - added name=changedet here so that playwright/browserless can connect to it
+          docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py'

      - name: Test SMTP notification mime types
        run: |
          # SMTP content types - needs the 'Debug SMTP server/echo message back server' container from above
          docker run --rm  --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/smtp/test_notification_smtp.py'

+      - name: Test with puppeteer fetcher and disk cache
+        run: |
+          docker run --rm -e "PUPPETEER_DISK_CACHE=/tmp/data/" -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
+          # Browserless would have had -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" added above
+
      - name: Test proxy interaction
        run: |
          cd changedetectionio
--- a/changedetectionio/blueprint/browser_steps/__init__.py
+++ b/changedetectionio/blueprint/browser_steps/__init__.py
@@ -4,13 +4,22 @@
 # Why?
 # `browsersteps_playwright_browser_interface.chromium.connect_over_cdp()` will only run once without async()
 # - this flask app is not async()
-# - A single timeout/keepalive which applies to the session made at .connect_over_cdp()
+# - browserless has a single timeout/keepalive which applies to the session made at .connect_over_cdp()
 #
 # So it means that we must unfortunately for now just keep a single timer since .connect_over_cdp() was run
 # and know when that reaches timeout/keepalive :( when that time is up, restart the connection and tell the user
 # that their time is up, insert another coin. (reload)
 #
+# Bigger picture
+# - It's horrible that we have this click+wait deal, some nice socket.io solution using something similar
+# to what the browserless debug UI already gives us would be smarter..
 #
+# OR
+# - Some API call that should be hacked into browserless or playwright that we can "/api/bump-keepalive/{session_id}/60"
+# So we can tell it that we need more time (run this on each action)
+#
+# OR
+# - use multiprocessing to bump this over to its own process and add some transport layer (queue/pipes)

 from distutils.util import strtobool
 from flask import Blueprint, request, make_response
--- a/changedetectionio/blueprint/browser_steps/browser_steps.py
+++ b/changedetectionio/blueprint/browser_steps/browser_steps.py
@@ -169,7 +169,7 @@ class steppable_browser_interface():
        self.page.locator(selector, timeout=1000).uncheck(timeout=1000)


-# Responsible for maintaining a live 'context' with the chrome CDP
+# Responsible for maintaining a live 'context' with browserless
 # @todo - how long do contexts live for anyway?
 class browsersteps_live_ui(steppable_browser_interface):
    context = None
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -311,6 +311,125 @@ class base_html_playwright(Fetcher):
        with open(destination, 'w') as f:
            f.write(content)

+    def run_fetch_browserless_puppeteer(self,
+            url,
+            timeout,
+            request_headers,
+            request_body,
+            request_method,
+            ignore_status_codes=False,
+            current_include_filters=None,
+            is_binary=False):
+
+        from pkg_resources import resource_string
+
+        extra_wait_ms = (int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) * 1000
+
+        self.xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
+        code = resource_string(__name__, "res/puppeteer_fetch.js").decode('utf-8')
+        # In the future inject this is a proper JS package
+        code = code.replace('%xpath_scrape_code%', self.xpath_element_js)
+        code = code.replace('%instock_scrape_code%', self.instock_data_js)
+
+        from requests.exceptions import ConnectTimeout, ReadTimeout
+        wait_browserless_seconds = 240
+
+        browserless_function_url = os.getenv('BROWSERLESS_FUNCTION_URL')
+        from urllib.parse import urlparse
+        if not browserless_function_url:
+            # Convert/try to guess from PLAYWRIGHT_DRIVER_URL
+            o = urlparse(os.getenv('PLAYWRIGHT_DRIVER_URL'))
+            browserless_function_url = o._replace(scheme="http")._replace(path="function").geturl()
+
+
+        # Append proxy connect string
+        if self.proxy:
+            # Remove username/password if it exists in the URL or you will receive "ERR_NO_SUPPORTED_PROXIES" error
+            # Actual authentication handled by Puppeteer/node
+            o = urlparse(self.proxy.get('server'))
+            proxy_url = urllib.parse.quote(o._replace(netloc="{}:{}".format(o.hostname, o.port)).geturl())
+            browserless_function_url = f"{browserless_function_url}&--proxy-server={proxy_url}"
+
+        try:
+            amp = '&' if '?' in browserless_function_url else '?'
+            response = requests.request(
+                method="POST",
+                json={
+                    "code": code,
+                    "context": {
+                        # Very primitive disk cache - USE WITH EXTREME CAUTION
+                        # Run browserless container  with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
+                        'disk_cache_dir': os.getenv("PUPPETEER_DISK_CACHE", False), # or path to disk cache ending in /, ie /tmp/cache/
+                        'execute_js': self.webdriver_js_execute_code,
+                        'extra_wait_ms': extra_wait_ms,
+                        'include_filters': current_include_filters,
+                        'req_headers': request_headers,
+                        'screenshot_quality': int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)),
+                        'url': url,
+                        'user_agent': {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None),
+                        'proxy_username': self.proxy.get('username', '') if self.proxy else False,
+                        'proxy_password': self.proxy.get('password', '') if self.proxy and self.proxy.get('username') else False,
+                        'no_cache_list': [
+                            'twitter',
+                            '.pdf'
+                        ],
+                        # Could use https://github.com/easylist/easylist here, or install a plugin
+                        'block_url_list': [
+                            'adnxs.com',
+                            'analytics.twitter.com',
+                            'doubleclick.net',
+                            'google-analytics.com',
+                            'googletagmanager',
+                            'trustpilot.com'
+                        ]
+                    }
+                },
+                # @todo /function needs adding ws:// to http:// rebuild this
+                url=browserless_function_url+f"{amp}--disable-features=AudioServiceOutOfProcess&dumpio=true&--disable-remote-fonts",
+                timeout=wait_browserless_seconds)
+
+        except ReadTimeout:
+            raise PageUnloadable(url=url, status_code=None, message=f"No response from browserless in {wait_browserless_seconds}s")
+        except ConnectTimeout:
+            raise PageUnloadable(url=url, status_code=None, message=f"Timed out connecting to browserless, retrying..")
+        else:
+            # 200 Here means that the communication to browserless worked only, not the page state
+            try:
+                x = response.json()
+            except Exception as e:
+                raise PageUnloadable(url=url, message="Error reading JSON response from browserless")
+
+            try:
+                self.status_code = response.status_code
+            except Exception as e:
+                raise PageUnloadable(url=url, message="Error reading status_code code response from browserless")
+
+            self.headers = x.get('headers')
+
+            if self.status_code != 200 and not ignore_status_codes:
+                raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, page_html=x.get('content',''))
+
+            if self.status_code == 200:
+                import base64
+
+                if not x.get('screenshot'):
+                    # https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips
+                    # https://github.com/puppeteer/puppeteer/issues/1834
+                    # https://github.com/puppeteer/puppeteer/issues/1834#issuecomment-381047051
+                    # Check your memory is shared and big enough
+                    raise ScreenshotUnavailable(url=url, status_code=None)
+
+                if not x.get('content', '').strip():
+                    raise EmptyReply(url=url, status_code=None)
+
+                self.content = x.get('content')
+                self.instock_data = x.get('instock_data')
+                self.screenshot = base64.b64decode(x.get('screenshot'))
+                self.xpath_data = x.get('xpath_data')
+            else:
+                # Some other error from browserless
+                raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8'))
+
    def run(self,
            url,
            timeout,
@@ -322,6 +441,21 @@ class base_html_playwright(Fetcher):
            is_binary=False):


+        # USE_EXPERIMENTAL_PUPPETEER_FETCH is not yet supported by watches with BrowserSteps
+        # browser_connection_is_custom doesn't work with the puppeteer-style fetch either (use native playwright in that case)
+        if not self.browser_connection_is_custom and not self.browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
+            if strtobool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH')):
+                # Temporary backup solution until we rewrite the playwright code
+                return self.run_fetch_browserless_puppeteer(
+                    url,
+                    timeout,
+                    request_headers,
+                    request_body,
+                    request_method,
+                    ignore_status_codes,
+                    current_include_filters,
+                    is_binary)
+
        from playwright.sync_api import sync_playwright
        import playwright._impl._errors

@@ -394,7 +528,7 @@ class base_html_playwright(Fetcher):
                self.status_code = response.status
            except Exception as e:
                # https://github.com/dgtlmoon/changedetection.io/discussions/2122#discussioncomment-8241962
-                logger.critical(f"Response from the browser/Playwright did not have a status_code! Response follows.")
+                logger.critical(f"Response from browserless/playwright did not have a status_code! Response follows.")
                logger.critical(response)
                raise PageUnloadable(url=url, status_code=None, message=str(e))

--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -465,7 +465,6 @@ class watchForm(commonSettingsForm):
    method = SelectField('Request method', choices=valid_method, default=default_method)
    ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False)
    check_unique_lines = BooleanField('Only trigger when unique lines appear', default=False)
-    sort_text_alphabetically =  BooleanField('Sort text alphabetically', default=False)

    filter_text_added = BooleanField('Added lines', default=True)
    filter_text_replaced = BooleanField('Replaced/changed lines', default=True)
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -409,23 +409,6 @@ def has_ldjson_product_info(content):
    x=bool(pricing_data)
    return x

-
-def workarounds_for_obfuscations(content):
-    """
-    Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis
-    This could go into its own Pip package in the future, for faster updates
-    """
-
-    # HomeDepot.com style <span>$<!-- -->90<!-- -->.<!-- -->74</span>
-    # https://github.com/weblyzard/inscriptis/issues/45
-    if not content:
-        return content
-
-    content = re.sub('<!--\s+-->', '', content)
-
-    return content
-
-
 def get_triggered_text(content, trigger_text):
    triggered_text = []
    result = strip_ignore_text(content=content,
--- a/changedetectionio/importer.py
+++ b/changedetectionio/importer.py
@@ -57,7 +57,7 @@ class import_url_list(Importer):

            # Flask wtform validators wont work with basic auth, use validators package
            # Up to 5000 per batch so we dont flood the server
-            # @todo validators.url will fail when you add your own IP etc
+            # @todo validators.url failed on local hostnames (such as referring to ourself when using browserless)
            if len(url) and 'http' in url.lower() and good < 5000:
                extras = None
                if processor:
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -58,7 +58,6 @@ base_config = {
    'previous_md5_before_filters': False,  # Used for skipping changedetection entirely
    'proxy': None,  # Preferred proxy connection
    'remote_server_reply': None, # From 'server' reply header
-    'sort_text_alphabetically': False,
    'subtractive_selectors': [],
    'tag': '', # Old system of text name for a tag, to be removed
    'tags': [], # list of UUIDs to App.Tags
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -116,9 +116,7 @@ class perform_site_check(difference_detection_processor):
        # and then use getattr https://docs.python.org/3/reference/datamodel.html#object.__getitem__
        # https://realpython.com/inherit-python-dict/ instead of doing it procedurely
        include_filters_from_tags = self.datastore.get_tag_overrides_for_watch(uuid=uuid, attr='include_filters')
-
-        # 1845 - remove duplicated filters in both group and watch include filter
-        include_filters_rule = list({*watch.get('include_filters', []), *include_filters_from_tags})
+        include_filters_rule = [*watch.get('include_filters', []), *include_filters_from_tags]

        subtractive_selectors = [*self.datastore.get_tag_overrides_for_watch(uuid=uuid, attr='subtractive_selectors'),
                                 *watch.get("subtractive_selectors", []),
@@ -153,7 +151,6 @@ class perform_site_check(difference_detection_processor):
        if is_html or watch.is_source_type_url:

            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
-            self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
            html_content = self.fetcher.content

            # If not JSON,  and if it's not text/plain..
@@ -204,12 +201,6 @@ class perform_site_check(difference_detection_processor):
                            is_rss=is_rss # #1874 activate the <title workaround hack
                        )

-        if watch.get('sort_text_alphabetically') and stripped_text_from_html:
-            # Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
-            # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here.
-            stripped_text_from_html = stripped_text_from_html.replace('\n\n', '\n')
-            stripped_text_from_html = '\n'.join( sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower() ))
-
        # Re #340 - return the content before the 'ignore text' was applied
        text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')

--- a/changedetectionio/res/puppeteer_fetch.js
+++ b/changedetectionio/res/puppeteer_fetch.js
@@ -146,7 +146,7 @@ module.exports = async ({page, context}) => {
    var xpath_data;
    var instock_data;
    try {
-        // Not sure the best way here, in the future this should be a new package added to npm then run in evaluatedCode
+        // Not sure the best way here, in the future this should be a new package added to npm then run in browserless
        // (Once the old playwright is removed)
        xpath_data = await page.evaluate((include_filters) => {%xpath_scrape_code%}, include_filters);
        instock_data = await page.evaluate(() => {%instock_scrape_code%});
--- a/changedetectionio/res/stock-not-in-stock.js
+++ b/changedetectionio/res/stock-not-in-stock.js
@@ -36,7 +36,6 @@ function isItemInStock() {
        'nicht zur verfügung',
        'niet beschikbaar',
        'niet leverbaar',
-        'niet op voorraad',
        'no disponible temporalmente',
        'no longer in stock',
        'no tickets available',
--- a/changedetectionio/run_custom_browser_url_tests.sh
+++ b/changedetectionio/run_custom_browser_url_tests.sh
@@ -6,16 +6,16 @@
 set -x

 # A extra browser is configured, but we never chose to use it, so it should NOT show in the logs
-docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/custom_browser_url/test_custom_browser_url.py::test_request_not_via_custom_browser_url'
-docker logs sockpuppetbrowser-custom-url &>log.txt
+docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/custom_browser_url/test_custom_browser_url.py::test_request_not_via_custom_browser_url'
+docker logs browserless-custom-url &>log.txt
 grep 'custom-browser-search-string=1' log.txt
 if [ $? -ne 1 ]
 then
-  echo "Saw a request in 'sockpuppetbrowser-custom-url' container with 'custom-browser-search-string=1' when I should not"
+  echo "Saw a request in 'browserless-custom-url' container with 'custom-browser-search-string=1' when I should not"
  exit 1
 fi

-docker logs sockpuppetbrowser &>log.txt
+docker logs browserless &>log.txt
 grep 'custom-browser-search-string=1' log.txt
 if [ $? -ne 1 ]
 then
@@ -24,16 +24,16 @@ then
 fi

 # Special connect string should appear in the custom-url container, but not in the 'default' one
-docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/custom_browser_url/test_custom_browser_url.py::test_request_via_custom_browser_url'
-docker logs sockpuppetbrowser-custom-url &>log.txt
+docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio;pytest tests/custom_browser_url/test_custom_browser_url.py::test_request_via_custom_browser_url'
+docker logs browserless-custom-url &>log.txt
 grep 'custom-browser-search-string=1' log.txt
 if [ $? -ne 0 ]
 then
-  echo "Did not see request in 'sockpuppetbrowser-custom-url' container with 'custom-browser-search-string=1' when I should"
+  echo "Did not see request in 'browserless-custom-url' container with 'custom-browser-search-string=1' when I should"
  exit 1
 fi

-docker logs sockpuppetbrowser &>log.txt
+docker logs browserless &>log.txt
 grep 'custom-browser-search-string=1' log.txt
 if [ $? -ne 1 ]
 then
--- a/changedetectionio/run_proxy_tests.sh
+++ b/changedetectionio/run_proxy_tests.sh
@@ -35,7 +35,7 @@ docker run --network changedet-network \
 docker run --network changedet-network \
  -e "SOCKSTEST=manual-playwright" \
  -v `pwd`/tests/proxy_socks5/proxies.json-example-noauth:/app/changedetectionio/test-datastore/proxies.json \
-  -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" \
+  -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" \
  --rm \
  test-changedetectionio \
  bash -c 'cd changedetectionio && pytest tests/proxy_socks5/test_socks5_proxy_sources.py'
--- a/changedetectionio/static/js/browser-steps.js
+++ b/changedetectionio/static/js/browser-steps.js
@@ -10,7 +10,7 @@ $(document).ready(function () {
        }
    })
    var browsersteps_session_id;
-    var browser_interface_seconds_remaining = 0;
+    var browserless_seconds_remaining = 0;
    var apply_buttons_disabled = false;
    var include_text_elements = $("#include_text_elements");
    var xpath_data = false;
@@ -49,7 +49,7 @@ $(document).ready(function () {
        $('#browsersteps-img').removeAttr('src');
        $("#browsersteps-click-start").show();
        $("#browsersteps-selector-wrapper .spinner").hide();
-        browser_interface_seconds_remaining = 0;
+        browserless_seconds_remaining = 0;
        browsersteps_session_id = false;
        apply_buttons_disabled = false;
        ctx.clearRect(0, 0, c.width, c.height);
@@ -61,12 +61,12 @@ $(document).ready(function () {
        $('#browser_steps >li:first-child').css('opacity', '0.5');
    }

-    // Show seconds remaining until the browser interface needs to restart the session
+    // Show seconds remaining until playwright/browserless needs to restart the session
    // (See comment at the top of changedetectionio/blueprint/browser_steps/__init__.py )
    setInterval(() => {
-        if (browser_interface_seconds_remaining >= 1) {
-            document.getElementById('browser-seconds-remaining').innerText = browser_interface_seconds_remaining + " seconds remaining in session";
-            browser_interface_seconds_remaining -= 1;
+        if (browserless_seconds_remaining >= 1) {
+            document.getElementById('browserless-seconds-remaining').innerText = browserless_seconds_remaining + " seconds remaining in session";
+            browserless_seconds_remaining -= 1;
        }
    }, "1000")

@@ -261,7 +261,7 @@ $(document).ready(function () {
            // This should trigger 'Goto site'
            console.log("Got startup response, requesting Goto-Site (first) step fake click");
            $('#browser_steps >li:first-child .apply').click();
-            browser_interface_seconds_remaining = 500;
+            browserless_seconds_remaining = 500;
            set_first_gotosite_disabled();
        }).fail(function (data) {
            console.log(data);
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -228,7 +228,7 @@ User-Agent: wonderbra 1.0") }}
                                </div>
                            </div>
                            <div id="browser-steps-fieldlist" style="padding-left: 1em;  width: 350px; font-size: 80%;" >
-                                <span id="browser-seconds-remaining">Loading</span> <span style="font-size: 80%;"> (<a target=_new href="https://github.com/dgtlmoon/changedetection.io/pull/478/files#diff-1a79d924d1840c485238e66772391268a89c95b781d69091384cf1ea1ac146c9R4">?</a>) </span>
+                                <span id="browserless-seconds-remaining">Loading</span> <span style="font-size: 80%;"> (<a target=_new href="https://github.com/dgtlmoon/changedetection.io/pull/478/files#diff-1a79d924d1840c485238e66772391268a89c95b781d69091384cf1ea1ac146c9R4">?</a>) </span>
                                {{ render_field(form.browser_steps) }}
                            </div>
                        </div>
@@ -339,10 +339,6 @@ nav
                    <span class="pure-form-message-inline">When content is merely moved in a list, it will also trigger an <strong>addition</strong>, consider enabling <code><strong>Only trigger when unique lines appear</strong></code></span>
                </fieldset>

-                <fieldset class="pure-control-group">
-                    {{ render_checkbox_field(form.sort_text_alphabetically) }}
-                    <span class="pure-form-message-inline">Helps reduce changes detected caused by sites shuffling lines around, combine with <i>check unique lines</i> below.</span>
-                </fieldset>
                <fieldset class="pure-control-group">
                    {{ render_checkbox_field(form.check_unique_lines) }}
                    <span class="pure-form-message-inline">Good for websites that just move the content around, and you want to know when NEW content is added, compares new lines against all history for this watch.</span>
--- a/changedetectionio/tests/custom_browser_url/test_custom_browser_url.py
+++ b/changedetectionio/tests/custom_browser_url/test_custom_browser_url.py
@@ -7,11 +7,10 @@ from ..util import live_server_setup, wait_for_all_checks
 def do_test(client, live_server, make_test_use_extra_browser=False):

    # Grep for this string in the logs?
-    test_url = f"https://changedetection.io/ci-test.html?non-custom-default=true"
-    # "non-custom-default" should not appear in the custom browser connection
+    test_url = f"https://changedetection.io/ci-test.html"
    custom_browser_name = 'custom browser URL'

-    # needs to be set and something like 'ws://127.0.0.1:3000'
+    # needs to be set and something like 'ws://127.0.0.1:3000?stealth=1&--disable-web-security=true'
    assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"

    #####################
@@ -20,7 +19,9 @@ def do_test(client, live_server, make_test_use_extra_browser=False):
        data={"application-empty_pages_are_a_change": "",
              "requests-time_between_check-minutes": 180,
              'application-fetch_backend': "html_webdriver",
-              'requests-extra_browsers-0-browser_connection_url': 'ws://sockpuppetbrowser-custom-url:3000',
+              # browserless-custom-url is set up in .github/workflows/test-only.yml
+              # the test script run_custom_browser_url_test.sh will look for 'custom-browser-search-string' in the container logs
+              'requests-extra_browsers-0-browser_connection_url': 'ws://browserless-custom-url:3000?stealth=1&--disable-web-security=true&custom-browser-search-string=1',
              'requests-extra_browsers-0-browser_name': custom_browser_name
              },
        follow_redirects=True
@@ -50,8 +51,7 @@ def do_test(client, live_server, make_test_use_extra_browser=False):
        res = client.post(
            url_for("edit_page", uuid="first"),
            data={
-                # 'run_customer_browser_url_tests.sh' will search for this string to know if we hit the right browser container or not
-                  "url": f"https://changedetection.io/ci-test.html?custom-browser-search-string=1",
+                  "url": test_url,
                  "tags": "",
                  "headers": "",
                  'fetch_backend': f"extra_browser_{custom_browser_name}",
--- a/changedetectionio/tests/test_jsonpath_jq_selector.py
+++ b/changedetectionio/tests/test_jsonpath_jq_selector.py
@@ -456,7 +456,7 @@ def test_ignore_json_order(client, live_server):

 def test_correct_header_detect(client, live_server):
    # Like in https://github.com/dgtlmoon/changedetection.io/pull/1593
-    # Specify extra html that JSON is sometimes wrapped in - when using SockpuppetBrowser / Puppeteer / Playwrightetc
+    # Specify extra html that JSON is sometimes wrapped in - when using Browserless/Puppeteer etc
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write('<html><body>{"hello" : 123, "world": 123}')

--- a/changedetectionio/tests/test_obfuscations.py
+++ b/changedetectionio/tests/test_obfuscations.py
@@ -2,7 +2,7 @@

 import time
 from flask import url_for
-from .util import live_server_setup
+from .util import live_server_setup, wait_for_all_checks


 def set_original_ignore_response():
@@ -21,7 +21,7 @@ def set_original_ignore_response():
 def test_obfuscations(client, live_server):
    set_original_ignore_response()
    live_server_setup(live_server)
-    time.sleep(1)
+
    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
@@ -32,12 +32,12 @@ def test_obfuscations(client, live_server):
    assert b"1 Imported" in res.data

    # Give the thread time to pick it up
-    time.sleep(3)
+    wait_for_all_checks(client)

    # Check HTML conversion detected and workd
    res = client.get(
        url_for("preview_page", uuid="first"),
        follow_redirects=True
    )
-
+    # whitespace appears but it renders https://github.com/weblyzard/inscriptis/issues/45#issuecomment-1923339265
    assert b'$90.74' in res.data
--- a/changedetectionio/tests/test_request.py
+++ b/changedetectionio/tests/test_request.py
@@ -14,7 +14,7 @@ def test_headers_in_request(client, live_server):
    # Add our URL to the import page
    test_url = url_for('test_headers', _external=True)
    if os.getenv('PLAYWRIGHT_DRIVER_URL'):
-        # Because its no longer calling back to localhost but from the browser container, set in test-only.yml
+        # Because it's no longer calling back to localhost but from browserless, set in test-only.yml
        test_url = test_url.replace('localhost', 'changedet')

    # Add the test URL twice, we will check
@@ -89,7 +89,7 @@ def test_body_in_request(client, live_server):
    # Add our URL to the import page
    test_url = url_for('test_body', _external=True)
    if os.getenv('PLAYWRIGHT_DRIVER_URL'):
-        # Because its no longer calling back to localhost but from the browser container, set in test-only.yml
+        # Because it's no longer calling back to localhost but from browserless, set in test-only.yml
        test_url = test_url.replace('localhost', 'cdio')

    res = client.post(
@@ -181,7 +181,7 @@ def test_method_in_request(client, live_server):
    # Add our URL to the import page
    test_url = url_for('test_method', _external=True)
    if os.getenv('PLAYWRIGHT_DRIVER_URL'):
-        # Because its no longer calling back to localhost but from the browser container, set in test-only.yml
+        # Because it's no longer calling back to localhost but from browserless, set in test-only.yml
        test_url = test_url.replace('localhost', 'cdio')

    # Add the test URL twice, we will check
@@ -258,7 +258,7 @@ def test_headers_textfile_in_request(client, live_server):
    # Add our URL to the import page
    test_url = url_for('test_headers', _external=True)
    if os.getenv('PLAYWRIGHT_DRIVER_URL'):
-        # Because its no longer calling back to localhost but from the browser container, set in test-only.yml
+        # Because it's no longer calling back to localhost but from browserless, set in test-only.yml
        test_url = test_url.replace('localhost', 'cdio')

    print ("TEST URL IS ",test_url)
--- a/changedetectionio/tests/test_unique_lines.py
+++ b/changedetectionio/tests/test_unique_lines.py
@@ -2,7 +2,7 @@

 import time
 from flask import url_for
-from .util import live_server_setup, wait_for_all_checks
+from .util import live_server_setup


 def set_original_ignore_response():
@@ -34,23 +34,6 @@ def set_modified_swapped_lines():
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(test_return_data)

-def set_modified_swapped_lines_with_extra_text_for_sorting():
-    test_return_data = """<html>
-     <body>
-     <p>&nbsp;Which is across multiple lines</p>     
-     <p>Some initial text</p>
-     <p>   So let's see what happens.</p>
-     <p>Z last</p>
-     <p>0 numerical</p>
-     <p>A uppercase</p>
-     <p>a lowercase</p>     
-     </body>
-     </html>
-    """
-
-    with open("test-datastore/endpoint-content.txt", "w") as f:
-        f.write(test_return_data)
-

 def set_modified_with_trigger_text_response():
    test_return_data = """<html>
@@ -66,14 +49,15 @@ def set_modified_with_trigger_text_response():
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(test_return_data)

-def test_setup(client, live_server):
-    live_server_setup(live_server)

 def test_unique_lines_functionality(client, live_server):
-    #live_server_setup(live_server)
+    live_server_setup(live_server)

+    sleep_time_for_fetch_thread = 3

    set_original_ignore_response()
+    # Give the endpoint time to spin up
+    time.sleep(1)

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
@@ -83,7 +67,7 @@ def test_unique_lines_functionality(client, live_server):
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
-    wait_for_all_checks(client)
+    time.sleep(sleep_time_for_fetch_thread)

    # Add our URL to the import page
    res = client.post(
@@ -99,11 +83,12 @@ def test_unique_lines_functionality(client, live_server):
    #  Make a change
    set_modified_swapped_lines()

+    time.sleep(sleep_time_for_fetch_thread)
    # Trigger a check
    client.get(url_for("form_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
-    wait_for_all_checks(client)
+    time.sleep(sleep_time_for_fetch_thread)

    # It should report nothing found (no new 'unviewed' class)
    res = client.get(url_for("index"))
@@ -112,57 +97,7 @@ def test_unique_lines_functionality(client, live_server):
    # Now set the content which contains the new text and re-ordered existing text
    set_modified_with_trigger_text_response()
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
-    wait_for_all_checks(client)
+    time.sleep(sleep_time_for_fetch_thread)
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data
-    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
-    assert b'Deleted' in res.data

-def test_sort_lines_functionality(client, live_server):
-    #live_server_setup(live_server)
-
-    set_modified_swapped_lines_with_extra_text_for_sorting()
-
-    # Add our URL to the import page
-    test_url = url_for('test_endpoint', _external=True)
-    res = client.post(
-        url_for("import_page"),
-        data={"urls": test_url},
-        follow_redirects=True
-    )
-    assert b"1 Imported" in res.data
-    wait_for_all_checks(client)
-
-    # Add our URL to the import page
-    res = client.post(
-        url_for("edit_page", uuid="first"),
-        data={"sort_text_alphabetically": "n",
-              "url": test_url,
-              "fetch_backend": "html_requests"},
-        follow_redirects=True
-    )
-    assert b"Updated watch." in res.data
-
-
-    # Trigger a check
-    client.get(url_for("form_watch_checknow"), follow_redirects=True)
-
-    # Give the thread time to pick it up
-    wait_for_all_checks(client)
-
-
-    res = client.get(url_for("index"))
-    # Should be a change registered
-    assert b'unviewed' in res.data
-
-    res = client.get(
-        url_for("preview_page", uuid="first"),
-        follow_redirects=True
-    )
-
-    assert res.data.find(b'0 numerical') < res.data.find(b'Z last')
-    assert res.data.find(b'A uppercase') < res.data.find(b'Z last')
-    assert res.data.find(b'Some initial text') < res.data.find(b'Which is across multiple lines')
-    
-    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
-    assert b'Deleted' in res.data
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -30,7 +30,7 @@ services:
  #             https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
  #
  #       Alternative Playwright URL, do not use "'s or 's!
-  #      - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000
+  #      - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000/?stealth=1&--disable-web-security=true
  #
  #       Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
  #
@@ -71,23 +71,32 @@ services:
 #            condition: service_started

     # Used for fetching pages via Playwright+Chrome where you need Javascript support.
+     # Note: Playwright/browserless is not supported on ARM-type devices (rPi etc.)
     # RECOMMENDED FOR FETCHING PAGES WITH CHROME
 #    playwright-chrome:
 #        hostname: playwright-chrome
-#        image: dgtlmoon/sockpuppetbrowser:latest
-#        cap_add:
-#            - SYS_ADMIN
-## SYS_ADMIN might be too much, but it can be needed on your platform https://github.com/puppeteer/puppeteer/blob/main/docs/troubleshooting.md#running-puppeteer-on-gitlabci
+#        image: browserless/chrome:1.60-chrome-stable
 #        restart: unless-stopped
 #        environment:
 #            - SCREEN_WIDTH=1920
 #            - SCREEN_HEIGHT=1024
 #            - SCREEN_DEPTH=16
-#            - MAX_CONCURRENT_CHROME_PROCESSES=10
+#            - ENABLE_DEBUGGER=false
+#            - PREBOOT_CHROME=true
+#            - CONNECTION_TIMEOUT=300000
+#            - MAX_CONCURRENT_SESSIONS=10
+#            - CHROME_REFRESH_TIME=600000
+#            - DEFAULT_BLOCK_ADS=true
+#            - DEFAULT_STEALTH=true
+#
+#             Ignore HTTPS errors, like for self-signed certs
+#            - DEFAULT_IGNORE_HTTPS_ERRORS=true
+#

     # Used for fetching pages via Playwright+Chrome where you need Javascript support.
     # Note: Works well but is deprecated, does not fetch full page screenshots (doesnt work with Visual Selector)
     #       Does not report status codes (200, 404, 403) and other issues
+     # More information about the advantages of playwright/browserless https://www.browserless.io/blog/2023/12/13/migrating-selenium-to-playwright/
 #    browser-chrome:
 #        hostname: browser-chrome
 #        image: selenium/standalone-chrome:4
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,7 @@ flask_expects_json~=1.7
 flask_restful
 flask_wtf~=1.2
 flask~=2.3
-inscriptis~=2.2
+inscriptis~=2.4
 pytz
 timeago~=1.0
 validators~=0.21
@@ -46,8 +46,8 @@ beautifulsoup4
 # XPath filtering, lxml is required by bs4 anyway, but put it here to be safe.
 lxml

-# XPath 2.0-3.1 support - 4.2.0 broke something?
-elementpath==4.1.5
+# XPath 2.0-3.1 support
+elementpath

 selenium~=4.14.0
Author	SHA1	Message	Date
dgtlmoon	dc33d49840	best to not let it process this	2024-02-02 09:52:44 +01:00
dgtlmoon	c30f96c4cd	update test	2024-02-02 09:45:09 +01:00
dgtlmoon	c8310b7e93	Updating inscriptis library, removing fixes from 2.2	2024-02-02 09:28:24 +01:00