Code - Split content fetcher code up (playwright, puppeteer and requests), fix puppeteer direct chrome support (#2169)

2025-12-13 19:45:56 +00:00 · 2024-02-11 00:09:12 +01:00
parent 1f57d9d0b6
commit 823a0c99f4
32 changed files with 1171 additions and 832 deletions
--- a/changedetectionio/processors/init.py
+++ b/changedetectionio/processors/init.py
@@ -2,7 +2,6 @@ from abc import abstractmethod
 import os
 import hashlib
 import re
-from changedetectionio import content_fetcher
 from copy import deepcopy
 from distutils.util import strtobool
 from loguru import logger
@@ -50,7 +49,7 @@ class difference_detection_processor():
            connection = list(
                filter(lambda s: (s['browser_name'] == key), self.datastore.data['settings']['requests'].get('extra_browsers', [])))
            if connection:
-                prefer_fetch_backend = 'base_html_playwright'
+                prefer_fetch_backend = 'html_webdriver'
                custom_browser_connection_url = connection[0].get('browser_connection_url')

        # PDF should be html_requests because playwright will serve it up (so far) in a embedded page
@@ -60,12 +59,19 @@ class difference_detection_processor():
           prefer_fetch_backend = "html_requests"

        # Grab the right kind of 'fetcher', (playwright, requests, etc)
-        if hasattr(content_fetcher, prefer_fetch_backend):
-            fetcher_obj = getattr(content_fetcher, prefer_fetch_backend)
+        from changedetectionio import content_fetchers
+        if hasattr(content_fetchers, prefer_fetch_backend):
+            # @todo TEMPORARY HACK - SWITCH BACK TO PLAYWRIGHT FOR BROWSERSTEPS
+            if prefer_fetch_backend == 'html_webdriver' and self.watch.has_browser_steps:
+                # This is never supported in selenium anyway
+                logger.warning("Using playwright fetcher override for possible puppeteer request in browsersteps, because puppetteer:browser steps is incomplete.")
+                from changedetectionio.content_fetchers.playwright import fetcher as playwright_fetcher
+                fetcher_obj = playwright_fetcher
+            else:
+                fetcher_obj = getattr(content_fetchers, prefer_fetch_backend)
        else:
-            # If the klass doesnt exist, just use a default
-            fetcher_obj = getattr(content_fetcher, "html_requests")
-
+            # The preferred fetch backend doesn't exist in content_fetchers, so just fall back to the default
+            fetcher_obj = getattr(content_fetchers, "html_requests")

        proxy_url = None
        if preferred_proxy_id:
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -8,8 +8,9 @@ import urllib3

 from . import difference_detection_processor
 from ..html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text
-from changedetectionio import content_fetcher, html_tools
+from changedetectionio import html_tools, content_fetchers
 from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
+import changedetectionio.content_fetchers
 from copy import deepcopy
 from loguru import logger

@@ -60,7 +61,7 @@ class perform_site_check(difference_detection_processor):
        update_obj['previous_md5_before_filters'] = hashlib.md5(self.fetcher.content.encode('utf-8')).hexdigest()
        if skip_when_checksum_same:
            if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
-                raise content_fetcher.checksumFromPreviousCheckWasTheSame()
+                raise content_fetchers.exceptions.checksumFromPreviousCheckWasTheSame()

        # Fetching complete, now filters

@@ -243,7 +244,7 @@ class perform_site_check(difference_detection_processor):
        # Treat pages with no renderable text content as a change? No by default
        empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
        if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
-            raise content_fetcher.ReplyWithContentButNoText(url=url,
+            raise content_fetchers.exceptions.ReplyWithContentButNoText(url=url,
                                                            status_code=self.fetcher.get_last_status_code(),
                                                            screenshot=screenshot,
                                                            has_filters=has_filter_rule,