From 652f939d6d35c4146868e74f9ecf0b01cc67c3c6 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 24 Nov 2025 12:18:48 +0100 Subject: [PATCH] Pluggable content fetchers --- .../watchlist/templates/watch-overview.html | 8 +- .../content_fetchers/__init__.py | 49 ++++++++++- changedetectionio/content_fetchers/base.py | 18 ++++ .../content_fetchers/playwright.py | 22 +++++ .../content_fetchers/puppeteer.py | 22 +++++ .../content_fetchers/requests.py | 12 +++ .../content_fetchers/webdriver_selenium.py | 22 +++++ changedetectionio/flask_app.py | 74 ++++++++++++++++ changedetectionio/pluggy_interface.py | 88 +++++++++++++++++-- 9 files changed, 302 insertions(+), 13 deletions(-) diff --git a/changedetectionio/blueprint/watchlist/templates/watch-overview.html b/changedetectionio/blueprint/watchlist/templates/watch-overview.html index 1cb22c67..87e54fb6 100644 --- a/changedetectionio/blueprint/watchlist/templates/watch-overview.html +++ b/changedetectionio/blueprint/watchlist/templates/watch-overview.html @@ -182,11 +182,9 @@ document.addEventListener('DOMContentLoaded', function() {
- {%- if watch.get_fetch_backend == "html_webdriver" - or ( watch.get_fetch_backend == "system" and system_default_fetcher == 'html_webdriver' ) - or "extra_browser_" in watch.get_fetch_backend - -%} - Using a Chrome browser + {%- set effective_fetcher = watch.get_fetch_backend if watch.get_fetch_backend != "system" else system_default_fetcher -%} + {%- if effective_fetcher and ("html_webdriver" in effective_fetcher or "html_" in effective_fetcher or "extra_browser_" in effective_fetcher) -%} + {{ effective_fetcher|fetcher_status_icons }} {%- endif -%} {%- if watch.is_pdf -%}Converting PDF to text{%- endif -%} {%- if watch.has_browser_steps -%}Browser Steps is enabled{%- endif -%} diff --git a/changedetectionio/content_fetchers/__init__.py b/changedetectionio/content_fetchers/__init__.py index 46a891cd..4dfdf0cf 100644 --- a/changedetectionio/content_fetchers/__init__.py +++ b/changedetectionio/content_fetchers/__init__.py @@ -7,6 +7,9 @@ import os # Visual Selector scraper - 'Button' is there because some sites have . visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,button' +# Import hookimpl from centralized pluggy interface +from changedetectionio.pluggy_interface import hookimpl + SCREENSHOT_MAX_HEIGHT_DEFAULT = 20000 SCREENSHOT_DEFAULT_QUALITY = 40 @@ -35,17 +38,54 @@ def available_fetchers(): # See the if statement at the bottom of this file for how we switch between playwright and webdriver import inspect p = [] + + # Get built-in fetchers (but skip plugin fetchers that were added via setattr) for name, obj in inspect.getmembers(sys.modules[__name__], inspect.isclass): if inspect.isclass(obj): # @todo html_ is maybe better as fetcher_ or something # In this case, make sure to edit the default one in store.py and fetch_site_status.py if name.startswith('html_'): - t = tuple([name, obj.fetcher_description]) - p.append(t) + # Skip plugin fetchers that were already registered + if name not in _plugin_fetchers: + t = tuple([name, obj.fetcher_description]) + p.append(t) + + # Get plugin fetchers from cache (already loaded at module init) + for name, fetcher_class in _plugin_fetchers.items(): + if hasattr(fetcher_class, 'fetcher_description'): + t = tuple([name, fetcher_class.fetcher_description]) + p.append(t) + else: + logger.warning(f"Plugin fetcher '{name}' does not have fetcher_description attribute") return p +def get_plugin_fetchers(): + """Load and return all plugin fetchers from the centralized plugin manager.""" + from changedetectionio.pluggy_interface import plugin_manager + + fetchers = {} + try: + # Call the register_content_fetcher hook from all registered plugins + results = plugin_manager.hook.register_content_fetcher() + for result in results: + if result: + name, fetcher_class = result + fetchers[name] = fetcher_class + # Register in current module so hasattr() checks work + setattr(sys.modules[__name__], name, fetcher_class) + logger.info(f"Registered plugin fetcher: {name} - {getattr(fetcher_class, 'fetcher_description', 'No description')}") + except Exception as e: + logger.error(f"Error loading plugin fetchers: {e}") + + return fetchers + + +# Initialize plugins at module load time +_plugin_fetchers = get_plugin_fetchers() + + # Decide which is the 'real' HTML webdriver, this is more a system wide config # rather than site-specific. use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False) @@ -62,3 +102,8 @@ else: logger.debug("Falling back to selenium as fetcher") from .webdriver_selenium import fetcher as html_webdriver + +# Register built-in fetchers as plugins after all imports are complete +from changedetectionio.pluggy_interface import register_builtin_fetchers +register_builtin_fetchers() + diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py index 6d7d3d5d..abe1ce2a 100644 --- a/changedetectionio/content_fetchers/base.py +++ b/changedetectionio/content_fetchers/base.py @@ -64,6 +64,24 @@ class Fetcher(): # Time ONTOP of the system defined env minimum time render_extract_delay = 0 + @classmethod + def get_status_icon_data(cls): + """Return data for status icon to display in the watch overview. + + This method can be overridden by subclasses to provide custom status icons. + + Returns: + dict or None: Dictionary with icon data: + { + 'filename': 'icon-name.svg', # Icon filename + 'alt': 'Alt text', # Alt attribute + 'title': 'Tooltip text', # Title attribute + 'style': 'height: 1em;' # Optional inline CSS + } + Or None if no icon + """ + return None + def clear_content(self): """ Explicitly clear all content from memory to free up heap space. diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py index 01f4c8a7..ed82139c 100644 --- a/changedetectionio/content_fetchers/playwright.py +++ b/changedetectionio/content_fetchers/playwright.py @@ -89,6 +89,15 @@ class fetcher(Fetcher): proxy = None + @classmethod + def get_status_icon_data(cls): + """Return Chrome browser icon data for Playwright fetcher.""" + return { + 'filename': 'google-chrome-icon.png', + 'alt': 'Using a Chrome browser', + 'title': 'Using a Chrome browser' + } + def __init__(self, proxy_override=None, custom_browser_connection_url=None): super().__init__() @@ -330,4 +339,17 @@ class fetcher(Fetcher): browser = None +# Plugin registration for built-in fetcher +class PlaywrightFetcherPlugin: + """Plugin class that registers the Playwright fetcher as a built-in plugin.""" + + def register_content_fetcher(self): + """Register the Playwright fetcher""" + return ('html_webdriver', fetcher) + + +# Create module-level instance for plugin registration +playwright_plugin = PlaywrightFetcherPlugin() + + diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py index 58511d54..04eec02b 100644 --- a/changedetectionio/content_fetchers/puppeteer.py +++ b/changedetectionio/content_fetchers/puppeteer.py @@ -98,6 +98,15 @@ class fetcher(Fetcher): proxy = None + @classmethod + def get_status_icon_data(cls): + """Return Chrome browser icon data for Puppeteer fetcher.""" + return { + 'filename': 'google-chrome-icon.png', + 'alt': 'Using a Chrome browser', + 'title': 'Using a Chrome browser' + } + def __init__(self, proxy_override=None, custom_browser_connection_url=None): super().__init__() @@ -384,3 +393,16 @@ class fetcher(Fetcher): ) except asyncio.TimeoutError: raise (BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds.")) + + +# Plugin registration for built-in fetcher +class PuppeteerFetcherPlugin: + """Plugin class that registers the Puppeteer fetcher as a built-in plugin.""" + + def register_content_fetcher(self): + """Register the Puppeteer fetcher""" + return ('html_webdriver', fetcher) + + +# Create module-level instance for plugin registration +puppeteer_plugin = PuppeteerFetcherPlugin() diff --git a/changedetectionio/content_fetchers/requests.py b/changedetectionio/content_fetchers/requests.py index f5b9d51e..c53b1427 100644 --- a/changedetectionio/content_fetchers/requests.py +++ b/changedetectionio/content_fetchers/requests.py @@ -163,3 +163,15 @@ class fetcher(Fetcher): except Exception as e: logger.warning(f"Failed to unlink screenshot: {screenshot} - {e}") + +# Plugin registration for built-in fetcher +class RequestsFetcherPlugin: + """Plugin class that registers the requests fetcher as a built-in plugin.""" + + def register_content_fetcher(self): + """Register the requests fetcher""" + return ('html_requests', fetcher) + + +# Create module-level instance for plugin registration +requests_plugin = RequestsFetcherPlugin() diff --git a/changedetectionio/content_fetchers/webdriver_selenium.py b/changedetectionio/content_fetchers/webdriver_selenium.py index 41cbf5d5..50bee6a6 100644 --- a/changedetectionio/content_fetchers/webdriver_selenium.py +++ b/changedetectionio/content_fetchers/webdriver_selenium.py @@ -14,6 +14,15 @@ class fetcher(Fetcher): proxy = None proxy_url = None + @classmethod + def get_status_icon_data(cls): + """Return Chrome browser icon data for WebDriver fetcher.""" + return { + 'filename': 'google-chrome-icon.png', + 'alt': 'Using a Chrome browser', + 'title': 'Using a Chrome browser' + } + def __init__(self, proxy_override=None, custom_browser_connection_url=None): super().__init__() from urllib.parse import urlparse @@ -141,3 +150,16 @@ class fetcher(Fetcher): # Run the selenium operations in a thread pool to avoid blocking the event loop loop = asyncio.get_event_loop() await loop.run_in_executor(None, _run_sync) + + +# Plugin registration for built-in fetcher +class WebDriverSeleniumFetcherPlugin: + """Plugin class that registers the WebDriver Selenium fetcher as a built-in plugin.""" + + def register_content_fetcher(self): + """Register the WebDriver Selenium fetcher""" + return ('html_webdriver', fetcher) + + +# Create module-level instance for plugin registration +webdriver_selenium_plugin = WebDriverSeleniumFetcherPlugin() diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index c831c71d..d1890d01 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -210,6 +210,55 @@ def _jinja2_filter_seconds_precise(timestamp): return format(int(time.time()-timestamp), ',d') +@app.template_filter('fetcher_status_icons') +def _jinja2_filter_fetcher_status_icons(fetcher_name): + """Get status icon HTML for a given fetcher. + + This filter checks both built-in fetchers and plugin fetchers for status icons. + + Args: + fetcher_name: The fetcher name (e.g., 'html_webdriver', 'html_js_zyte') + + Returns: + str: HTML string containing status icon elements + """ + from changedetectionio import content_fetchers + from changedetectionio.pluggy_interface import collect_fetcher_status_icons + from markupsafe import Markup + from flask import url_for + + icon_data = None + + # First check if it's a plugin fetcher (plugins have priority) + plugin_icon_data = collect_fetcher_status_icons(fetcher_name) + if plugin_icon_data: + icon_data = plugin_icon_data + # Check if it's a built-in fetcher + elif hasattr(content_fetchers, fetcher_name): + fetcher_class = getattr(content_fetchers, fetcher_name) + if hasattr(fetcher_class, 'get_status_icon_data'): + icon_data = fetcher_class.get_status_icon_data() + + # Build HTML from icon data + if icon_data and isinstance(icon_data, dict): + # Use 'group' from icon_data if specified, otherwise default to 'images' + group = icon_data.get('group', 'images') + + # Try to use url_for, but fall back to manual URL building if endpoint not registered yet + try: + icon_url = url_for('static_content', group=group, filename=icon_data['filename']) + except: + # Fallback: build URL manually respecting APPLICATION_ROOT + from flask import request + app_root = request.script_root if hasattr(request, 'script_root') else '' + icon_url = f"{app_root}/static/{group}/{icon_data['filename']}" + + style_attr = f' style="{icon_data["style"]}"' if icon_data.get('style') else '' + html = f'{icon_data[' + return Markup(html) + + return '' + # Import login_optionally_required from auth_decorator from changedetectionio.auth_decorator import login_optionally_required @@ -488,6 +537,31 @@ def changedetection_app(config=None, datastore_o=None): except FileNotFoundError: abort(404) + # Handle plugin group specially + if group == 'plugin': + # Serve files from plugin static directories + from changedetectionio.pluggy_interface import plugin_manager + import os as os_check + + for plugin_name, plugin_obj in plugin_manager.list_name_plugin(): + if hasattr(plugin_obj, 'plugin_static_path'): + try: + static_path = plugin_obj.plugin_static_path() + if static_path and os_check.path.isdir(static_path): + # Check if file exists in plugin's static directory + plugin_file_path = os_check.path.join(static_path, filename) + if os_check.path.isfile(plugin_file_path): + # Found the file in a plugin + response = make_response(send_from_directory(static_path, filename)) + response.headers['Cache-Control'] = 'max-age=3600, public' # Cache for 1 hour + return response + except Exception as e: + logger.debug(f"Error checking plugin {plugin_name} for static file: {e}") + pass + + # File not found in any plugin + abort(404) + # These files should be in our subdirectory try: return send_from_directory(f"static/{group}", path=filename) diff --git a/changedetectionio/pluggy_interface.py b/changedetectionio/pluggy_interface.py index fe2f7182..aa13b488 100644 --- a/changedetectionio/pluggy_interface.py +++ b/changedetectionio/pluggy_interface.py @@ -16,15 +16,49 @@ class ChangeDetectionSpec: @hookspec def ui_edit_stats_extras(watch): """Return HTML content to add to the stats tab in the edit view. - + Args: watch: The watch object being edited - + Returns: str: HTML content to be inserted in the stats tab """ pass + @hookspec + def register_content_fetcher(self): + """Return a tuple of (fetcher_name, fetcher_class) for content fetcher plugins. + + The fetcher_name should start with 'html_' and the fetcher_class + should inherit from changedetectionio.content_fetchers.base.Fetcher + + Returns: + tuple: (str: fetcher_name, class: fetcher_class) + """ + pass + + @hookspec + def fetcher_status_icon(fetcher_name): + """Return status icon HTML attributes for a content fetcher. + + Args: + fetcher_name: The name of the fetcher (e.g., 'html_webdriver', 'html_js_zyte') + + Returns: + str: HTML string containing tags or other status icon elements + Empty string if no custom status icon is needed + """ + pass + + @hookspec + def plugin_static_path(self): + """Return the path to the plugin's static files directory. + + Returns: + str: Absolute path to the plugin's static directory, or None if no static files + """ + pass + # Set up Plugin Manager plugin_manager = pluggy.PluginManager(PLUGIN_NAMESPACE) @@ -65,18 +99,60 @@ load_plugins_from_directories() # Discover installed plugins from external packages (if any) plugin_manager.load_setuptools_entrypoints(PLUGIN_NAMESPACE) +# Function to register built-in fetchers - called later from content_fetchers/__init__.py +def register_builtin_fetchers(): + """Register built-in content fetchers as internal plugins + + This is called from content_fetchers/__init__.py after all fetchers are imported + to avoid circular import issues. + """ + from changedetectionio.content_fetchers import requests, playwright, puppeteer, webdriver_selenium + + # Register each built-in fetcher plugin + if hasattr(requests, 'requests_plugin'): + plugin_manager.register(requests.requests_plugin, 'builtin_requests') + + if hasattr(playwright, 'playwright_plugin'): + plugin_manager.register(playwright.playwright_plugin, 'builtin_playwright') + + if hasattr(puppeteer, 'puppeteer_plugin'): + plugin_manager.register(puppeteer.puppeteer_plugin, 'builtin_puppeteer') + + if hasattr(webdriver_selenium, 'webdriver_selenium_plugin'): + plugin_manager.register(webdriver_selenium.webdriver_selenium_plugin, 'builtin_webdriver_selenium') + # Helper function to collect UI stats extras from all plugins def collect_ui_edit_stats_extras(watch): """Collect and combine HTML content from all plugins that implement ui_edit_stats_extras""" extras_content = [] - + # Get all plugins that implement the ui_edit_stats_extras hook results = plugin_manager.hook.ui_edit_stats_extras(watch=watch) - + # If we have results, add them to our content if results: for result in results: if result: # Skip empty results extras_content.append(result) - - return "\n".join(extras_content) if extras_content else "" \ No newline at end of file + + return "\n".join(extras_content) if extras_content else "" + +def collect_fetcher_status_icons(fetcher_name): + """Collect status icon data from all plugins + + Args: + fetcher_name: The name of the fetcher (e.g., 'html_webdriver', 'html_js_zyte') + + Returns: + dict or None: Icon data dictionary from first matching plugin, or None + """ + # Get status icon data from plugins + results = plugin_manager.hook.fetcher_status_icon(fetcher_name=fetcher_name) + + # Return first non-None result + if results: + for result in results: + if result and isinstance(result, dict): + return result + + return None \ No newline at end of file