Pluggable content fetchers

2025-12-14 12:06:55 +00:00 · 2025-11-24 12:18:48 +01:00
parent d763bb4267
commit 652f939d6d
9 changed files with 302 additions and 13 deletions
--- a/changedetectionio/blueprint/watchlist/templates/watch-overview.html
+++ b/changedetectionio/blueprint/watchlist/templates/watch-overview.html
@@ -182,11 +182,9 @@ document.addEventListener('DOMContentLoaded', function() {
                        </div>
                    <div class="status-icons">
                            <a class="link-spread" href="{{url_for('ui.form_share_put_watch', uuid=watch.uuid)}}"><img src="{{url_for('static_content', group='images', filename='spread.svg')}}" class="status-icon icon icon-spread" title="Create a link to share watch config with others" ></a>
-                            {%- if watch.get_fetch_backend == "html_webdriver"
-                                 or ( watch.get_fetch_backend == "system" and system_default_fetcher == 'html_webdriver'  )
-                                 or "extra_browser_" in watch.get_fetch_backend
-                            -%}
-                            <img class="status-icon" src="{{url_for('static_content', group='images', filename='google-chrome-icon.png')}}" alt="Using a Chrome browser" title="Using a Chrome browser" >
+                            {%- set effective_fetcher = watch.get_fetch_backend if watch.get_fetch_backend != "system" else system_default_fetcher -%}
+                            {#- "html_" also matches "html_webdriver", so no separate check for it is needed -#}
+                            {%- if effective_fetcher and ("html_" in effective_fetcher or "extra_browser_" in effective_fetcher) -%}
+                                {{ effective_fetcher|fetcher_status_icons }}
                             {%- endif -%}
                            {%- if watch.is_pdf  -%}<img class="status-icon" src="{{url_for('static_content', group='images', filename='pdf-icon.svg')}}" alt="Converting PDF to text" >{%- endif -%}
                            {%- if watch.has_browser_steps -%}<img class="status-icon status-browsersteps" src="{{url_for('static_content', group='images', filename='steps.svg')}}" alt="Browser Steps is enabled" >{%- endif -%}
--- a/changedetectionio/content_fetchers/__init__.py
+++ b/changedetectionio/content_fetchers/__init__.py
@@ -7,6 +7,9 @@ import os
 # Visual Selector scraper - 'Button' is there because some sites have <button>OUT OF STOCK</button>.
 visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,button'

+# Import hookimpl from centralized pluggy interface
+from changedetectionio.pluggy_interface import hookimpl
+
 SCREENSHOT_MAX_HEIGHT_DEFAULT = 20000
 SCREENSHOT_DEFAULT_QUALITY = 40

@@ -35,17 +38,54 @@ def available_fetchers():
    # See the if statement at the bottom of this file for how we switch between playwright and webdriver
    import inspect
    p = []
+
+    # Get built-in fetchers (but skip plugin fetchers that were added via setattr)
    for name, obj in inspect.getmembers(sys.modules[__name__], inspect.isclass):
        if inspect.isclass(obj):
            # @todo html_ is maybe better as fetcher_ or something
            # In this case, make sure to edit the default one in store.py and fetch_site_status.py
            if name.startswith('html_'):
+                # Skip plugin fetchers that were already registered
+                if name not in _plugin_fetchers:
                    t = tuple([name, obj.fetcher_description])
                    p.append(t)

+    # Get plugin fetchers from cache (already loaded at module init)
+    for name, fetcher_class in _plugin_fetchers.items():
+        if hasattr(fetcher_class, 'fetcher_description'):
+            t = tuple([name, fetcher_class.fetcher_description])
+            p.append(t)
+        else:
+            logger.warning(f"Plugin fetcher '{name}' does not have fetcher_description attribute")
+
    return p


+def get_plugin_fetchers():
+    """Collect content fetchers contributed through the register_content_fetcher hook.
+
+    Every truthy hook result is unpacked as (name, fetcher_class). Each fetcher
+    is also set as an attribute on this module so that lookups by name on the
+    module (hasattr / getattr) find it.
+
+    Failures are logged instead of raised. A result that cannot be unpacked or
+    registered is skipped on its own, so one faulty plugin does not prevent the
+    remaining fetchers from being registered.
+
+    Returns:
+        dict: Mapping of fetcher name to fetcher class; empty when nothing was
+        registered or when the hook call itself failed.
+    """
+    from changedetectionio.pluggy_interface import plugin_manager
+
+    fetchers = {}
+    try:
+        # Call the register_content_fetcher hook from all registered plugins
+        results = plugin_manager.hook.register_content_fetcher()
+    except Exception as e:
+        logger.error(f"Error loading plugin fetchers: {e}")
+        return fetchers
+
+    for result in results:
+        if not result:
+            continue
+        try:
+            name, fetcher_class = result
+            # Register in current module so hasattr() checks work
+            setattr(sys.modules[__name__], name, fetcher_class)
+        except Exception as e:
+            # Wrong shape or unusable name: drop this result only, keep the others
+            logger.error(f"Error loading plugin fetchers: {e}")
+            continue
+        fetchers[name] = fetcher_class
+        logger.info(f"Registered plugin fetcher: {name} - {getattr(fetcher_class, 'fetcher_description', 'No description')}")
+
+    return fetchers
+
+
+# Initialize plugins at module load time
+# available_fetchers() reads this cached mapping instead of calling the plugin hook again.
+_plugin_fetchers = get_plugin_fetchers()
+
+
 # Decide which is the 'real' HTML webdriver, this is more a system wide config
 # rather than site-specific.
 use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
@@ -62,3 +102,8 @@ else:
    logger.debug("Falling back to selenium as fetcher")
    from .webdriver_selenium import fetcher as html_webdriver

+
+# Register built-in fetchers as plugins after all imports are complete
+# (register_builtin_fetchers() itself imports the fetcher submodules of this
+# package, so it is imported and called only at the end of the module body)
+from changedetectionio.pluggy_interface import register_builtin_fetchers
+register_builtin_fetchers()
+
--- a/changedetectionio/content_fetchers/base.py
+++ b/changedetectionio/content_fetchers/base.py
@@ -64,6 +64,24 @@ class Fetcher():
    # Time ONTOP of the system defined env minimum time
    render_extract_delay = 0

+    @classmethod
+    def get_status_icon_data(cls):
+        """Return data for status icon to display in the watch overview.
+
+        This method can be overridden by subclasses to provide custom status icons.
+        The base implementation provides no icon.
+
+        Returns:
+            dict or None: Dictionary with icon data:
+                {
+                    'filename': 'icon-name.svg',  # Icon filename
+                    'alt': 'Alt text',            # Alt attribute
+                    'title': 'Tooltip text',      # Title attribute
+                    'style': 'height: 1em;',      # Optional inline CSS
+                    'group': 'images'             # Optional static file group; the
+                                                  # 'fetcher_status_icons' template
+                                                  # filter falls back to 'images'
+                }
+                Or None if no icon
+        """
+        return None
+
    def clear_content(self):
        """
        Explicitly clear all content from memory to free up heap space.
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@@ -89,6 +89,15 @@ class fetcher(Fetcher):

    proxy = None

+    @classmethod
+    def get_status_icon_data(cls):
+        """Describe the status icon shown for watches using the Playwright fetcher.
+
+        Returns:
+            dict: 'filename', 'alt' and 'title' of the Chrome browser icon.
+        """
+        chrome_label = 'Using a Chrome browser'
+        return dict(
+            filename='google-chrome-icon.png',
+            alt=chrome_label,
+            title=chrome_label,
+        )
+
    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        super().__init__()

@@ -330,4 +339,17 @@ class fetcher(Fetcher):
                browser = None


+# Plugin registration for built-in fetcher
+class PlaywrightFetcherPlugin:
+    """Plugin class that registers the Playwright fetcher as a built-in plugin."""
+
+    # NOTE(review): no hookimpl marker is applied to this method in the visible
+    # code; pluggy normally only collects methods carrying the project's
+    # hookimpl marker - confirm this hook is actually picked up.
+    def register_content_fetcher(self):
+        """Register the Playwright fetcher
+
+        Returns:
+            tuple: ('html_webdriver', fetcher) - the fetcher name and the fetcher class.
+        """
+        return ('html_webdriver', fetcher)
+
+
+# Create module-level instance for plugin registration
+playwright_plugin = PlaywrightFetcherPlugin()
+
+

--- a/changedetectionio/content_fetchers/puppeteer.py
+++ b/changedetectionio/content_fetchers/puppeteer.py
@@ -98,6 +98,15 @@ class fetcher(Fetcher):

    proxy = None

+    @classmethod
+    def get_status_icon_data(cls):
+        """Describe the status icon shown for watches using the Puppeteer fetcher.
+
+        Returns:
+            dict: 'filename', 'alt' and 'title' of the Chrome browser icon.
+        """
+        chrome_label = 'Using a Chrome browser'
+        return dict(
+            filename='google-chrome-icon.png',
+            alt=chrome_label,
+            title=chrome_label,
+        )
+
    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        super().__init__()

@@ -384,3 +393,16 @@ class fetcher(Fetcher):
            )
        except asyncio.TimeoutError:
            raise (BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
+
+
+# Plugin registration for built-in fetcher
+class PuppeteerFetcherPlugin:
+    """Plugin class that registers the Puppeteer fetcher as a built-in plugin."""
+
+    # NOTE(review): no hookimpl marker is applied to this method in the visible
+    # code; pluggy normally only collects methods carrying the project's
+    # hookimpl marker - confirm this hook is actually picked up.
+    def register_content_fetcher(self):
+        """Register the Puppeteer fetcher
+
+        Returns:
+            tuple: ('html_webdriver', fetcher) - the fetcher name and the fetcher class.
+        """
+        return ('html_webdriver', fetcher)
+
+
+# Create module-level instance for plugin registration
+puppeteer_plugin = PuppeteerFetcherPlugin()
--- a/changedetectionio/content_fetchers/requests.py
+++ b/changedetectionio/content_fetchers/requests.py
@@ -163,3 +163,15 @@ class fetcher(Fetcher):
                except Exception as e:
                    logger.warning(f"Failed to unlink screenshot: {screenshot} - {e}")

+
+# Plugin registration for built-in fetcher
+class RequestsFetcherPlugin:
+    """Plugin class that registers the requests fetcher as a built-in plugin."""
+
+    # NOTE(review): no hookimpl marker is applied to this method in the visible
+    # code; pluggy normally only collects methods carrying the project's
+    # hookimpl marker - confirm this hook is actually picked up.
+    def register_content_fetcher(self):
+        """Register the requests fetcher
+
+        Returns:
+            tuple: ('html_requests', fetcher) - the fetcher name and the fetcher class.
+        """
+        return ('html_requests', fetcher)
+
+
+# Create module-level instance for plugin registration
+requests_plugin = RequestsFetcherPlugin()
--- a/changedetectionio/content_fetchers/webdriver_selenium.py
+++ b/changedetectionio/content_fetchers/webdriver_selenium.py
@@ -14,6 +14,15 @@ class fetcher(Fetcher):
    proxy = None
    proxy_url = None

+    @classmethod
+    def get_status_icon_data(cls):
+        """Describe the status icon shown for watches using the WebDriver fetcher.
+
+        Returns:
+            dict: 'filename', 'alt' and 'title' of the Chrome browser icon.
+        """
+        chrome_label = 'Using a Chrome browser'
+        return dict(
+            filename='google-chrome-icon.png',
+            alt=chrome_label,
+            title=chrome_label,
+        )
+
    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        super().__init__()
        from urllib.parse import urlparse
@@ -141,3 +150,16 @@ class fetcher(Fetcher):
        # Run the selenium operations in a thread pool to avoid blocking the event loop
        loop = asyncio.get_event_loop()
        await loop.run_in_executor(None, _run_sync)
+
+
+# Plugin registration for built-in fetcher
+class WebDriverSeleniumFetcherPlugin:
+    """Plugin class that registers the WebDriver Selenium fetcher as a built-in plugin."""
+
+    # NOTE(review): no hookimpl marker is applied to this method in the visible
+    # code; pluggy normally only collects methods carrying the project's
+    # hookimpl marker - confirm this hook is actually picked up.
+    def register_content_fetcher(self):
+        """Register the WebDriver Selenium fetcher
+
+        Returns:
+            tuple: ('html_webdriver', fetcher) - the fetcher name and the fetcher class.
+        """
+        return ('html_webdriver', fetcher)
+
+
+# Create module-level instance for plugin registration
+webdriver_selenium_plugin = WebDriverSeleniumFetcherPlugin()
--- a/changedetectionio/flask_app.py
+++ b/changedetectionio/flask_app.py
@@ -210,6 +210,55 @@ def _jinja2_filter_seconds_precise(timestamp):

    return format(int(time.time()-timestamp), ',d')

+@app.template_filter('fetcher_status_icons')
+def _jinja2_filter_fetcher_status_icons(fetcher_name):
+    """Get status icon HTML for a given fetcher.
+
+    Icon data is looked up in this order:
+      1. plugins, via collect_fetcher_status_icons() (plugins have priority),
+      2. get_status_icon_data() of the content_fetchers attribute named
+         fetcher_name,
+      3. for "extra_browser_*" names with no such attribute, the
+         'html_webdriver' fetcher.
+
+    All icon values are HTML-escaped before being placed into the tag, because
+    they can come from plugins.
+
+    Args:
+        fetcher_name: The fetcher name (e.g., 'html_webdriver', 'html_js_zyte')
+
+    Returns:
+        Markup or str: An <img class="status-icon"> element, or '' when no
+        icon data (or no 'filename' in it) is available.
+    """
+    from changedetectionio import content_fetchers
+    from changedetectionio.pluggy_interface import collect_fetcher_status_icons
+    from markupsafe import Markup
+    from flask import url_for
+
+    # First check if a plugin provides the icon (plugins have priority)
+    icon_data = collect_fetcher_status_icons(fetcher_name)
+    if not icon_data:
+        fetcher_class = getattr(content_fetchers, fetcher_name, None)
+        if fetcher_class is None and 'extra_browser_' in fetcher_name:
+            # The template code this filter replaced showed the Chrome icon for
+            # every "extra_browser_*" backend.
+            # NOTE(review): assumes these backends are driven by the
+            # html_webdriver fetcher - confirm.
+            fetcher_class = getattr(content_fetchers, 'html_webdriver', None)
+        if hasattr(fetcher_class, 'get_status_icon_data'):
+            icon_data = fetcher_class.get_status_icon_data()
+
+    # Without a filename there is nothing to render; do not fail the page render
+    if not isinstance(icon_data, dict) or not icon_data.get('filename'):
+        return ''
+
+    # Use 'group' from icon_data if specified, otherwise default to 'images'
+    group = icon_data.get('group', 'images')
+    filename = icon_data['filename']
+
+    # Try to use url_for, but fall back to manual URL building if endpoint not registered yet
+    try:
+        icon_url = url_for('static_content', group=group, filename=filename)
+    except Exception:
+        # Fallback: build URL manually respecting APPLICATION_ROOT
+        from flask import request
+        app_root = request.script_root if hasattr(request, 'script_root') else ''
+        icon_url = f"{app_root}/static/{group}/{filename}"
+
+    # Markup.format() escapes its arguments, so attribute values cannot break out of the tag
+    html = Markup('<img class="status-icon" src="{}" alt="{}" title="{}"').format(
+        icon_url,
+        icon_data.get('alt') or '',
+        icon_data.get('title') or '',
+    )
+    if icon_data.get('style'):
+        html += Markup(' style="{}"').format(icon_data['style'])
+
+    return html + Markup('>')
+
 # Import login_optionally_required from auth_decorator
 from changedetectionio.auth_decorator import login_optionally_required

@@ -488,6 +537,31 @@ def changedetection_app(config=None, datastore_o=None):
            except FileNotFoundError:
                abort(404)

+        # Handle plugin group specially
+        if group == 'plugin':
+            # Serve files from plugin static directories
+            from changedetectionio.pluggy_interface import plugin_manager
+            import os as os_check
+
+            for plugin_name, plugin_obj in plugin_manager.list_name_plugin():
+                if hasattr(plugin_obj, 'plugin_static_path'):
+                    try:
+                        static_path = plugin_obj.plugin_static_path()
+                        if static_path and os_check.path.isdir(static_path):
+                            # Check if file exists in plugin's static directory
+                            plugin_file_path = os_check.path.join(static_path, filename)
+                            if os_check.path.isfile(plugin_file_path):
+                                # Found the file in a plugin
+                                response = make_response(send_from_directory(static_path, filename))
+                                response.headers['Cache-Control'] = 'max-age=3600, public'  # Cache for 1 hour
+                                return response
+                    except Exception as e:
+                        logger.debug(f"Error checking plugin {plugin_name} for static file: {e}")
+                        pass
+
+            # File not found in any plugin
+            abort(404)
+
        # These files should be in our subdirectory
        try:
            return send_from_directory(f"static/{group}", path=filename)
--- a/changedetectionio/pluggy_interface.py
+++ b/changedetectionio/pluggy_interface.py
@@ -25,6 +25,40 @@ class ChangeDetectionSpec:
        """
        pass

+    @hookspec
+    def register_content_fetcher(self):
+        """Return a tuple of (fetcher_name, fetcher_class) for content fetcher plugins.
+
+        The fetcher_name should start with 'html_' and the fetcher_class
+        should inherit from changedetectionio.content_fetchers.base.Fetcher
+
+        Returns:
+            tuple: (str: fetcher_name, class: fetcher_class); a falsy return
+            value is ignored by content_fetchers.get_plugin_fetchers()
+        """
+        pass
+
+    @hookspec
+    def fetcher_status_icon(self, fetcher_name):
+        """Return status icon data for a content fetcher.
+
+        Args:
+            fetcher_name: The name of the fetcher (e.g., 'html_webdriver', 'html_js_zyte')
+
+        Returns:
+            dict or None: Icon data with the keys 'filename', 'alt' and 'title'
+                and optionally 'style' and 'group', as read by the
+                'fetcher_status_icons' template filter.
+                collect_fetcher_status_icons() uses the first non-empty dict
+                and ignores every other return value (including HTML
+                strings), so return None when the plugin has no icon for
+                this fetcher.
+        """
+        pass
+
+    @hookspec
+    def plugin_static_path(self):
+        """Return the path to the plugin's static files directory.
+
+        The static content handler in flask_app.py calls this on each registered
+        plugin object for requests in the 'plugin' group and serves the
+        requested file from the first plugin directory that contains it.
+
+        Returns:
+            str: Absolute path to the plugin's static directory, or None if no static files
+        """
+        pass
+

 # Set up Plugin Manager
 plugin_manager = pluggy.PluginManager(PLUGIN_NAMESPACE)
@@ -65,6 +99,28 @@ load_plugins_from_directories()
 # Discover installed plugins from external packages (if any)
 plugin_manager.load_setuptools_entrypoints(PLUGIN_NAMESPACE)

+# Function to register built-in fetchers - called later from content_fetchers/__init__.py
+def register_builtin_fetchers():
+    """Register the plugin objects of the built-in content fetcher modules.
+
+    Invoked from content_fetchers/__init__.py once its fetchers are imported;
+    the import below stays inside the function to avoid circular import issues.
+    A module that does not expose its plugin object is skipped.
+    """
+    from changedetectionio.content_fetchers import requests, playwright, puppeteer, webdriver_selenium
+
+    # (module, attribute holding the plugin instance, name it is registered under)
+    builtin_plugins = (
+        (requests, 'requests_plugin', 'builtin_requests'),
+        (playwright, 'playwright_plugin', 'builtin_playwright'),
+        (puppeteer, 'puppeteer_plugin', 'builtin_puppeteer'),
+        (webdriver_selenium, 'webdriver_selenium_plugin', 'builtin_webdriver_selenium'),
+    )
+
+    for fetcher_module, plugin_attr, registered_name in builtin_plugins:
+        if hasattr(fetcher_module, plugin_attr):
+            plugin_manager.register(getattr(fetcher_module, plugin_attr), registered_name)
+
 # Helper function to collect UI stats extras from all plugins
 def collect_ui_edit_stats_extras(watch):
    """Collect and combine HTML content from all plugins that implement ui_edit_stats_extras"""
@@ -80,3 +136,23 @@ def collect_ui_edit_stats_extras(watch):
                extras_content.append(result)

    return "\n".join(extras_content) if extras_content else ""
+
+def collect_fetcher_status_icons(fetcher_name):
+    """Ask every plugin for status icon data and pick the first usable answer.
+
+    Args:
+        fetcher_name: The name of the fetcher (e.g., 'html_webdriver', 'html_js_zyte')
+
+    Returns:
+        dict or None: The first hook result that is a non-empty dict, or None
+        when no plugin returned one
+    """
+    hook_results = plugin_manager.hook.fetcher_status_icon(fetcher_name=fetcher_name)
+    if not hook_results:
+        return None
+
+    # Only a non-empty dict counts as icon data; anything else is passed over
+    return next(
+        (icon for icon in hook_results if icon and isinstance(icon, dict)),
+        None,
+    )