Pluggable content fetchers

This commit is contained in:
dgtlmoon
2025-11-24 12:18:48 +01:00
parent d763bb4267
commit 652f939d6d
9 changed files with 302 additions and 13 deletions

View File

@@ -182,11 +182,9 @@ document.addEventListener('DOMContentLoaded', function() {
</div>
<div class="status-icons">
<a class="link-spread" href="{{url_for('ui.form_share_put_watch', uuid=watch.uuid)}}"><img src="{{url_for('static_content', group='images', filename='spread.svg')}}" class="status-icon icon icon-spread" title="Create a link to share watch config with others" ></a>
{%- if watch.get_fetch_backend == "html_webdriver"
or ( watch.get_fetch_backend == "system" and system_default_fetcher == 'html_webdriver' )
or "extra_browser_" in watch.get_fetch_backend
-%}
<img class="status-icon" src="{{url_for('static_content', group='images', filename='google-chrome-icon.png')}}" alt="Using a Chrome browser" title="Using a Chrome browser" >
{%- set effective_fetcher = watch.get_fetch_backend if watch.get_fetch_backend != "system" else system_default_fetcher -%}
{%- if effective_fetcher and ("html_webdriver" in effective_fetcher or "html_" in effective_fetcher or "extra_browser_" in effective_fetcher) -%}
{{ effective_fetcher|fetcher_status_icons }}
{%- endif -%}
{%- if watch.is_pdf -%}<img class="status-icon" src="{{url_for('static_content', group='images', filename='pdf-icon.svg')}}" alt="Converting PDF to text" >{%- endif -%}
{%- if watch.has_browser_steps -%}<img class="status-icon status-browsersteps" src="{{url_for('static_content', group='images', filename='steps.svg')}}" alt="Browser Steps is enabled" >{%- endif -%}

View File

@@ -7,6 +7,9 @@ import os
# Visual Selector scraper - 'Button' is there because some sites have <button>OUT OF STOCK</button>.
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,button'
# Import hookimpl from centralized pluggy interface
from changedetectionio.pluggy_interface import hookimpl
SCREENSHOT_MAX_HEIGHT_DEFAULT = 20000
SCREENSHOT_DEFAULT_QUALITY = 40
@@ -35,17 +38,54 @@ def available_fetchers():
# See the if statement at the bottom of this file for how we switch between playwright and webdriver
import inspect
p = []
# Get built-in fetchers (but skip plugin fetchers that were added via setattr)
for name, obj in inspect.getmembers(sys.modules[__name__], inspect.isclass):
if inspect.isclass(obj):
# @todo html_ is maybe better as fetcher_ or something
# In this case, make sure to edit the default one in store.py and fetch_site_status.py
if name.startswith('html_'):
# Skip plugin fetchers that were already registered
if name not in _plugin_fetchers:
t = tuple([name, obj.fetcher_description])
p.append(t)
# Get plugin fetchers from cache (already loaded at module init)
for name, fetcher_class in _plugin_fetchers.items():
if hasattr(fetcher_class, 'fetcher_description'):
t = tuple([name, fetcher_class.fetcher_description])
p.append(t)
else:
logger.warning(f"Plugin fetcher '{name}' does not have fetcher_description attribute")
return p
def get_plugin_fetchers():
"""Load and return all plugin fetchers from the centralized plugin manager."""
from changedetectionio.pluggy_interface import plugin_manager
fetchers = {}
try:
# Call the register_content_fetcher hook from all registered plugins
results = plugin_manager.hook.register_content_fetcher()
for result in results:
if result:
name, fetcher_class = result
fetchers[name] = fetcher_class
# Register in current module so hasattr() checks work
setattr(sys.modules[__name__], name, fetcher_class)
logger.info(f"Registered plugin fetcher: {name} - {getattr(fetcher_class, 'fetcher_description', 'No description')}")
except Exception as e:
logger.error(f"Error loading plugin fetchers: {e}")
return fetchers
# Initialize plugins at module load time
_plugin_fetchers = get_plugin_fetchers()
# Decide which is the 'real' HTML webdriver, this is more a system wide config
# rather than site-specific.
use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
@@ -62,3 +102,8 @@ else:
logger.debug("Falling back to selenium as fetcher")
from .webdriver_selenium import fetcher as html_webdriver
# Register built-in fetchers as plugins after all imports are complete
from changedetectionio.pluggy_interface import register_builtin_fetchers
register_builtin_fetchers()

View File

@@ -64,6 +64,24 @@ class Fetcher():
# Time ONTOP of the system defined env minimum time
render_extract_delay = 0
@classmethod
def get_status_icon_data(cls):
"""Return data for status icon to display in the watch overview.
This method can be overridden by subclasses to provide custom status icons.
Returns:
dict or None: Dictionary with icon data:
{
'filename': 'icon-name.svg', # Icon filename
'alt': 'Alt text', # Alt attribute
'title': 'Tooltip text', # Title attribute
'style': 'height: 1em;' # Optional inline CSS
}
Or None if no icon
"""
return None
def clear_content(self):
"""
Explicitly clear all content from memory to free up heap space.

View File

@@ -89,6 +89,15 @@ class fetcher(Fetcher):
proxy = None
@classmethod
def get_status_icon_data(cls):
"""Return Chrome browser icon data for Playwright fetcher."""
return {
'filename': 'google-chrome-icon.png',
'alt': 'Using a Chrome browser',
'title': 'Using a Chrome browser'
}
def __init__(self, proxy_override=None, custom_browser_connection_url=None):
super().__init__()
@@ -330,4 +339,17 @@ class fetcher(Fetcher):
browser = None
# Plugin registration for built-in fetcher
class PlaywrightFetcherPlugin:
"""Plugin class that registers the Playwright fetcher as a built-in plugin."""
def register_content_fetcher(self):
"""Register the Playwright fetcher"""
return ('html_webdriver', fetcher)
# Create module-level instance for plugin registration
playwright_plugin = PlaywrightFetcherPlugin()

View File

@@ -98,6 +98,15 @@ class fetcher(Fetcher):
proxy = None
@classmethod
def get_status_icon_data(cls):
"""Return Chrome browser icon data for Puppeteer fetcher."""
return {
'filename': 'google-chrome-icon.png',
'alt': 'Using a Chrome browser',
'title': 'Using a Chrome browser'
}
def __init__(self, proxy_override=None, custom_browser_connection_url=None):
super().__init__()
@@ -384,3 +393,16 @@ class fetcher(Fetcher):
)
except asyncio.TimeoutError:
raise (BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds."))
# Plugin registration for built-in fetcher
class PuppeteerFetcherPlugin:
"""Plugin class that registers the Puppeteer fetcher as a built-in plugin."""
def register_content_fetcher(self):
"""Register the Puppeteer fetcher"""
return ('html_webdriver', fetcher)
# Create module-level instance for plugin registration
puppeteer_plugin = PuppeteerFetcherPlugin()

View File

@@ -163,3 +163,15 @@ class fetcher(Fetcher):
except Exception as e:
logger.warning(f"Failed to unlink screenshot: {screenshot} - {e}")
# Plugin registration for built-in fetcher
class RequestsFetcherPlugin:
"""Plugin class that registers the requests fetcher as a built-in plugin."""
def register_content_fetcher(self):
"""Register the requests fetcher"""
return ('html_requests', fetcher)
# Create module-level instance for plugin registration
requests_plugin = RequestsFetcherPlugin()

View File

@@ -14,6 +14,15 @@ class fetcher(Fetcher):
proxy = None
proxy_url = None
@classmethod
def get_status_icon_data(cls):
"""Return Chrome browser icon data for WebDriver fetcher."""
return {
'filename': 'google-chrome-icon.png',
'alt': 'Using a Chrome browser',
'title': 'Using a Chrome browser'
}
def __init__(self, proxy_override=None, custom_browser_connection_url=None):
super().__init__()
from urllib.parse import urlparse
@@ -141,3 +150,16 @@ class fetcher(Fetcher):
# Run the selenium operations in a thread pool to avoid blocking the event loop
loop = asyncio.get_event_loop()
await loop.run_in_executor(None, _run_sync)
# Plugin registration for built-in fetcher
class WebDriverSeleniumFetcherPlugin:
"""Plugin class that registers the WebDriver Selenium fetcher as a built-in plugin."""
def register_content_fetcher(self):
"""Register the WebDriver Selenium fetcher"""
return ('html_webdriver', fetcher)
# Create module-level instance for plugin registration
webdriver_selenium_plugin = WebDriverSeleniumFetcherPlugin()

View File

@@ -210,6 +210,55 @@ def _jinja2_filter_seconds_precise(timestamp):
return format(int(time.time()-timestamp), ',d')
@app.template_filter('fetcher_status_icons')
def _jinja2_filter_fetcher_status_icons(fetcher_name):
"""Get status icon HTML for a given fetcher.
This filter checks both built-in fetchers and plugin fetchers for status icons.
Args:
fetcher_name: The fetcher name (e.g., 'html_webdriver', 'html_js_zyte')
Returns:
str: HTML string containing status icon elements
"""
from changedetectionio import content_fetchers
from changedetectionio.pluggy_interface import collect_fetcher_status_icons
from markupsafe import Markup
from flask import url_for
icon_data = None
# First check if it's a plugin fetcher (plugins have priority)
plugin_icon_data = collect_fetcher_status_icons(fetcher_name)
if plugin_icon_data:
icon_data = plugin_icon_data
# Check if it's a built-in fetcher
elif hasattr(content_fetchers, fetcher_name):
fetcher_class = getattr(content_fetchers, fetcher_name)
if hasattr(fetcher_class, 'get_status_icon_data'):
icon_data = fetcher_class.get_status_icon_data()
# Build HTML from icon data
if icon_data and isinstance(icon_data, dict):
# Use 'group' from icon_data if specified, otherwise default to 'images'
group = icon_data.get('group', 'images')
# Try to use url_for, but fall back to manual URL building if endpoint not registered yet
try:
icon_url = url_for('static_content', group=group, filename=icon_data['filename'])
except:
# Fallback: build URL manually respecting APPLICATION_ROOT
from flask import request
app_root = request.script_root if hasattr(request, 'script_root') else ''
icon_url = f"{app_root}/static/{group}/{icon_data['filename']}"
style_attr = f' style="{icon_data["style"]}"' if icon_data.get('style') else ''
html = f'<img class="status-icon" src="{icon_url}" alt="{icon_data["alt"]}" title="{icon_data["title"]}"{style_attr}>'
return Markup(html)
return ''
# Import login_optionally_required from auth_decorator
from changedetectionio.auth_decorator import login_optionally_required
@@ -488,6 +537,31 @@ def changedetection_app(config=None, datastore_o=None):
except FileNotFoundError:
abort(404)
# Handle plugin group specially
if group == 'plugin':
# Serve files from plugin static directories
from changedetectionio.pluggy_interface import plugin_manager
import os as os_check
for plugin_name, plugin_obj in plugin_manager.list_name_plugin():
if hasattr(plugin_obj, 'plugin_static_path'):
try:
static_path = plugin_obj.plugin_static_path()
if static_path and os_check.path.isdir(static_path):
# Check if file exists in plugin's static directory
plugin_file_path = os_check.path.join(static_path, filename)
if os_check.path.isfile(plugin_file_path):
# Found the file in a plugin
response = make_response(send_from_directory(static_path, filename))
response.headers['Cache-Control'] = 'max-age=3600, public' # Cache for 1 hour
return response
except Exception as e:
logger.debug(f"Error checking plugin {plugin_name} for static file: {e}")
pass
# File not found in any plugin
abort(404)
# These files should be in our subdirectory
try:
return send_from_directory(f"static/{group}", path=filename)

View File

@@ -25,6 +25,40 @@ class ChangeDetectionSpec:
"""
pass
@hookspec
def register_content_fetcher(self):
"""Return a tuple of (fetcher_name, fetcher_class) for content fetcher plugins.
The fetcher_name should start with 'html_' and the fetcher_class
should inherit from changedetectionio.content_fetchers.base.Fetcher
Returns:
tuple: (str: fetcher_name, class: fetcher_class)
"""
pass
@hookspec
def fetcher_status_icon(fetcher_name):
"""Return status icon HTML attributes for a content fetcher.
Args:
fetcher_name: The name of the fetcher (e.g., 'html_webdriver', 'html_js_zyte')
Returns:
str: HTML string containing <img> tags or other status icon elements
Empty string if no custom status icon is needed
"""
pass
@hookspec
def plugin_static_path(self):
"""Return the path to the plugin's static files directory.
Returns:
str: Absolute path to the plugin's static directory, or None if no static files
"""
pass
# Set up Plugin Manager
plugin_manager = pluggy.PluginManager(PLUGIN_NAMESPACE)
@@ -65,6 +99,28 @@ load_plugins_from_directories()
# Discover installed plugins from external packages (if any)
plugin_manager.load_setuptools_entrypoints(PLUGIN_NAMESPACE)
# Function to register built-in fetchers - called later from content_fetchers/__init__.py
def register_builtin_fetchers():
"""Register built-in content fetchers as internal plugins
This is called from content_fetchers/__init__.py after all fetchers are imported
to avoid circular import issues.
"""
from changedetectionio.content_fetchers import requests, playwright, puppeteer, webdriver_selenium
# Register each built-in fetcher plugin
if hasattr(requests, 'requests_plugin'):
plugin_manager.register(requests.requests_plugin, 'builtin_requests')
if hasattr(playwright, 'playwright_plugin'):
plugin_manager.register(playwright.playwright_plugin, 'builtin_playwright')
if hasattr(puppeteer, 'puppeteer_plugin'):
plugin_manager.register(puppeteer.puppeteer_plugin, 'builtin_puppeteer')
if hasattr(webdriver_selenium, 'webdriver_selenium_plugin'):
plugin_manager.register(webdriver_selenium.webdriver_selenium_plugin, 'builtin_webdriver_selenium')
# Helper function to collect UI stats extras from all plugins
def collect_ui_edit_stats_extras(watch):
"""Collect and combine HTML content from all plugins that implement ui_edit_stats_extras"""
@@ -80,3 +136,23 @@ def collect_ui_edit_stats_extras(watch):
extras_content.append(result)
return "\n".join(extras_content) if extras_content else ""
def collect_fetcher_status_icons(fetcher_name):
"""Collect status icon data from all plugins
Args:
fetcher_name: The name of the fetcher (e.g., 'html_webdriver', 'html_js_zyte')
Returns:
dict or None: Icon data dictionary from first matching plugin, or None
"""
# Get status icon data from plugins
results = plugin_manager.hook.fetcher_status_icon(fetcher_name=fetcher_name)
# Return first non-None result
if results:
for result in results:
if result and isinstance(result, dict):
return result
return None