Compare commits

...

2 Commits

Author SHA1 Message Date
dgtlmoon
141aea07b8 JSONP - Attempt to strip out JSONP 2026-03-15 21:53:46 +01:00
dgtlmoon
5a4266069b Content Fetchers / Browsers - Improvements for pluggable extra fetchers/browsers. (#3981)
Some checks failed
Build and push containers / metadata (push) Has been cancelled
Build and push containers / build-push-containers (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled
ChangeDetection.io App Test / lint-code (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-14 (push) Has been cancelled
2026-03-15 17:35:46 +01:00
8 changed files with 169 additions and 64 deletions

View File

@@ -102,6 +102,35 @@ def run_async_in_browser_loop(coro):
else:
raise RuntimeError("Browser steps event loop is not available")
async def _close_session_resources(session_data, label=''):
    """Close all browser resources for a session in the correct order.

    Each step is attempted independently: a failure is logged and the
    remaining steps still run, so one broken handle cannot leak the others.

    browserstepper.cleanup() closes page+context but not the browser itself.
    For CloakBrowser, browser.close() is what stops the local Chromium process via pw.stop().
    For the default CDP path, playwright_context.stop() shuts down the playwright instance.

    Args:
        session_data: Mapping that may hold 'browserstepper', 'browser' and
            'playwright_context' entries; missing or falsy entries are skipped.
        label: Optional suffix appended to log messages to identify the
            session or watch being cleaned up.
    """
    browserstepper = session_data.get('browserstepper')
    if browserstepper:
        try:
            await browserstepper.cleanup()
        except Exception as e:
            # repr() rather than str(): some exceptions (timeouts in particular)
            # carry an empty message, which would leave the log line blank.
            logger.error(f"Error cleaning up browserstepper{label}: {e!r}")

    browser = session_data.get('browser')
    if browser:
        try:
            # Bounded wait so a browser that never answers cannot stall cleanup
            await asyncio.wait_for(browser.close(), timeout=5.0)
        except Exception as e:
            logger.warning(f"Error closing browser{label}: {e!r}")

    playwright_context = session_data.get('playwright_context')
    if playwright_context:
        try:
            await playwright_context.stop()
        except Exception as e:
            logger.warning(f"Error stopping playwright context{label}: {e!r}")
def cleanup_expired_sessions():
"""Remove expired browsersteps sessions and cleanup their resources"""
global browsersteps_sessions, browsersteps_watch_to_session
@@ -119,13 +148,10 @@ def cleanup_expired_sessions():
logger.debug(f"Cleaning up expired browsersteps session {session_id}")
session_data = browsersteps_sessions[session_id]
# Cleanup playwright resources asynchronously
browserstepper = session_data.get('browserstepper')
if browserstepper:
try:
run_async_in_browser_loop(browserstepper.cleanup())
except Exception as e:
logger.error(f"Error cleaning up session {session_id}: {e}")
try:
run_async_in_browser_loop(_close_session_resources(session_data, label=f" for session {session_id}"))
except Exception as e:
logger.error(f"Error cleaning up session {session_id}: {e}")
# Remove from sessions dict
del browsersteps_sessions[session_id]
@@ -152,12 +178,10 @@ def cleanup_session_for_watch(watch_uuid):
session_data = browsersteps_sessions.get(session_id)
if session_data:
browserstepper = session_data.get('browserstepper')
if browserstepper:
try:
run_async_in_browser_loop(browserstepper.cleanup())
except Exception as e:
logger.error(f"Error cleaning up session {session_id} for watch {watch_uuid}: {e}")
try:
run_async_in_browser_loop(_close_session_resources(session_data, label=f" for watch {watch_uuid}"))
except Exception as e:
logger.error(f"Error cleaning up session {session_id} for watch {watch_uuid}: {e}")
# Remove from sessions dict
del browsersteps_sessions[session_id]
@@ -178,59 +202,69 @@ def construct_blueprint(datastore: ChangeDetectionStore):
import time
from playwright.async_api import async_playwright
# We keep the playwright session open for many minutes
keepalive_seconds = int(os.getenv('BROWSERSTEPS_MINUTES_KEEPALIVE', 10)) * 60
keepalive_ms = ((keepalive_seconds + 3) * 1000)
browsersteps_start_session = {'start_time': time.time()}
# Create a new async playwright instance for browser steps
playwright_instance = async_playwright()
playwright_context = await playwright_instance.start()
keepalive_ms = ((keepalive_seconds + 3) * 1000)
base_url = os.getenv('PLAYWRIGHT_DRIVER_URL', '').strip('"')
a = "?" if not '?' in base_url else '&'
base_url += a + f"timeout={keepalive_ms}"
browser = await playwright_context.chromium.connect_over_cdp(base_url, timeout=keepalive_ms)
browsersteps_start_session['browser'] = browser
browsersteps_start_session['playwright_context'] = playwright_context
# Build proxy dict first — needed by both the CDP path and fetcher-specific launchers
proxy_id = datastore.get_preferred_proxy_for_watch(uuid=watch_uuid)
proxy = None
if proxy_id:
proxy_url = datastore.proxy_list.get(proxy_id).get('url')
proxy_url = datastore.proxy_list.get(proxy_id, {}).get('url')
if proxy_url:
# Playwright needs separate username and password values
from urllib.parse import urlparse
parsed = urlparse(proxy_url)
proxy = {'server': proxy_url}
if parsed.username:
proxy['username'] = parsed.username
if parsed.password:
proxy['password'] = parsed.password
logger.debug(f"Browser Steps: UUID {watch_uuid} selected proxy {proxy_url}")
# Tell Playwright to connect to Chrome and setup a new session via our stepper interface
# Resolve the fetcher class for this watch so we can ask it to launch its own browser
# if it supports that (e.g. CloakBrowser, which runs locally rather than via CDP)
watch = datastore.data['watching'][watch_uuid]
from changedetectionio import content_fetchers
fetcher_name = watch.get_fetch_backend or 'system'
if fetcher_name == 'system':
fetcher_name = datastore.data['settings']['application'].get('fetch_backend', 'html_requests')
fetcher_class = getattr(content_fetchers, fetcher_name, None)
browser = None
playwright_context = None
# If the fetcher has its own browser launch for the live steps UI, use it.
# get_browsersteps_browser(proxy, keepalive_ms) returns (browser, playwright_context_or_None)
# or None to fall back to the default CDP path.
if fetcher_class and hasattr(fetcher_class, 'get_browsersteps_browser'):
result = await fetcher_class.get_browsersteps_browser(proxy=proxy, keepalive_ms=keepalive_ms)
if result is not None:
browser, playwright_context = result
logger.debug(f"Browser Steps: using fetcher-specific browser for '{fetcher_name}'")
# Default: connect to the remote Playwright/sockpuppetbrowser via CDP
if browser is None:
playwright_instance = async_playwright()
playwright_context = await playwright_instance.start()
base_url = os.getenv('PLAYWRIGHT_DRIVER_URL', '').strip('"')
a = "?" if '?' not in base_url else '&'
base_url += a + f"timeout={keepalive_ms}"
browser = await playwright_context.chromium.connect_over_cdp(base_url, timeout=keepalive_ms)
logger.debug(f"Browser Steps: using CDP connection to {base_url}")
browsersteps_start_session['browser'] = browser
browsersteps_start_session['playwright_context'] = playwright_context
browserstepper = browser_steps.browsersteps_live_ui(
playwright_browser=browser,
proxy=proxy,
start_url=datastore.data['watching'][watch_uuid].link,
headers=datastore.data['watching'][watch_uuid].get('headers')
start_url=watch.link,
headers=watch.get('headers')
)
# Initialize the async connection
await browserstepper.connect(proxy=proxy)
browsersteps_start_session['browserstepper'] = browserstepper
# For test
#await browsersteps_start_session['browserstepper'].action_goto_url(value="http://example.com?time="+str(time.time()))
return browsersteps_start_session

View File

@@ -60,12 +60,8 @@ def construct_blueprint(datastore: ChangeDetectionStore):
versions = []
timestamp = None
system_uses_webdriver = datastore.data['settings']['application']['fetch_backend'] == 'html_webdriver'
extra_stylesheets = [url_for('static_content', group='styles', filename='diff.css')]
is_html_webdriver = False
if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'):
is_html_webdriver = True
is_html_webdriver = watch.fetcher_supports_screenshots
triggered_line_numbers = []
ignored_line_numbers = []

View File

@@ -487,13 +487,25 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
except json.JSONDecodeError as e:
logger.warning(f"Error processing JSON {content[:20]}...{str(e)})")
else:
# Probably something else, go fish inside for it
try:
stripped_text_from_html = extract_json_blob_from_html(content=content,
ensure_is_ldjson_info_type=ensure_is_ldjson_info_type,
json_filter=json_filter )
except json.JSONDecodeError as e:
logger.warning(f"Error processing JSON while extracting JSON from HTML blob {content[:20]}...{str(e)})")
# Check for JSONP wrapper: someCallback({...}) or some.namespace({...})
# Server may claim application/json but actually return JSONP
jsonp_match = re.match(r'^\w[\w.]*\s*\((.+)\)\s*;?\s*$', content.lstrip("\ufeff").strip(), re.DOTALL)
if jsonp_match:
try:
inner = jsonp_match.group(1).strip()
logger.warning(f"Content looks like JSONP, attempting to extract inner JSON for filter '{json_filter}'")
stripped_text_from_html = _parse_json(json.loads(inner), json_filter)
except json.JSONDecodeError as e:
logger.warning(f"Error processing JSONP inner content {content[:20]}...{str(e)})")
if not stripped_text_from_html:
# Probably something else, go fish inside for it
try:
stripped_text_from_html = extract_json_blob_from_html(content=content,
ensure_is_ldjson_info_type=ensure_is_ldjson_info_type,
json_filter=json_filter)
except json.JSONDecodeError as e:
logger.warning(f"Error processing JSON while extracting JSON from HTML blob {content[:20]}...{str(e)})")
if not stripped_text_from_html:
# Re 265 - Just return an empty string when filter not found

View File

@@ -388,6 +388,25 @@ class model(EntityPersistenceMixin, watch_base):
return self.get('fetch_backend')
@property
def fetcher_supports_screenshots(self):
    """Return True if the fetcher configured for this watch supports screenshots.

    Resolves 'system' via self._datastore, then checks supports_screenshots on
    the actual fetcher class. Works for built-in and plugin fetchers alike.

    Backend names beginning with 'extra_browser_' that do not resolve to an
    attribute of content_fetchers are reported as screenshot-capable, which
    keeps the behaviour of the inline checks this property replaces.
    NOTE(review): assumes such names are not exposed on content_fetchers - confirm.
    """
    from changedetectionio import content_fetchers

    fetcher_name = self.get_fetch_backend  # already handles is_pdf → html_requests
    if not fetcher_name or fetcher_name == 'system':
        fetcher_name = self._datastore['settings']['application'].get('fetch_backend', 'html_requests')

    fetcher_class = getattr(content_fetchers, fetcher_name, None)
    if fetcher_class is None:
        # No class registered under this name: fall back to the legacy prefix rule
        return fetcher_name.startswith('extra_browser_')

    return bool(getattr(fetcher_class, 'supports_screenshots', False))
@property
def is_pdf(self):
url = str(self.get("url") or "").lower()

View File

@@ -42,10 +42,7 @@ def render_form(watch, datastore, request, url_for, render_template, flash, redi
# Get error information for the template
screenshot_url = watch.get_screenshot()
system_uses_webdriver = datastore.data['settings']['application']['fetch_backend'] == 'html_webdriver'
is_html_webdriver = False
if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'):
is_html_webdriver = True
is_html_webdriver = watch.fetcher_supports_screenshots
password_enabled_and_share_is_off = False
if datastore.data['settings']['application'].get('password') or os.getenv("SALTED_PASS", False):

View File

@@ -100,7 +100,13 @@ class guess_stream_type():
if any(s in http_content_header for s in RSS_XML_CONTENT_TYPES):
self.is_rss = True
elif any(s in http_content_header for s in JSON_CONTENT_TYPES):
self.is_json = True
# JSONP detection: server claims application/json but content is actually JSONP (e.g. cb({...}))
# A JSONP response starts with an identifier followed by '(' - not valid JSON
if re.match(r'^\w[\w.]*\s*\(', test_content):
logger.warning(f"Content-Type header claims JSON but content looks like JSONP (starts with identifier+parenthesis) - treating as plaintext")
self.is_plaintext = True
else:
self.is_json = True
elif 'pdf' in magic_content_header:
self.is_pdf = True
# magic will call a rss document 'xml'

View File

@@ -154,11 +154,7 @@ def render(watch, datastore, request, url_for, render_template, flash, redirect,
screenshot_url = watch.get_screenshot()
system_uses_webdriver = datastore.data['settings']['application']['fetch_backend'] == 'html_webdriver'
is_html_webdriver = False
if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'):
is_html_webdriver = True
is_html_webdriver = watch.fetcher_supports_screenshots
password_enabled_and_share_is_off = False
if datastore.data['settings']['application'].get('password') or os.getenv("SALTED_PASS", False):

View File

@@ -16,6 +16,51 @@ except ModuleNotFoundError:
def test_jsonp_treated_as_plaintext():
    """JSONP served with a JSON content-type is classed as plaintext; real JSON stays JSON."""
    from ..processors.magic import guess_stream_type

    # Each row: (response body, expected is_json, expected is_plaintext)
    scenarios = (
        # Callback names are arbitrary identifiers, not always 'cb'
        ('jQuery123456({ "version": "8.0.41", "url": "https://example.com/app.apk" })', False, True),
        # Dotted callback name
        ('some.callback({ "version": "1.0" })', False, True),
        # Genuine JSON must still be recognised as JSON
        ('{ "version": "8.0.41", "url": "https://example.com/app.apk" }', True, False),
    )

    for body, expect_json, expect_plaintext in scenarios:
        detected = guess_stream_type(http_content_header="application/json", content=body)
        assert detected.is_json is expect_json
        assert detected.is_plaintext is expect_plaintext
def test_jsonp_json_filter_extraction():
    """JSON filters applied to a JSONP payload return values from the wrapped JSON."""
    from .. import html_tools

    # Tough case: dotted namespace callback, trailing semicolon, deeply nested content with arrays
    jsonp_content = 'weixin.update.callback({"platforms": {"android": {"variants": [{"arch": "arm64", "versionName": "8.0.68", "url": "https://example.com/app-arm64.apk"}, {"arch": "arm32", "versionName": "8.0.41", "url": "https://example.com/app-arm32.apk"}]}}});'

    # (filter expression, expected extracted text)
    expectations = [
        # Deep nested jsonpath into the first array element
        ("json:$.platforms.android.variants[0].versionName", '"8.0.68"'),
        # Second array element
        ("json:$.platforms.android.variants[1].arch", '"arm32"'),
    ]

    # jq-based filters are only exercised when jq is available
    if jq_support:
        expectations.append(("jq:.platforms.android.variants[0].versionName", '"8.0.68"'))
        expectations.append(("jqraw:.platforms.android.variants[1].url", "https://example.com/app-arm32.apk"))

    for filter_expression, expected_text in expectations:
        extracted = html_tools.extract_json_as_string(jsonp_content, filter_expression)
        assert extracted == expected_text
def test_unittest_inline_html_extract():
# So lets pretend that the JSON we want is inside some HTML
content="""