mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-02-04 13:26:00 +00:00
Multi-language / Translations Support (#3696) - Complete internationalization system implemented - Support for 7 languages: Czech (cs), German (de), French (fr), Italian (it), Korean (ko), Chinese Simplified (zh), Chinese Traditional (zh_TW) - Language selector with localized flags and theming - Flash message translations - Multiple translation fixes and improvements across all languages - Language setting preserved across redirects Pluggable Content Fetchers (#3653) - New architecture for extensible content fetcher system - Allows custom fetcher implementations Image / Screenshot Comparison Processor (#3680) - New processor for visual change detection (disabled for this release) - Supporting CSS/JS infrastructure added UI Improvements Design & Layout - Auto-generated tag color schemes - Simplified login form styling - Removed hard-coded CSS, moved to SCSS variables - Tag UI cleanup and improvements - Automatic tab wrapper functionality - Menu refactoring for better organization - Cleanup of offset settings - Hide sticky tabs on narrow viewports - Improved responsive layout (#3702) User Experience - Modal alerts/confirmations on delete/clear operations (#3693, #3598, #3382) - Auto-add https:// to URLs in quickwatch form if not present - Better redirect handling on login (#3699) - 'Recheck all' now returns to correct group/tag (#3673) - Language set redirect keeps hash fragment - More friendly human-readable text throughout UI Performance & Reliability Scheduler & Processing - Soft delays instead of blocking time.sleep() calls (#3710) - More resilient handling of same UUID being processed (#3700) - Better Puppeteer timeout handling - Improved Puppeteer shutdown/cleanup (#3692) - Requests cleanup now properly async History & Rendering - Faster server-side "difference" rendering on History page (#3442) - Show ignored/triggered rows in history - API: Retry watch data if watch dict changed (more reliable) API Improvements - Watch get endpoint: retry mechanism for 
changed watch data - WatchHistoryDiff API endpoint includes extra format args (#3703) Testing Improvements - Replace time.sleep with wait_for_notification_endpoint_output (#3716) - Test for mode switching (#3701) - Test for #3720 added (#3725) - Extract-text difference test fixes - Improved dev workflow Bug Fixes - Notification error text output (#3672, #3669, #3280) - HTML validation fixes (#3704) - Template discovery path fixes - Notification debug log now uses system locale for dates/times - Puppeteer spelling mistake in log output - Recalculation on anchor change - Queue bubble update disabled temporarily Dependency Updates - beautifulsoup4 updated (#3724) - psutil 7.1.0 → 7.2.1 (#3723) - python-engineio ~=4.12.3 → ~=4.13.0 (#3707) - python-socketio ~=5.14.3 → ~=5.16.0 (#3706) - flask-socketio ~=5.5.1 → ~=5.6.0 (#3691) - brotli ~=1.1 → ~=1.2 (#3687) - lxml updated (#3590) - pytest ~=7.2 → ~=9.0 (#3676) - jsonschema ~=4.0 → ~=4.25 (#3618) - pluggy ~=1.5 → ~=1.6 (#3616) - cryptography 44.0.1 → 46.0.3 (security) (#3589) Documentation - README updated with viewport size setup information Development Infrastructure - Dev container only built on dev branch - Improved dev workflow tooling
190 lines
8.0 KiB
Python
190 lines
8.0 KiB
Python
from loguru import logger
|
|
import hashlib
|
|
import os
|
|
import re
|
|
import asyncio
|
|
from functools import partial
|
|
from changedetectionio import strtobool
|
|
from changedetectionio.content_fetchers.exceptions import BrowserStepsInUnsupportedFetcher, EmptyReply, Non200ErrorCodeReceived
|
|
from changedetectionio.content_fetchers.base import Fetcher
|
|
|
|
|
|
# "html_requests" is listed as the default fetcher in store.py!
class fetcher(Fetcher):
    """Plain HTTP(S) content fetcher built on the `requests` library.

    Fast, but performs no JavaScript execution - browser steps are therefore
    unsupported and raise BrowserStepsInUnsupportedFetcher.
    """

    fetcher_description = "Basic fast Plaintext/HTTP Client"

    def __init__(self, proxy_override=None, custom_browser_connection_url=None, **kwargs):
        """
        :param proxy_override: optional per-watch proxy URL; when set it is used
            for http/https/ftp instead of the system-wide proxy settings.
        :param custom_browser_connection_url: accepted for interface compatibility
            but ignored - this fetcher is always 'launched locally'.
        """
        super().__init__(**kwargs)
        self.proxy_override = proxy_override
        # browser_connection_url is none because its always 'launched locally'

    def _run_sync(self,
                  url,
                  timeout,
                  request_headers,
                  request_body,
                  request_method,
                  ignore_status_codes=False,
                  current_include_filters=None,
                  is_binary=False,
                  empty_pages_are_a_change=False,
                  watch_uuid=None,
                  ):
        """Synchronous version of run - the original requests implementation.

        Populates self.headers, self.status_code, self.content, self.raw_content
        (and self.screenshot for image responses) as side effects.

        :raises BrowserStepsInUnsupportedFetcher: if browser steps are configured.
        :raises EmptyReply: on a zero-byte body unless empty_pages_are_a_change.
        :raises Non200ErrorCodeReceived: on non-200 unless ignore_status_codes.
        """
        # Imported lazily so the module stays cheap to import when another fetcher is in use.
        import chardet
        import requests

        if self.browser_steps_get_valid_steps():
            raise BrowserStepsInUnsupportedFetcher(url=url)

        proxies = {}

        # Allows override the proxy on a per-request basis
        # https://requests.readthedocs.io/en/latest/user/advanced/#socks
        # Should also work with `socks5://user:pass@host:port` type syntax.
        if self.proxy_override:
            proxies = {'http': self.proxy_override, 'https': self.proxy_override, 'ftp': self.proxy_override}
        else:
            if self.system_http_proxy:
                proxies['http'] = self.system_http_proxy
            if self.system_https_proxy:
                proxies['https'] = self.system_https_proxy

        session = requests.Session()

        # file:// support is opt-in only, since it can expose local files to remote configuration.
        if strtobool(os.getenv('ALLOW_FILE_URI', 'false')) and url.startswith('file://'):
            from requests_file import FileAdapter
            session.mount('file://', FileAdapter())

        try:
            # NOTE(review): verify=False deliberately skips TLS certificate validation
            # so sites with self-signed/expired certs can still be watched - a known
            # trade-off here, not an oversight.
            r = session.request(method=request_method,
                                data=request_body.encode('utf-8') if isinstance(request_body, str) else request_body,
                                url=url,
                                headers=request_headers,
                                timeout=timeout,
                                proxies=proxies,
                                verify=False)
        except Exception as e:
            msg = str(e)
            # Give SOCKS proxy failures a friendlier hint - the raw urllib3 message is cryptic.
            if proxies and 'SOCKSHTTPSConnectionPool' in msg:
                msg = f"Proxy connection failed? {msg}"
            raise Exception(msg) from e

        # If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks.
        # For example - some sites don't tell us it's utf-8, but return utf-8 content
        # This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably.
        # https://github.com/psf/requests/issues/1604 good info about requests encoding detection
        if not is_binary:
            # Don't run this for PDF (and requests identified as binary) takes a _long_ time
            if not r.headers.get('content-type') or 'charset=' not in r.headers.get('content-type'):
                # For XML/RSS feeds, check the XML declaration for encoding attribute
                # This is more reliable than chardet which can misdetect UTF-8 as MacRoman
                content_type = r.headers.get('content-type', '').lower()
                if 'xml' in content_type or 'rss' in content_type:
                    # Look for <?xml version="1.0" encoding="UTF-8"?> in the first 200 bytes
                    xml_encoding_match = re.search(rb'<\?xml[^>]+encoding=["\']([^"\']+)["\']', r.content[:200])
                    if xml_encoding_match:
                        r.encoding = xml_encoding_match.group(1).decode('ascii')
                    else:
                        # Default to UTF-8 for XML if no encoding found
                        r.encoding = 'utf-8'
                else:
                    # For other content types, use chardet
                    encoding = chardet.detect(r.content)['encoding']
                    if encoding:
                        r.encoding = encoding

        self.headers = r.headers

        if not r.content:
            logger.debug(f"Requests returned empty content for '{url}'")
            if not empty_pages_are_a_change:
                raise EmptyReply(url=url, status_code=r.status_code)
            else:
                logger.debug(f"URL {url} gave zero byte content reply with Status Code {r.status_code}, but empty_pages_are_a_change = True")

        # @todo test this
        # @todo maybe you really want to test zero-byte return pages?
        if r.status_code != 200 and not ignore_status_codes:
            # maybe check with content works?
            raise Non200ErrorCodeReceived(url=url, status_code=r.status_code, page_html=r.text)

        self.status_code = r.status_code
        if is_binary:
            # Binary files just return their checksum until we add something smarter
            self.content = hashlib.md5(r.content).hexdigest()
        else:
            self.content = r.text

        self.raw_content = r.content

        # If the content is an image, set it as screenshot for SSIM/visual comparison
        content_type = r.headers.get('content-type', '').lower()
        if 'image/' in content_type:
            self.screenshot = r.content
            logger.debug(f"Image content detected ({content_type}), set as screenshot for comparison")

    async def run(self,
                  fetch_favicon=True,
                  current_include_filters=None,
                  empty_pages_are_a_change=False,
                  ignore_status_codes=False,
                  is_binary=False,
                  request_body=None,
                  request_headers=None,
                  request_method=None,
                  screenshot_format=None,
                  timeout=None,
                  url=None,
                  watch_uuid=None,
                  ):
        """Async wrapper that runs the synchronous requests code in a thread pool"""

        # get_running_loop() is the supported way to obtain the loop from inside a
        # coroutine; get_event_loop() is deprecated for this use since Python 3.10.
        loop = asyncio.get_running_loop()

        # Run the synchronous _run_sync in a thread pool to avoid blocking the event loop.
        # functools.partial (imported at module level) binds the kwargs up-front.
        await loop.run_in_executor(
            None,  # Use default ThreadPoolExecutor
            partial(
                self._run_sync,
                url=url,
                timeout=timeout,
                request_headers=request_headers,
                request_body=request_body,
                request_method=request_method,
                ignore_status_codes=ignore_status_codes,
                current_include_filters=current_include_filters,
                is_binary=is_binary,
                empty_pages_are_a_change=empty_pages_are_a_change,
                watch_uuid=watch_uuid,
            )
        )

    async def quit(self, watch=None):
        """Clean up after a fetch; removes any stale screenshot left by a previous fetcher."""

        # In case they switched to `requests` fetcher from something else
        # Then the screenshot could be old, in any case, it's not used here.
        # REMOVE_REQUESTS_OLD_SCREENSHOTS - Mainly used for testing
        # Guard against watch=None (the parameter's default) before dereferencing it.
        if watch and strtobool(os.getenv("REMOVE_REQUESTS_OLD_SCREENSHOTS", 'true')):
            screenshot = watch.get_screenshot()
            if screenshot:
                try:
                    os.unlink(screenshot)
                except Exception as e:
                    logger.warning(f"Failed to unlink screenshot: {screenshot} - {e}")
|
|
|
|
|
|
# Plugin registration for built-in fetcher
class RequestsFetcherPlugin:
    """Built-in plugin wrapper exposing the requests-based fetcher to the plugin system."""

    def register_content_fetcher(self):
        """Return the (key, class) pair identifying this fetcher to the registry."""
        fetcher_key = 'html_requests'
        fetcher_class = fetcher
        return (fetcher_key, fetcher_class)


# Create module-level instance for plugin registration
requests_plugin = RequestsFetcherPlugin()
|