diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 65b94951..81fed7cc 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -61,8 +61,22 @@ import time # ============================================================================== import multiprocessing +import os import sys +# Limit glibc malloc arena count to prevent RSS growth from concurrent requests. +# Default: glibc creates up to 8×CPU_cores arenas. Each concurrent thread/connection +# can trigger a new arena, and freed memory stays mapped in those arenas as RSS forever. +# With MALLOC_ARENA_MAX=2, at most 2 arenas are used; freed pages return to the OS faster. +# glibc reads the env var only at process startup, so the os.environ entry only reaches child processes; the mallopt() call below applies the limit to this process and must run before worker threads start. +if 'MALLOC_ARENA_MAX' not in os.environ: + os.environ['MALLOC_ARENA_MAX'] = '2' + try: + import ctypes as _ctypes + _ctypes.CDLL('libc.so.6').mallopt(-8, 2) # M_ARENA_MAX = -8 + except Exception: + pass + # Set spawn as global default (safety net - all our code uses explicit contexts anyway) # Skip in tests to avoid breaking pytest-flask's LiveServer fixture (uses unpicklable local functions) if 'pytest' not in sys.modules: diff --git a/changedetectionio/blueprint/watchlist/__init__.py b/changedetectionio/blueprint/watchlist/__init__.py index 1191582b..ebdfa627 100644 --- a/changedetectionio/blueprint/watchlist/__init__.py +++ b/changedetectionio/blueprint/watchlist/__init__.py @@ -81,6 +81,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe sorted_tags = sorted(datastore.data['settings']['application'].get('tags').items(), key=lambda x: x[1]['title']) + proxy_list = datastore.proxy_list output = render_template( "watch-overview.html", active_tag=active_tag, @@ -92,7 +93,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe form=form, generate_tag_colors=processors.generate_processor_badge_colors,
guid=datastore.data['app_guid'], - has_proxies=datastore.proxy_list, + has_proxies=proxy_list, hosted_sticky=os.getenv("SALTED_PASS", False) == False, now_time_server=round(time.time()), pagination=pagination, @@ -110,6 +111,16 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe watches=sorted_watches ) + # Return freed template-building memory to the OS immediately. + # render_template allocates ~20MB of intermediate strings that are freed on return, + # but glibc keeps those pages mapped in its arenas as RSS. malloc_trim() forces + # glibc to release them, preventing RSS growth from concurrent Chrome connections. + try: + import ctypes + ctypes.CDLL('libc.so.6').malloc_trim(0) + except Exception: + pass + if session.get('share-link'): del (session['share-link']) diff --git a/changedetectionio/blueprint/watchlist/templates/watch-overview.html b/changedetectionio/blueprint/watchlist/templates/watch-overview.html index eadd7192..55fbb9e0 100644 --- a/changedetectionio/blueprint/watchlist/templates/watch-overview.html +++ b/changedetectionio/blueprint/watchlist/templates/watch-overview.html @@ -213,12 +213,13 @@ html[data-darkmode="true"] .watch-tag-list.tag-{{ class_name }} { {%- set checking_now = is_checking_now(watch) -%} {%- set history_n = watch.history_n -%} {%- set favicon = watch.get_favicon_filename() -%} + {%- set error_texts = watch.compile_error_texts(has_proxies=has_proxies) -%} {%- set system_use_url_watchlist = datastore.data['settings']['application']['ui'].get('use_page_title_in_list') -%} {# Class settings mirrored in changedetectionio/static/js/realtime.js for the frontend #} {%- set row_classes = [ loop.cycle('pure-table-odd', 'pure-table-even'), 'processor-' ~ watch['processor'], - 'has-error' if watch.compile_error_texts()|length > 2 else '', + 'has-error' if error_texts|length > 2 else '', 'paused' if watch.paused is defined and watch.paused != False else '', 'unviewed' if watch.has_unviewed else '', 
'has-restock-info' if watch.has_restock_info else 'no-restock-info', @@ -271,7 +272,7 @@ html[data-darkmode="true"] .watch-tag-list.tag-{{ class_name }} { {% endif %}   - + {%- if watch['processor'] == 'text_json_diff' -%} {%- if watch['has_ldjson_price_data'] and not watch['track_ldjson_price_data'] -%}
Switch to Restock & Price watch mode? Yes No
diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index f879e605..2809ed01 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -4,6 +4,7 @@ import flask_login import locale import os import queue +import re import sys import threading import time @@ -387,6 +388,8 @@ def _jinja2_filter_fetcher_status_icons(fetcher_name): return '' +_RE_SANITIZE_TAG = re.compile(r'[^a-zA-Z0-9]') + @app.template_filter('sanitize_tag_class') def _jinja2_filter_sanitize_tag_class(tag_title): """Sanitize a tag title to create a valid CSS class name. @@ -398,9 +401,8 @@ def _jinja2_filter_sanitize_tag_class(tag_title): Returns: str: A sanitized string suitable for use as a CSS class name """ - import re # Remove all non-alphanumeric characters and convert to lowercase - sanitized = re.sub(r'[^a-zA-Z0-9]', '', tag_title).lower() + sanitized = _RE_SANITIZE_TAG.sub('', tag_title).lower() # Ensure it starts with a letter (CSS requirement) if sanitized and not sanitized[0].isalpha(): sanitized = 'tag' + sanitized @@ -488,28 +490,21 @@ def changedetection_app(config=None, datastore_o=None): available_languages = get_available_languages() language_codes = get_language_codes() - def get_locale(): - # Locale aliases: map browser language codes to translation directory names - # This handles cases where browsers send standard codes (e.g., zh-TW) - # but our translations use more specific codes (e.g., zh_Hant_TW) - locale_aliases = { - 'zh-TW': 'zh_Hant_TW', # Traditional Chinese: browser sends zh-TW, we use zh_Hant_TW - 'zh_TW': 'zh_Hant_TW', # Also handle underscore variant - } + _locale_aliases = { + 'zh-TW': 'zh_Hant_TW', # Traditional Chinese: browser sends zh-TW, we use zh_Hant_TW + 'zh_TW': 'zh_Hant_TW', # Also handle underscore variant + } + _locale_match_list = language_codes + list(_locale_aliases.keys()) + def get_locale(): # 1. 
Try to get locale from session (user explicitly selected) if 'locale' in session: return session['locale'] # 2. Fall back to Accept-Language header - # Get the best match from browser's Accept-Language header - browser_locale = request.accept_languages.best_match(language_codes + list(locale_aliases.keys())) - - # 3. Check if we need to map the browser locale to our internal locale - if browser_locale in locale_aliases: - return locale_aliases[browser_locale] - - return browser_locale + browser_locale = request.accept_languages.best_match(_locale_match_list) + # 3. Map browser locale to our internal locale if needed + return _locale_aliases.get(browser_locale, browser_locale) # Initialize Babel with locale selector babel = Babel(app, locale_selector=get_locale) @@ -1022,15 +1017,16 @@ def check_for_new_version(): import urllib3 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + session = requests.Session() + session.verify = False + while not app.config.exit.is_set(): try: - r = requests.post("https://changedetection.io/check-ver.php", + r = session.post("https://changedetection.io/check-ver.php", data={'version': __version__, 'app_guid': datastore.data['app_guid'], 'watch_count': len(datastore.data['watching']) - }, - - verify=False) + }) except: pass diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index deb682ee..9485401b 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -43,6 +43,11 @@ from ..html_tools import TRANSLATE_WHITESPACE_TABLE FAVICON_RESAVE_THRESHOLD_SECONDS=86400 BROTLI_COMPRESS_SIZE_THRESHOLD = int(os.getenv('SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD', 1024*20)) +# Module-level favicon filename cache: data_dir → basename (or None) +# Keyed by data_dir so it survives Watch object recreation, deepcopy, and concurrent requests. +# Invalidated explicitly in bump_favicon() when a new favicon is saved. 
+_FAVICON_FILENAME_CACHE: dict = {} + minimum_seconds_recheck_time = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 3)) mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7} @@ -806,9 +811,8 @@ class model(EntityPersistenceMixin, watch_base): with open(fname, 'wb') as f: f.write(decoded) - # Invalidate favicon filename cache - if hasattr(self, '_favicon_filename_cache'): - delattr(self, '_favicon_filename_cache') + # Invalidate module-level favicon filename cache for this watch + _FAVICON_FILENAME_CACHE.pop(self.data_dir, None) # A signal that could trigger the socket server to update the browser also watch_check_update = signal('watch_favicon_bump') @@ -823,35 +827,23 @@ class model(EntityPersistenceMixin, watch_base): def get_favicon_filename(self) -> str | None: """ - Find any favicon.* file in the current working directory - and return the contents of the newest one. + Find the newest favicon.* file (by mtime) in the watch data directory. - MEMORY LEAK FIX: Cache the result to avoid repeated glob.glob() operations. - glob.glob() causes millions of fnmatch allocations when called for every watch on page load. + Uses a module-level cache keyed by data_dir to survive Watch object recreation, + deepcopy (which drops instance attrs), and concurrent request races. + Invalidated by bump_favicon() when a new favicon is saved. Returns: - str: Basename of the newest favicon file, or None if not found. + str: Basename of the newest favicon file, or None if not found. 
""" - # Check cache first (prevents 26M+ allocations from repeated glob operations) - cache_key = '_favicon_filename_cache' - if hasattr(self, cache_key): - return getattr(self, cache_key) + if self.data_dir in _FAVICON_FILENAME_CACHE: + return _FAVICON_FILENAME_CACHE[self.data_dir] import glob - - # Search for all favicon.* files files = glob.glob(os.path.join(self.data_dir, "favicon.*")) - - if not files: - result = None - else: - # Find the newest by modification time - newest_file = max(files, key=os.path.getmtime) - result = os.path.basename(newest_file) - - # Cache the result - setattr(self, cache_key, result) - return result + fname = os.path.basename(max(files, key=os.path.getmtime)) if files else None + _FAVICON_FILENAME_CACHE[self.data_dir] = fname + return fname def get_screenshot_as_thumbnail(self, max_age=3200): """Return path to a square thumbnail of the most recent screenshot. @@ -1182,18 +1174,13 @@ class model(EntityPersistenceMixin, watch_base): def compile_error_texts(self, has_proxies=None): """Compile error texts for this watch. Accepts has_proxies parameter to ensure it works even outside app context""" - from flask import url_for + from flask import url_for, has_request_context from markupsafe import Markup output = [] # Initialize as list since we're using append last_error = self.get('last_error','') - try: - url_for('settings.settings_page') - except Exception as e: - has_app_context = False - else: - has_app_context = True + has_app_context = has_request_context() # has app+request context, we can use url_for() if has_app_context: diff --git a/changedetectionio/validate_url.py b/changedetectionio/validate_url.py index 38a26740..08efce78 100644 --- a/changedetectionio/validate_url.py +++ b/changedetectionio/validate_url.py @@ -100,6 +100,19 @@ def is_safe_valid_url(test_url): logger.warning('URL validation failed: URL is empty or whitespace only') return False + # Per-request cache: same URL is often validated 2-3x per watchlist render (sort + display).
+ # Flask's g is scoped to one request and auto-cleared on teardown, so dynamic Jinja2 URLs + # like {{microtime()}} are always re-evaluated on the next request. + # Falls back gracefully when called outside a request context (e.g. background workers). + _cache_key = test_url + try: + from flask import g + _cache = g.setdefault('_url_validation_cache', {}) + if _cache_key in _cache: + return _cache[_cache_key] + except RuntimeError: + _cache = None # No app context + allow_file_access = strtobool(os.getenv('ALLOW_FILE_URI', 'false')) safe_protocol_regex = '^(http|https|ftp|file):' if allow_file_access else '^(http|https|ftp):' @@ -112,11 +125,14 @@ def is_safe_valid_url(test_url): test_url = r.sub('', test_url) # Check the actual rendered URL in case of any Jinja markup - try: - test_url = jinja_render(test_url) - except Exception as e: - logger.error(f'URL "{test_url}" is not correct Jinja2? {str(e)}') - return False + # Only run jinja_render when the URL actually contains Jinja2 syntax - creating a new + # ImmutableSandboxedEnvironment is expensive and is called once per watch per page load + if '{%' in test_url or '{{' in test_url: + try: + test_url = jinja_render(test_url) + except Exception as e: + logger.error(f'URL "{test_url}" is not correct Jinja2? {str(e)}') + return False # Check query parameters and fragment if re.search(r'[<>]', test_url): @@ -142,4 +158,6 @@ def is_safe_valid_url(test_url): logger.warning(f'URL f"{test_url}" failed validation, aborting.') return False + if _cache is not None: + _cache[_cache_key] = True return True