Compare commits

..

8 Commits

Author SHA1 Message Date
dgtlmoon
9144b67610 Adding test 2025-10-12 19:40:44 +02:00
dgtlmoon
b20fe70b49 adding test 2025-10-12 19:39:15 +02:00
dgtlmoon
080acced85 Merge branch 'master' into proxy-url-validation 2025-10-12 19:26:49 +02:00
dgtlmoon
66ddd87ee4 Move proxy default selection to proxy tab 2025-10-12 19:26:04 +02:00
dgtlmoon
233189e4f7 Build - Splitting memory report (#3493) 2025-10-12 19:20:28 +02:00
dgtlmoon
6b895ae972 Refactoring fieldlist to accept correct validation 2025-10-12 19:19:49 +02:00
dgtlmoon
7efd4c2a99 WIP - proxy/browser settings URL validation 2025-10-12 19:05:47 +02:00
dgtlmoon
b237fd7201 Replace stream/filetype detection library with puremagic, 20Mb less RAM usage (#3491) 2025-10-12 18:40:37 +02:00
10 changed files with 274 additions and 38 deletions

View File

@@ -253,6 +253,30 @@ jobs:
docker logs test-cdio-basic-tests > output-logs/test-cdio-basic-tests-stdout-${{ env.PYTHON_VERSION }}.txt
docker logs test-cdio-basic-tests 2> output-logs/test-cdio-basic-tests-stderr-${{ env.PYTHON_VERSION }}.txt
- name: Extract and display memory test report
if: always()
run: |
# Extract test-memory.log from the container
echo "Extracting test-memory.log from container..."
docker cp test-cdio-basic-tests:/app/changedetectionio/test-memory.log output-logs/test-memory-${{ env.PYTHON_VERSION }}.log || echo "test-memory.log not found in container"
# Display the memory log contents for immediate visibility in workflow output
echo "=== Top 10 Highest Peak Memory Tests ==="
if [ -f output-logs/test-memory-${{ env.PYTHON_VERSION }}.log ]; then
# Sort by peak memory value (extract number before MB and sort numerically, reverse order)
grep "Peak memory:" output-logs/test-memory-${{ env.PYTHON_VERSION }}.log | \
sed 's/.*Peak memory: //' | \
paste -d'|' - <(grep "Peak memory:" output-logs/test-memory-${{ env.PYTHON_VERSION }}.log) | \
sort -t'|' -k1 -nr | \
cut -d'|' -f2 | \
head -10
echo ""
echo "=== Full Memory Test Report ==="
cat output-logs/test-memory-${{ env.PYTHON_VERSION }}.log
else
echo "No memory log available"
fi
- name: Store everything including test-datastore
if: always()
uses: actions/upload-artifact@v4

View File

@@ -334,6 +334,10 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore):
if update_handler.fetcher.content or (not update_handler.fetcher.content and empty_pages_are_a_change):
watch.save_last_fetched_html(contents=update_handler.fetcher.content, timestamp=int(fetch_start_time))
# Explicitly delete large content variables to free memory IMMEDIATELY after saving
# These are no longer needed after being saved to history
del contents
# Send notifications on second+ check
if watch.history_n >= 2:
logger.info(f"Change detected in UUID {uuid} - {watch['url']}")
@@ -372,6 +376,12 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore):
datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - fetch_start_time, 3),
'check_count': count})
# NOW clear fetcher content - after all processing is complete
# This is the last point where we need the fetcher data
if update_handler and hasattr(update_handler, 'fetcher') and update_handler.fetcher:
update_handler.fetcher.clear_content()
logger.debug(f"Cleared fetcher content for UUID {uuid}")
except Exception as e:
logger.error(f"Worker {worker_id} unexpected error processing {uuid}: {e}")
logger.error(f"Worker {worker_id} traceback:", exc_info=True)
@@ -392,7 +402,28 @@ async def async_update_worker(worker_id, q, notification_q, app, datastore):
#logger.info(f"Worker {worker_id} sending completion signal for UUID {watch['uuid']}")
watch_check_update.send(watch_uuid=watch['uuid'])
update_handler = None
# Explicitly clean up update_handler and all its references
if update_handler:
# Clear fetcher content using the proper method
if hasattr(update_handler, 'fetcher') and update_handler.fetcher:
update_handler.fetcher.clear_content()
# Clear processor references
if hasattr(update_handler, 'content_processor'):
update_handler.content_processor = None
update_handler = None
# Clear local contents variable if it still exists
if 'contents' in locals():
del contents
# Note: We don't set watch = None here because:
# 1. watch is just a local reference to datastore.data['watching'][uuid]
# 2. Setting it to None doesn't affect the datastore
# 3. GC can't collect the object anyway (still referenced by datastore)
# 4. It would just cause confusion
logger.debug(f"Worker {worker_id} completed watch {uuid} in {time.time()-fetch_start_time:.2f}s")
except Exception as cleanup_error:
logger.error(f"Worker {worker_id} error during cleanup: {cleanup_error}")

View File

@@ -1,7 +1,7 @@
{% extends 'base.html' %}
{% block content %}
{% from '_helpers.html' import render_field, render_checkbox_field, render_button, render_time_schedule_form, render_ternary_field %}
{% from '_helpers.html' import render_field, render_checkbox_field, render_button, render_time_schedule_form, render_ternary_field, render_fieldlist_with_inline_errors %}
{% from '_common_fields.html' import render_common_settings_form %}
<script>
const notification_base_url="{{url_for('ui.ui_notification.ajax_callback_send_notification_test', mode="global-settings")}}";
@@ -89,15 +89,6 @@
<span class="pure-form-message-inline">Transforms RSS/RDF feed watches into beautiful text only</span>
</div>
</div>
{% if form.requests.proxy %}
<div class="pure-control-group inline-radio">
{{ render_field(form.requests.form.proxy, class="fetch-backend-proxy") }}
<span class="pure-form-message-inline">
Choose a default proxy for all watches
</span>
</div>
{% endif %}
</fieldset>
</div>
@@ -321,17 +312,27 @@ nav
<p><strong>Tip</strong>: "Residential" and "Mobile" proxy type can be more successfull than "Data Center" for blocked websites.
<div class="pure-control-group" id="extra-proxies-setting">
{{ render_field(form.requests.form.extra_proxies) }}
{{ render_fieldlist_with_inline_errors(form.requests.form.extra_proxies) }}
<span class="pure-form-message-inline">"Name" will be used for selecting the proxy in the Watch Edit settings</span><br>
<span class="pure-form-message-inline">SOCKS5 proxies with authentication are only supported with 'plain requests' fetcher, for other fetchers you should whitelist the IP access instead</span>
{% if form.requests.proxy %}
<div>
<br>
<div class="inline-radio">
{{ render_field(form.requests.form.proxy, class="fetch-backend-proxy") }}
<span class="pure-form-message-inline">Choose a default proxy for all watches</span>
</div>
</div>
{% endif %}
</div>
<div class="pure-control-group" id="extra-browsers-setting">
<p>
<span class="pure-form-message-inline"><i>Extra Browsers</i> can be attached to further defeat CAPTCHA's on websites that are particularly hard to scrape.</span><br>
<span class="pure-form-message-inline">Simply paste the connection address into the box, <a href="https://changedetection.io/tutorial/using-bright-datas-scraping-browser-pass-captchas-and-other-protection-when-monitoring">More instructions and examples here</a> </span>
</p>
{{ render_field(form.requests.form.extra_browsers) }}
{{ render_fieldlist_with_inline_errors(form.requests.form.extra_browsers) }}
</div>
</div>
<div id="actions">
<div class="pure-control-group">

View File

@@ -64,6 +64,19 @@ class Fetcher():
# Time ONTOP of the system defined env minimum time
render_extract_delay = 0
def clear_content(self):
"""
Explicitly clear all content from memory to free up heap space.
Call this after content has been saved to disk.
"""
self.content = None
if hasattr(self, 'raw_content'):
self.raw_content = None
self.screenshot = None
self.xpath_data = None
# Keep headers and status_code as they're small
logger.trace("Fetcher content cleared from memory")
@abstractmethod
def get_error(self):
return self.error

View File

@@ -678,6 +678,51 @@ class ValidateCSSJSONXPATHInput(object):
except:
raise ValidationError("A system-error occurred when validating your jq expression")
class ValidateSimpleURL:
"""Validate that the value can be parsed by urllib.parse.urlparse() and has a scheme/netloc."""
def __init__(self, message=None):
self.message = message or "Invalid URL."
def __call__(self, form, field):
data = (field.data or "").strip()
if not data:
return # empty is OK — pair with validators.Optional()
from urllib.parse import urlparse
parsed = urlparse(data)
if not parsed.scheme or not parsed.netloc:
raise ValidationError(self.message)
class ValidateStartsWithRegex(object):
def __init__(self, regex, *, flags=0, message=None, allow_empty=True, split_lines=True):
# compile with given flags (well pass re.IGNORECASE below)
self.pattern = re.compile(regex, flags) if isinstance(regex, str) else regex
self.message = message
self.allow_empty = allow_empty
self.split_lines = split_lines
def __call__(self, form, field):
data = field.data
if not data:
return
# normalize into list of lines
if isinstance(data, str) and self.split_lines:
lines = data.splitlines()
elif isinstance(data, (list, tuple)):
lines = data
else:
lines = [data]
for line in lines:
stripped = line.strip()
if not stripped:
if self.allow_empty:
continue
raise ValidationError(self.message or "Empty value not allowed.")
if not self.pattern.match(stripped):
raise ValidationError(self.message or "Invalid value.")
class quickWatchForm(Form):
from . import processors
@@ -865,16 +910,29 @@ class processor_text_json_diff_form(commonSettingsForm):
class SingleExtraProxy(Form):
# maybe better to set some <script>var..
proxy_name = StringField('Name', [validators.Optional()], render_kw={"placeholder": "Name"})
proxy_url = StringField('Proxy URL', [validators.Optional()], render_kw={"placeholder": "socks5:// or regular proxy http://user:pass@...:3128", "size":50})
# @todo do the validation here instead
proxy_url = StringField('Proxy URL', [
validators.Optional(),
ValidateStartsWithRegex(
regex=r'^(https?|socks5)://', # ✅ main pattern
flags=re.IGNORECASE, # ✅ makes it case-insensitive
message='Proxy URLs must start with http://, https:// or socks5://',
),
ValidateSimpleURL()
], render_kw={"placeholder": "socks5:// or regular proxy http://user:pass@...:3128", "size":50})
class SingleExtraBrowser(Form):
browser_name = StringField('Name', [validators.Optional()], render_kw={"placeholder": "Name"})
browser_connection_url = StringField('Browser connection URL', [validators.Optional()], render_kw={"placeholder": "wss://brightdata... wss://oxylabs etc", "size":50})
# @todo do the validation here instead
browser_connection_url = StringField('Browser connection URL', [
validators.Optional(),
ValidateStartsWithRegex(
regex=r'^(wss?|ws)://',
flags=re.IGNORECASE,
message='Browser URLs must start with wss:// or ws://'
),
ValidateSimpleURL()
], render_kw={"placeholder": "wss://brightdata... wss://oxylabs etc", "size":50})
class DefaultUAInputForm(Form):
html_requests = StringField('Plaintext requests', validators=[validators.Optional()], render_kw={"placeholder": "<default>"})
@@ -885,7 +943,7 @@ class DefaultUAInputForm(Form):
class globalSettingsRequestForm(Form):
time_between_check = RequiredFormField(TimeBetweenCheckForm)
time_schedule_limit = FormField(ScheduleLimitForm)
proxy = RadioField('Proxy')
proxy = RadioField('Default proxy')
jitter_seconds = IntegerField('Random jitter seconds ± check',
render_kw={"style": "width: 5em;"},
validators=[validators.NumberRange(min=0, message="Should contain zero or more seconds")])

View File

@@ -64,24 +64,31 @@ class guess_stream_type():
# Remove whitespace between < and tag name for robust detection (handles '< html', '<\nhtml', etc.)
test_content_normalized = re.sub(r'<\s+', '<', test_content)
# Magic will sometimes call text/plain as text/html!
# Use puremagic for lightweight MIME detection (saves ~14MB vs python-magic)
magic_result = None
try:
import magic
import puremagic
mime = magic.from_buffer(content[:200], mime=True) # Send the original content
logger.debug(f"Guessing mime type, original content_type '{http_content_header}', mime type detected '{mime}'")
if mime and "/" in mime:
magic_result = mime
# Ignore generic/fallback mime types from magic
if mime in ['application/octet-stream', 'application/x-empty', 'binary']:
logger.debug(f"Ignoring generic mime type '{mime}' from magic library")
# Trust magic for non-text types immediately
elif mime not in ['text/html', 'text/plain']:
magic_content_header = mime
# puremagic needs bytes, so encode if we have a string
content_bytes = content[:200].encode('utf-8') if isinstance(content, str) else content[:200]
# puremagic returns a list of PureMagic objects with confidence scores
detections = puremagic.magic_string(content_bytes)
if detections:
# Get the highest confidence detection
mime = detections[0].mime_type
logger.debug(f"Guessing mime type, original content_type '{http_content_header}', mime type detected '{mime}'")
if mime and "/" in mime:
magic_result = mime
# Ignore generic/fallback mime types
if mime in ['application/octet-stream', 'application/x-empty', 'binary']:
logger.debug(f"Ignoring generic mime type '{mime}' from puremagic library")
# Trust puremagic for non-text types immediately
elif mime not in ['text/html', 'text/plain']:
magic_content_header = mime
except Exception as e:
logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}), using content-based detection")
logger.error(f"Error getting a more precise mime type from 'puremagic' library ({str(e)}), using content-based detection")
# Content-based detection (most reliable for text formats)
# Check for HTML patterns first - if found, override magic's text/plain

View File

@@ -556,6 +556,20 @@ class perform_site_check(difference_detection_processor):
else:
logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} had unique content")
# Note: Explicit cleanup is only needed here because text_json_diff handles
# large strings (100KB-300KB for RSS/HTML). The other processors work with
# small strings and don't need this.
#
# Python would clean these up automatically, but explicit `del` frees memory
# immediately rather than waiting for function return, reducing peak memory usage.
del content
if 'html_content' in locals() and html_content is not stripped_text:
del html_content
if 'text_content_before_ignored_filter' in locals() and text_content_before_ignored_filter is not stripped_text:
del text_content_before_ignored_filter
if 'text_for_checksuming' in locals() and text_for_checksuming is not stripped_text:
del text_for_checksuming
return changed_detected, update_obj, stripped_text
def _apply_diff_filtering(self, watch, stripped_text, text_before_filter):

View File

@@ -14,13 +14,31 @@
{% if field.errors is mapping and 'form' in field.errors %}
{# and subfield form errors, such as used in RequiredFormField() for TimeBetweenCheckForm sub form #}
{% set errors = field.errors['form'] %}
{% for error in errors %}
<li>{{ error }}</li>
{% endfor %}
{% elif field.type == 'FieldList' %}
{# Handle FieldList of FormFields - errors is a list of dicts, one per entry #}
{% for idx, entry_errors in field.errors|enumerate %}
{% if entry_errors is mapping and entry_errors %}
{# Only show entries that have actual errors #}
<li><strong>Entry {{ idx + 1 }}:</strong>
<ul>
{% for field_name, messages in entry_errors.items() %}
{% for message in messages %}
<li>{{ field_name }}: {{ message }}</li>
{% endfor %}
{% endfor %}
</ul>
</li>
{% endif %}
{% endfor %}
{% else %}
{# regular list of errors with this field #}
{% set errors = field.errors %}
{% for error in field.errors %}
<li>{{ error }}</li>
{% endfor %}
{% endif %}
{% for error in errors %}
<li>{{ error }}</li>
{% endfor %}
</ul>
{% endif %}
</div>
@@ -93,6 +111,39 @@
{{ field(**kwargs)|safe }}
{% endmacro %}
{% macro render_fieldlist_with_inline_errors(fieldlist) %}
{# Specialized macro for FieldList(FormField(...)) that renders errors inline with each field #}
<div {% if fieldlist.errors %} class="error" {% endif %}>{{ fieldlist.label }}</div>
<div {% if fieldlist.errors %} class="error" {% endif %}>
<ul id="{{ fieldlist.id }}">
{% for entry in fieldlist %}
<li {% if entry.errors %} class="error" {% endif %}>
<label for="{{ entry.id }}" {% if entry.errors %} class="error" {% endif %}>{{ fieldlist.label.text }}-{{ loop.index0 }}</label>
<table id="{{ entry.id }}" {% if entry.errors %} class="error" {% endif %}>
<tbody>
{% for subfield in entry %}
<tr {% if subfield.errors %} class="error" {% endif %}>
<th {% if subfield.errors %} class="error" {% endif %}><label for="{{ subfield.id }}" {% if subfield.errors %} class="error" {% endif %}>{{ subfield.label.text }}</label></th>
<td {% if subfield.errors %} class="error" {% endif %}>
{{ subfield(**kwargs)|safe }}
{% if subfield.errors %}
<ul class="errors">
{% for error in subfield.errors %}
<li class="error">{{ error }}</li>
{% endfor %}
</ul>
{% endif %}
</td>
</tr>
{% endfor %}
</tbody>
</table>
</li>
{% endfor %}
</ul>
</div>
{% endmacro %}
{% macro render_conditions_fieldlist_of_formfields_as_table(fieldlist, table_id="rulesTable") %}
<div class="fieldlist_formfields" id="{{ table_id }}">
<div class="fieldlist-header">

View File

@@ -49,3 +49,39 @@ def test_select_custom(client, live_server, measure_memory_usage):
#
# Now we should see the request in the container logs for "squid-squid-custom" because it will be the only default
def test_custom_proxy_validation(client, live_server, measure_memory_usage):
# live_server_setup(live_server) # Setup on conftest per function
# Goto settings, add our custom one
res = client.post(
url_for("settings.settings_page"),
data={
"requests-time_between_check-minutes": 180,
"application-ignore_whitespace": "y",
"application-fetch_backend": 'html_requests',
"requests-extra_proxies-0-proxy_name": "custom-test-proxy",
"requests-extra_proxies-0-proxy_url": "xxxxhtt/333??p://test:awesome@squid-custom:3128",
},
follow_redirects=True
)
assert b"Settings updated." not in res.data
assert b'Proxy URLs must start with' in res.data
res = client.post(
url_for("settings.settings_page"),
data={
"requests-time_between_check-minutes": 180,
"application-ignore_whitespace": "y",
"application-fetch_backend": 'html_requests',
"requests-extra_proxies-0-proxy_name": "custom-test-proxy",
"requests-extra_proxies-0-proxy_url": "https://",
},
follow_redirects=True
)
assert b"Settings updated." not in res.data
assert b"Invalid URL." in res.data

View File

@@ -125,8 +125,9 @@ price-parser
# flask_socket_io - incorrect package name, already have flask-socketio above
# So far for detecting correct favicon type, but for other things in the future
python-magic
# Lightweight MIME type detection (saves ~14MB memory vs python-magic/libmagic)
# Used for detecting correct favicon type and content-type detection
puremagic
# Scheduler - Windows seemed to miss a lot of default timezone info (even "UTC" !)
tzdata