Pin bs4

Attempt to fix socks test server test
2025-11-08 10:36:32 +00:00 · 2025-09-29 11:14:42 +02:00 · 2025-09-29 10:37:25 +02:00
22 changed files with 76 additions and 426 deletions
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -11,4 +11,6 @@ updates:
  - package-ecosystem: pip
    directory: /
    schedule:
-      interval: "weekly"
+      interval: "daily"
+    allow:
+      - dependency-name: "apprise"
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -34,7 +34,7 @@ jobs:

    # Initializes the CodeQL tools for scanning.
    - name: Initialize CodeQL
-      uses: github/codeql-action/init@v4
+      uses: github/codeql-action/init@v3
      with:
        languages: ${{ matrix.language }}
        # If you wish to specify custom queries, you can do so here or in a config file.
@@ -45,7 +45,7 @@ jobs:
    # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
    # If this step fails, then you should remove it and run the build manually (see below)
    - name: Autobuild
-      uses: github/codeql-action/autobuild@v4
+      uses: github/codeql-action/autobuild@v3

    # ℹ️ Command-line programs to run using the OS shell.
    # 📚 https://git.io/JvXDl
@@ -59,4 +59,4 @@ jobs:
    #   make release

    - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@v4
+      uses: github/codeql-action/analyze@v3
--- a/.github/workflows/containers.yml
+++ b/.github/workflows/containers.yml
@@ -95,7 +95,7 @@ jobs:
          push: true
          tags: |
            ${{ secrets.DOCKER_HUB_USERNAME }}/changedetection.io:dev,ghcr.io/${{ github.repository }}:dev
-          platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/arm/v8
+          platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/arm/v8,linux/arm64/v8
          cache-from: type=gha
          cache-to: type=gha,mode=max

@@ -133,7 +133,7 @@ jobs:
          file: ./Dockerfile
          push: true
          tags: ${{ steps.meta.outputs.tags }}
-          platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/arm/v8
+          platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/arm/v8,linux/arm64/v8
          cache-from: type=gha
          cache-to: type=gha,mode=max
 # Looks like this was disabled
--- a/.github/workflows/test-container-build.yml
+++ b/.github/workflows/test-container-build.yml
@@ -38,6 +38,8 @@ jobs:
            dockerfile: ./Dockerfile
          - platform: linux/arm/v8
            dockerfile: ./Dockerfile
+          - platform: linux/arm64/v8
+            dockerfile: ./Dockerfile
          # Alpine Dockerfile platforms (musl via alpine check)
          - platform: linux/amd64
            dockerfile: ./.github/test/Dockerfile-alpine
@@ -74,5 +76,5 @@ jobs:
            file: ${{ matrix.dockerfile }}
            platforms: ${{ matrix.platform }}
            cache-from: type=gha
-            cache-to: type=gha,mode=min
+            cache-to: type=gha,mode=max

--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@@ -2,7 +2,7 @@

 # Read more https://github.com/dgtlmoon/changedetection.io/wiki

-__version__ = '0.50.18'
+__version__ = '0.50.14'

 from changedetectionio.strtobool import strtobool
 from json.decoder import JSONDecodeError
--- a/changedetectionio/api/init.py
+++ b/changedetectionio/api/init.py
@@ -1,7 +1,10 @@
 import copy
+import yaml
 import functools
 from flask import request, abort
 from loguru import logger
+from openapi_core import OpenAPI
+from openapi_core.contrib.flask import FlaskOpenAPIRequest
 from . import api_schema
 from ..model import watch_base

@@ -31,11 +34,7 @@ schema_delete_notification_urls['required'] = ['notification_urls']

@functools.cache
 def get_openapi_spec():
-    """Lazy load OpenAPI spec and dependencies only when validation is needed."""
    import os
-    import yaml  # Lazy import - only loaded when API validation is actually used
-    from openapi_core import OpenAPI  # Lazy import - saves ~10.7 MB on startup
-
    spec_path = os.path.join(os.path.dirname(__file__), '../../docs/api-spec.yaml')
    with open(spec_path, 'r') as f:
        spec_dict = yaml.safe_load(f)
@@ -50,9 +49,6 @@ def validate_openapi_request(operation_id):
            try:
                # Skip OpenAPI validation for GET requests since they don't have request bodies
                if request.method.upper() != 'GET':
-                    # Lazy import - only loaded when actually validating a request
-                    from openapi_core.contrib.flask import FlaskOpenAPIRequest
-
                    spec = get_openapi_spec()
                    openapi_request = FlaskOpenAPIRequest(request)
                    result = spec.unmarshal_request(openapi_request)
--- a/changedetectionio/blueprint/settings/templates/settings.html
+++ b/changedetectionio/blueprint/settings/templates/settings.html
@@ -191,12 +191,6 @@ nav
                        </ul>
                     </span>
                    </fieldset>
-                    <fieldset class="pure-group">
-                        {{ render_checkbox_field(form.application.form.strip_ignored_lines) }}
-                        <span class="pure-form-message-inline">Remove any text that appears in the "Ignore text" from the output (otherwise its just ignored for change-detection)<br>
-                        <i>Note:</i> Changing this will change the status of your existing watches, possibly trigger alerts etc.
-                        </span>
-                    </fieldset>
           </div>

            <div class="tab-pane-inner" id="api">
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -759,7 +759,6 @@ class processor_text_json_diff_form(commonSettingsForm):
    check_unique_lines = BooleanField('Only trigger when unique lines appear in all history', default=False)
    remove_duplicate_lines = BooleanField('Remove duplicate lines of text', default=False)
    sort_text_alphabetically =  BooleanField('Sort text alphabetically', default=False)
-    strip_ignored_lines = TernaryNoneBooleanField('Strip ignored lines', default=None)
    trim_text_whitespace = BooleanField('Trim whitespace before and after text', default=False)

    filter_text_added = BooleanField('Added lines', default=True)
@@ -937,7 +936,6 @@ class globalSettingsApplicationForm(commonSettingsForm):
    removepassword_button = SubmitField('Remove password', render_kw={"class": "pure-button pure-button-primary"})
    render_anchor_tag_content = BooleanField('Render anchor tag content', default=False)
    shared_diff_access = BooleanField('Allow anonymous access to watch history page when password is enabled', default=False, validators=[validators.Optional()])
-    strip_ignored_lines = BooleanField('Strip ignored lines')
    rss_hide_muted_watches = BooleanField('Hide muted watches from RSS feed', default=True,
                                      validators=[validators.Optional()])
    filter_failure_notification_threshold_attempts = IntegerField('Number of times the filter can be missing before sending a notification',
--- a/changedetectionio/model/App.py
+++ b/changedetectionio/model/App.py
@@ -57,7 +57,6 @@ class model(dict):
                    'rss_hide_muted_watches': True,
                    'schema_version' : 0,
                    'shared_diff_access': False,
-                    'strip_ignored_lines': False,
                    'tags': {}, #@todo use Tag.model initialisers
                    'timezone': None, # Default IANA timezone name
                    'webdriver_delay': None , # Extra delay in seconds before extracting text
--- a/changedetectionio/model/init.py
+++ b/changedetectionio/model/init.py
@@ -58,7 +58,6 @@ class watch_base(dict):
            'proxy': None,  # Preferred proxy connection
            'remote_server_reply': None,  # From 'server' reply header
            'sort_text_alphabetically': False,
-            'strip_ignored_lines': None,
            'subtractive_selectors': [],
            'tag': '',  # Old system of text name for a tag, to be removed
            'tags': [],  # list of UUIDs to App.Tags
--- a/changedetectionio/processors/magic.py
+++ b/changedetectionio/processors/magic.py
@@ -1,125 +0,0 @@
-"""
-Content Type Detection and Stream Classification
-
-This module provides intelligent content-type detection for changedetection.io.
-It addresses the common problem where HTTP Content-Type headers are missing, incorrect,
-or too generic, which would otherwise cause the wrong processor to be used.
-
-The guess_stream_type class combines:
-1. HTTP Content-Type headers (when available and reliable)
-2. Python-magic library for MIME detection (analyzing actual file content)
-3. Content-based pattern matching for text formats (HTML tags, XML declarations, etc.)
-
-This multi-layered approach ensures accurate detection of RSS feeds, JSON, HTML, PDF,
-plain text, CSV, YAML, and XML formats - even when servers provide misleading headers.
-
-Used by: processors/text_json_diff/processor.py and other content processors
-"""
-
-# When to apply the 'cdata to real HTML' hack
-RSS_XML_CONTENT_TYPES = [
-    "application/rss+xml",
-    "application/rdf+xml",
-    "application/atom+xml",
-    "text/rss+xml",  # rare, non-standard
-    "application/x-rss+xml",  # legacy (older feed software)
-    "application/x-atom+xml",  # legacy (older Atom)
-]
-
-# JSON Content-types
-JSON_CONTENT_TYPES = [
-    "application/activity+json",
-    "application/feed+json",
-    "application/json",
-    "application/ld+json",
-    "application/vnd.api+json",
-]
-
-
-# Generic XML Content-types (non-RSS/Atom)
-XML_CONTENT_TYPES = [
-    "text/xml",
-    "application/xml",
-]
-
-HTML_PATTERNS = ['<!doctype html', '<html', '<head', '<body', '<script', '<iframe', '<div']
-
-from loguru import logger
-
-class guess_stream_type():
-    is_pdf = False
-    is_json = False
-    is_html = False
-    is_plaintext = False
-    is_rss = False
-    is_csv = False
-    is_xml = False  # Generic XML, not RSS/Atom
-    is_yaml = False
-
-    def __init__(self, http_content_header, content):
-        import re
-        magic_content_header = http_content_header
-        test_content = content[:200].lower().strip()
-
-        # Remove whitespace between < and tag name for robust detection (handles '< html', '<\nhtml', etc.)
-        test_content_normalized = re.sub(r'<\s+', '<', test_content)
-
-        # Magic will sometimes call text/plain as text/html!
-        magic_result = None
-        try:
-            import magic
-
-            mime = magic.from_buffer(content[:200], mime=True) # Send the original content
-            logger.debug(f"Guessing mime type, original content_type '{http_content_header}', mime type detected '{mime}'")
-            if mime and "/" in mime:
-                magic_result = mime
-                # Ignore generic/fallback mime types from magic
-                if mime in ['application/octet-stream', 'application/x-empty', 'binary']:
-                    logger.debug(f"Ignoring generic mime type '{mime}' from magic library")
-                # Trust magic for non-text types immediately
-                elif mime not in ['text/html', 'text/plain']:
-                    magic_content_header = mime
-
-        except Exception as e:
-            logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}), using content-based detection")
-
-        # Content-based detection (most reliable for text formats)
-        # Check for HTML patterns first - if found, override magic's text/plain
-        has_html_patterns = any(p in test_content_normalized for p in HTML_PATTERNS)
-
-        # Always trust headers first
-        if 'text/plain' in http_content_header:
-            self.is_plaintext = True
-        if any(s in http_content_header for s in RSS_XML_CONTENT_TYPES):
-            self.is_rss = True
-        elif any(s in http_content_header for s in JSON_CONTENT_TYPES):
-            self.is_json = True
-        elif any(s in http_content_header for s in XML_CONTENT_TYPES):
-            # Only mark as generic XML if not already detected as RSS
-            if not self.is_rss:
-                self.is_xml = True
-        elif 'pdf' in magic_content_header:
-            self.is_pdf = True
-###
-        elif has_html_patterns or http_content_header == 'text/html':
-            self.is_html = True
-        # If magic says text/plain and we found no HTML patterns, trust it
-        elif magic_result == 'text/plain':
-            self.is_plaintext = True
-            logger.debug(f"Trusting magic's text/plain result (no HTML patterns detected)")
-        elif any(s in magic_content_header for s in JSON_CONTENT_TYPES):
-            self.is_json = True
-        # magic will call a rss document 'xml'
-        elif '<rss' in test_content_normalized or '<feed' in test_content_normalized or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES):
-            self.is_rss = True
-        elif test_content_normalized.startswith('<?xml') or any(s in magic_content_header for s in XML_CONTENT_TYPES):
-            # Generic XML that's not RSS/Atom (RSS/Atom checked above)
-            self.is_xml = True
-        elif '%pdf-1' in test_content:
-            self.is_pdf = True
-        elif http_content_header.startswith('text/'):
-            self.is_plaintext = True
-        # Only trust magic for 'text' if no other patterns matched
-        elif 'text' in magic_content_header:
-            self.is_plaintext = True
-
--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@@ -13,8 +13,6 @@ from changedetectionio import html_tools, content_fetchers
 from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
 from loguru import logger

-from changedetectionio.processors.magic import guess_stream_type
-
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

 name = 'Webpage Text/HTML, JSON and PDF changes'
@@ -22,9 +20,6 @@ description = 'Detects all text changes where possible'

 json_filter_prefixes = ['json:', 'jq:', 'jqraw:']

-# Assume it's this type if the server says nothing on content-type
-DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER = 'text/html'
-
 class FilterNotFoundInResponse(ValueError):
    def __init__(self, msg, screenshot=None, xpath_data=None):
        self.screenshot = screenshot
@@ -50,9 +45,6 @@ class perform_site_check(difference_detection_processor):
        if not watch:
            raise Exception("Watch no longer exists.")

-        ctype_header = self.fetcher.get_all_headers().get('content-type', DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER).lower()
-        stream_content_type = guess_stream_type(http_content_header=ctype_header, content=self.fetcher.content)
-
        # Unset any existing notification error
        update_obj = {'last_notification_error': False, 'last_error': False}

@@ -62,7 +54,7 @@ class perform_site_check(difference_detection_processor):
        self.xpath_data = self.fetcher.xpath_data

        # Track the content type
-        update_obj['content_type'] = ctype_header
+        update_obj['content_type'] = self.fetcher.get_all_headers().get('content-type', '').lower()

        # Watches added automatically in the queue manager will skip if its the same checksum as the previous run
        # Saves a lot of CPU
@@ -77,12 +69,24 @@ class perform_site_check(difference_detection_processor):
        # https://stackoverflow.com/questions/41817578/basic-method-chaining ?
        # return content().textfilter().jsonextract().checksumcompare() ?

+        is_json = 'application/json' in self.fetcher.get_all_headers().get('content-type', '').lower()
+        is_html = not is_json
+        is_rss = False

+        ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()
        # Go into RSS preprocess for converting CDATA/comment to usable text
-        if stream_content_type.is_rss:
-            self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
+        if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
+            if '<rss' in self.fetcher.content[:100].lower():
+                self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
+                is_rss = True

-        if watch.is_pdf or stream_content_type.is_pdf:
+        # 'source:' URL support - basically treat it as plaintext (neither HTML nor JSON)
+        if watch.is_source_type_url:
+            is_html = False
+            is_json = False
+
+        inline_pdf = self.fetcher.get_all_headers().get('content-disposition', '') and '%PDF-1' in self.fetcher.content[:10]
+        if watch.is_pdf or 'application/pdf' in self.fetcher.get_all_headers().get('content-type', '').lower() or inline_pdf:
            from shutil import which
            tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml")
            if not which(tool):
@@ -126,12 +130,11 @@ class perform_site_check(difference_detection_processor):
        has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip())
        has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip())

-        if stream_content_type.is_json:
-            if not has_filter_rule:
-                # Force a reformat
-                include_filters_rule.append("json:$")
-                has_filter_rule = True
+        if is_json and not has_filter_rule:
+            include_filters_rule.append("json:$")
+            has_filter_rule = True

+        if is_json:
            # Sort the JSON so we dont get false alerts when the content is just re-ordered
            try:
                self.fetcher.content = json.dumps(json.loads(self.fetcher.content), sort_keys=True)
@@ -139,25 +142,34 @@ class perform_site_check(difference_detection_processor):
                # Might have just been a snippet, or otherwise bad JSON, continue
                pass

-            if has_filter_rule:
-                for filter in include_filters_rule:
-                    if any(prefix in filter for prefix in json_filter_prefixes):
-                        stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
-                        if stripped_text_from_html:
-                            stream_content_type.is_json = True
-                            stream_content_type.is_html = False
+        if has_filter_rule:
+            for filter in include_filters_rule:
+                if any(prefix in filter for prefix in json_filter_prefixes):
+                    stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
+                    is_html = False

-        # We have 'watch.is_source_type_url' because we should be able to use selectors on the raw HTML but return just that selected HTML
-        if stream_content_type.is_html or watch.is_source_type_url or stream_content_type.is_plaintext or stream_content_type.is_rss or stream_content_type.is_xml or stream_content_type.is_pdf:
+        if is_html or watch.is_source_type_url:

            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
            self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
            html_content = self.fetcher.content
+            content_type = self.fetcher.get_all_headers().get('content-type', '').lower()
+            is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower()

-            # Some kind of "text" but definitely not RSS looking
-            if stream_content_type.is_plaintext:
+            # Try to detect a better mime type if it's a download or not announced as HTML
+            if is_attachment or 'octet-stream' in content_type or not 'html' in content_type:
+                logger.debug(f"Got a reply that may be a download or possibly a text attachment, checking..")
+                try:
+                    import magic
+                    mime = magic.from_buffer(html_content, mime=True)
+                    logger.debug(f"Guessing mime type, original content_type '{content_type}', mime type detected '{mime}'")
+                    if mime and "/" in mime: # looks valid and is a valid mime type
+                        content_type = mime
+                except Exception as e:
+                    logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}")
+
+            if 'text/' in content_type and not 'html' in content_type:
                # Don't run get_text or xpath/css filters on plaintext
-                # We are not HTML, we are not any kind of RSS, doesnt even look like HTML
                stripped_text_from_html = html_content
            else:
                # If not JSON, and if it's not text/plain..
@@ -174,13 +186,13 @@ class perform_site_check(difference_detection_processor):
                            html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
                                                                    html_content=self.fetcher.content,
                                                                    append_pretty_line_formatting=not watch.is_source_type_url,
-                                                                    is_rss=stream_content_type.is_rss)
+                                                                    is_rss=is_rss)

                        elif filter_rule.startswith('xpath1:'):
                            html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
                                                                     html_content=self.fetcher.content,
                                                                     append_pretty_line_formatting=not watch.is_source_type_url,
-                                                                     is_rss=stream_content_type.is_rss)
+                                                                     is_rss=is_rss)
                        else:
                            html_content += html_tools.include_filters(include_filters=filter_rule,
                                                                       html_content=self.fetcher.content,
@@ -199,7 +211,7 @@ class perform_site_check(difference_detection_processor):
                    do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
                    stripped_text_from_html = html_tools.html_to_text(html_content=html_content,
                                                                      render_anchor_tag_content=do_anchor,
-                                                                      is_rss=stream_content_type.is_rss)  # 1874 activate the <title workaround hack
+                                                                      is_rss=is_rss)  # 1874 activate the <title workaround hack

        if watch.get('trim_text_whitespace'):
            stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
@@ -238,7 +250,7 @@ class perform_site_check(difference_detection_processor):

        # Treat pages with no renderable text content as a change? No by default
        empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
-        if not stream_content_type.is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
+        if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
            raise content_fetchers.exceptions.ReplyWithContentButNoText(url=url,
                                                            status_code=self.fetcher.get_last_status_code(),
                                                            screenshot=self.fetcher.screenshot,
@@ -303,11 +315,6 @@ class perform_site_check(difference_detection_processor):
        text_for_checksuming = stripped_text_from_html
        if text_to_ignore:
            text_for_checksuming = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore)
-            # Some people prefer to also completely remove it
-            strip_ignored_lines = watch.get('strip_ignored_lines') if watch.get('strip_ignored_lines') is not None else self.datastore.data['settings']['application'].get('strip_ignored_lines')
-            if strip_ignored_lines:
-                # @todo add test in the 'preview' mode, check the widget works? compare to datastruct
-                stripped_text_from_html = text_for_checksuming

        # Re #133 - if we should strip whitespaces from triggering the change detected comparison
        if text_for_checksuming and self.datastore.data['settings']['application'].get('ignore_whitespace', False):
--- a/changedetectionio/static/styles/scss/parts/_widgets.scss
+++ b/changedetectionio/static/styles/scss/parts/_widgets.scss
@@ -34,6 +34,7 @@
      transition: all 0.2s ease;
      cursor: pointer;
      display: block;
+      min-width: 60px;
      text-align: center;
    }

--- a/changedetectionio/static/styles/styles.css
+++ b/changedetectionio/static/styles/styles.css
--- a/changedetectionio/templates/edit/text-options.html
+++ b/changedetectionio/templates/edit/text-options.html
@@ -26,10 +26,7 @@
                            <li>Changing this will affect the comparison checksum which may trigger an alert</li>
                        </ul>
                </span>
-                <br><br>
-                    <div class="pure-control-group">
-                      {{ render_ternary_field(form.strip_ignored_lines) }}
-                    </div>
+
                </fieldset>

                <fieldset>
--- a/changedetectionio/tests/test_backend.py
+++ b/changedetectionio/tests/test_backend.py
@@ -167,20 +167,6 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
    assert b'Deleted' in res.data

 def test_non_text_mime_or_downloads(client, live_server, measure_memory_usage):
-    """
-
-    https://github.com/dgtlmoon/changedetection.io/issues/3434
-    I noticed that a watched website can be monitored fine as long as the server sends content-type: text/plain; charset=utf-8,
-    but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog),
-    changedetection somehow ignores all line breaks and treats the document file as if everything is on one line.
-
-    WHAT THIS DOES - makes the system rely on 'magic' to determine what is it
-
-    :param client:
-    :param live_server:
-    :param measure_memory_usage:
-    :return:
-    """
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write("""some random text that should be split by line
 and not parsed with html_to_text
@@ -229,102 +215,3 @@ got it\r\n

    res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)

-
-def test_standard_text_plain(client, live_server, measure_memory_usage):
-    """
-
-    https://github.com/dgtlmoon/changedetection.io/issues/3434
-    I noticed that a watched website can be monitored fine as long as the server sends content-type: text/plain; charset=utf-8,
-    but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog),
-    changedetection somehow ignores all line breaks and treats the document file as if everything is on one line.
-
-    The real bug here can be that it will try to process plain-text as HTML, losing <etc>
-
-    :param client:
-    :param live_server:
-    :param measure_memory_usage:
-    :return:
-    """
-    with open("test-datastore/endpoint-content.txt", "w") as f:
-        f.write("""some random text that should be split by line
-and not parsed with html_to_text
-<title>Even this title should stay because we are just plain text</title>
-this way we know that it correctly parsed as plain text
-\r\n
-ok\r\n
-got it\r\n
-""")
-
-    test_url = url_for('test_endpoint', content_type="text/plain", _external=True)
-
-    # Add our URL to the import page
-    res = client.post(
-        url_for("imports.import_page"),
-        data={"urls": test_url},
-        follow_redirects=True
-    )
-
-    assert b"1 Imported" in res.data
-
-    wait_for_all_checks(client)
-
-    ### check the front end
-    res = client.get(
-        url_for("ui.ui_views.preview_page", uuid="first"),
-        follow_redirects=True
-    )
-
-    assert b"some random text that should be split by line\n" in res.data
-    ####
-
-    # Check the snapshot by API that it has linefeeds too
-    watch_uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
-    api_key = live_server.app.config['DATASTORE'].data['settings']['application'].get('api_access_token')
-    res = client.get(
-        url_for("watchhistory", uuid=watch_uuid),
-        headers={'x-api-key': api_key},
-    )
-
-    # Fetch a snapshot by timestamp, check the right one was found
-    res = client.get(
-        url_for("watchsinglehistory", uuid=watch_uuid, timestamp=list(res.json.keys())[-1]),
-        headers={'x-api-key': api_key},
-    )
-    assert b"some random text that should be split by line\n" in res.data
-    assert b"<title>Even this title should stay because we are just plain text</title>" in res.data
-
-    res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
-
-# Server says its plaintext, we should always treat it as plaintext
-def test_plaintext_even_if_xml_content(client, live_server, measure_memory_usage):
-
-    with open("test-datastore/endpoint-content.txt", "w") as f:
-        f.write("""<?xml version="1.0" encoding="utf-8"?>
-<resources xmlns:tools="http://schemas.android.com/tools">
-    <!--Activity and fragment titles-->
-    <string name="feed_update_receiver_name">Abonnementen bijwerken</string>
-</resources>
-""")
-
-    test_url = url_for('test_endpoint', content_type="text/plain", _external=True)
-
-    # Add our URL to the import page
-    res = client.post(
-        url_for("imports.import_page"),
-        data={"urls": test_url},
-        follow_redirects=True
-    )
-
-    assert b"1 Imported" in res.data
-
-    wait_for_all_checks(client)
-
-    res = client.get(
-        url_for("ui.ui_views.preview_page", uuid="first"),
-        follow_redirects=True
-    )
-
-    assert b'&lt;string name=&#34;feed_update_receiver_name&#34;' in res.data
-
-    res = client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
-
--- a/changedetectionio/tests/test_group.py
+++ b/changedetectionio/tests/test_group.py
@@ -264,6 +264,8 @@ def test_limit_tag_ui(client, live_server, measure_memory_usage):
    client.get(url_for('ui.mark_all_viewed', tag=tag_uuid), follow_redirects=True)
    wait_for_all_checks(client)

+    with open('/tmp/fuck.html', 'wb') as f:
+        f.write(res.data)
    # Should be only 1 unviewed
    res = client.get(url_for("watchlist.index"))
    assert res.data.count(b' unviewed ') == 1
--- a/changedetectionio/tests/test_history_consistency.py
+++ b/changedetectionio/tests/test_history_consistency.py
@@ -3,8 +3,9 @@
 import time
 import os
 import json
+import logging
 from flask import url_for
-from .util import wait_for_all_checks
+from .util import live_server_setup, wait_for_all_checks
 from urllib.parse import urlparse, parse_qs

 def test_consistent_history(client, live_server, measure_memory_usage):
--- a/changedetectionio/tests/test_ignore.py
+++ b/changedetectionio/tests/test_ignore.py
@@ -58,39 +58,3 @@ def test_ignore(client, live_server, measure_memory_usage):
    # Should be in base.html
    assert b'csrftoken' in res.data

-
-def test_strip_ignore_lines(client, live_server, measure_memory_usage):
-   #  live_server_setup(live_server) # Setup on conftest per function
-    set_original_ignore_response()
-
-
-    # Goto the settings page, add our ignore text
-    res = client.post(
-        url_for("settings.settings_page"),
-        data={
-            "requests-time_between_check-minutes": 180,
-            "application-ignore_whitespace": "y",
-            "application-strip_ignored_lines": "y",
-            "application-global_ignore_text": "Which is across multiple",
-            'application-fetch_backend': "html_requests"
-        },
-        follow_redirects=True
-    )
-    assert b"Settings updated." in res.data
-
-    test_url = url_for('test_endpoint', _external=True)
-    res = client.post(
-        url_for("imports.import_page"),
-        data={"urls": test_url},
-        follow_redirects=True
-    )
-    assert b"1 Imported" in res.data
-
-    # Give the thread time to pick it up
-    wait_for_all_checks(client)
-    uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
-
-    # It should not be in the preview anymore
-    res = client.get(url_for("ui.ui_views.preview_page", uuid=uuid))
-    assert b'<div class="ignored">' not in res.data
-    assert b'Which is across multiple' not in res.data
--- a/changedetectionio/tests/test_rss.py
+++ b/changedetectionio/tests/test_rss.py
@@ -111,7 +111,7 @@ def test_basic_cdata_rss_markup(client, live_server, measure_memory_usage):

    set_original_cdata_xml()

-    test_url = url_for('test_endpoint', content_type="application/atom+xml; charset=UTF-8", _external=True)
+    test_url = url_for('test_endpoint', content_type="application/xml", _external=True)

    # Add our URL to the import page
    res = client.post(
@@ -139,7 +139,7 @@ def test_rss_xpath_filtering(client, live_server, measure_memory_usage):

    set_original_cdata_xml()

-    test_url = url_for('test_endpoint', content_type="application/atom+xml; charset=UTF-8", _external=True)
+    test_url = url_for('test_endpoint', content_type="application/xml", _external=True)

    res = client.post(
        url_for("ui.ui_views.form_quick_watch_add"),
--- a/changedetectionio/tests/test_xpath_selector.py
+++ b/changedetectionio/tests/test_xpath_selector.py
@@ -1,42 +1,12 @@
 # -*- coding: utf-8 -*-

-
+import time
 from flask import url_for
-from .util import  wait_for_all_checks
-from ..processors.magic import RSS_XML_CONTENT_TYPES
+from .util import live_server_setup, wait_for_all_checks
+
+from ..html_tools import *


-def set_rss_atom_feed_response(header=''):
-    test_return_data = f"""{header}<!-- Generated on Wed, 08 Oct 2025 08:42:33 -0700, really really honestly  -->
-<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
-<channel>
-    <atom:link href="https://store.waterpowered.com/news/collection//" rel="self" type="application/rss+xml"/>
-    <title>RSS Feed</title>
-    <link>
-        <![CDATA[ https://store.waterpowered.com/news/collection// ]]>
-    </link>
-    <description>
-        <![CDATA[ Events and Announcements for ]]>
-    </description>
-    <language>en-us</language>
-    <generator>water News RSS</generator>
-    <item>
-        <title> 🍁 Lets go discount</title>
-        <description><p class="bb_paragraph">ok heres the description</p></description>
-        <link>
-        <![CDATA[ https://store.waterpowered.com/news/app/1643320/view/511845698831908921 ]]>
-        </link>
-        <pubDate>Wed, 08 Oct 2025 15:28:55 +0000</pubDate>
-        <guid isPermaLink="true">https://store.waterpowered.com/news/app/1643320/view/511845698831908921</guid>
-        <enclosure url="https://clan.fastly.waterstatic.com/images/40721482/42822e5f00b2becf520ace9500981bb56f3a89f2.jpg" length="0" type="image/jpeg"/>
-    </item>
-</channel>
-</rss>"""
-
-    with open("test-datastore/endpoint-content.txt", "w") as f:
-        f.write(test_return_data)
-
-    return None



@@ -605,47 +575,3 @@ def test_xpath_20_function_string_join_matches(client, live_server, measure_memo

    client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)

-
-def _subtest_xpath_rss(client, content_type='text/html'):
-
-    # Add our URL to the import page
-    test_url = url_for('test_endpoint', content_type=content_type, _external=True)
-    res = client.post(
-        url_for("ui.ui_views.form_quick_watch_add"),
-        data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
-        follow_redirects=True
-    )
-
-    assert b"Watch added in Paused state, saving will unpause" in res.data
-
-    res = client.post(
-        url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1),
-        data={
-            "url": test_url,
-            "include_filters": "xpath://item",
-            "tags": '',
-            "fetch_backend": "html_requests",
-            "time_between_check_use_default": "y",
-        },
-        follow_redirects=True
-    )
-
-    assert b"unpaused" in res.data
-    wait_for_all_checks(client)
-
-    res = client.get(
-        url_for("ui.ui_views.preview_page", uuid="first"),
-        follow_redirects=True
-    )
-
-    assert b"Lets go discount" in res.data, f"When testing for Lets go discount called with content type '{content_type}'"
-    assert b"Events and Announcements" not in res.data, f"When testing for Lets go discount called with content type '{content_type}'" # It should not be here because thats not our selector target
-
-    client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
-
-# Be sure all-in-the-wild types of RSS feeds work with xpath
-def test_rss_xpath(client, live_server):
-    for feed_header in ['', '<?xml version="1.0" encoding="utf-8"?>']:
-        set_rss_atom_feed_response(header=feed_header)
-        for content_type in RSS_XML_CONTENT_TYPES:
-            _subtest_xpath_rss(client, content_type=content_type)
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,7 +12,7 @@ flask_wtf~=1.2
 flask~=2.3
 flask-socketio~=5.5.1
 python-socketio~=5.13.0
-python-engineio~=4.12.3
+python-engineio~=4.12.0
 inscriptis~=2.2
 pytz
 timeago~=1.0
@@ -39,7 +39,7 @@ jsonpath-ng~=1.5.3
 # jq not available on Windows so must be installed manually

 # Notification library
-apprise==1.9.5
+apprise==1.9.4

 # - Needed for apprise/spush, and maybe others? hopefully doesnt trigger a rust compile.
 # - Requires extra wheel for rPi, adds build time for arm/v8 which is not in piwheels
@@ -135,7 +135,7 @@ tzdata
 pluggy ~= 1.5

 # Needed for testing, cross-platform for process and system monitoring
-psutil==7.1.0
+psutil==7.0.0

 ruff >= 0.11.2
 pre_commit >= 4.2.0
Author	SHA1	Message	Date
dgtlmoon	3704580990	Pin bs4	2025-09-29 11:14:42 +02:00
dgtlmoon	e2fa021f80	Attempt to fix socks test server test	2025-09-29 10:37:25 +02:00