Byte Order mark detection

No encoding in HTTP header -> Try to get it from the document -> use chardet last resort
Improve logging message
2026-03-16 08:56:02 +00:00 · 2026-03-05 12:38:08 +01:00 · 2026-03-05 12:35:58 +01:00 · 2026-03-05 11:44:38 +01:00 · 2026-03-05 11:31:03 +01:00 · 2026-03-05 11:12:04 +01:00
3 changed files with 100 additions and 4 deletions
--- a/changedetectionio/content_fetchers/requests.py
+++ b/changedetectionio/content_fetchers/requests.py
@@ -148,10 +148,32 @@ class fetcher(Fetcher):
                        # Default to UTF-8 for XML if no encoding found
                        r.encoding = 'utf-8'
                else:
-                    # For other content types, use chardet
-                    encoding = chardet.detect(r.content)['encoding']
-                    if encoding:
-                        r.encoding = encoding
+                    # No charset in HTTP header - sniff encoding in priority order matching browsers
+                    # (WHATWG encoding sniffing algorithm):
+                    # 1. BOM - highest confidence, check before anything else
+                    # 2. <meta ... charset=...> declaration (plain or http-equiv form) in the first 2000 bytes
+                    # 3. chardet statistical detection - last resort
+                    # See: https://github.com/dgtlmoon/changedetection.io/issues/3952
+                    boms = [
+                        (b'\xef\xbb\xbf', 'utf-8-sig'),
+                        (b'\xff\xfe', 'utf-16-le'),
+                        (b'\xfe\xff', 'utf-16-be'),
+                    ]
+                    bom_encoding = next((enc for bom, enc in boms if r.content.startswith(bom)), None)
+                    if bom_encoding:
+                        logger.info(f"URL: {url} Using encoding '{bom_encoding}' detected from BOM")
+                        r.encoding = bom_encoding
+                    else:
+                        meta_charset_match = re.search(rb'<meta[^>]+charset\s*=\s*["\']?\s*([^"\'\s;>]+)', r.content[:2000], re.IGNORECASE)
+                        if meta_charset_match:
+                            encoding = meta_charset_match.group(1).decode('ascii', errors='ignore')
+                            logger.info(f"URL: {url} No content-type encoding in HTTP headers - Using encoding '{encoding}' from HTML meta charset tag")
+                            r.encoding = encoding
+                        else:
+                            encoding = chardet.detect(r.content)['encoding']
+                            logger.warning(f"URL: {url} No charset in headers or meta tag, guessed encoding as '{encoding}' via chardet")
+                            if encoding:
+                                r.encoding = encoding

        self.headers = r.headers

--- a/changedetectionio/processors/base.py
+++ b/changedetectionio/processors/base.py
@@ -260,6 +260,16 @@ class difference_detection_processor():
        # @todo .quit here could go on close object, so we can run JS if change-detected
        await self.fetcher.quit(watch=self.watch)

+        # Sanitize lone surrogates - these can appear when servers return malformed/mixed-encoding
+        # content that gets decoded into surrogate characters (e.g. \udcad). Without this,
+        # encode('utf-8') raises UnicodeEncodeError downstream in checksums, diffs, file writes, etc.
+        # Covers all fetchers (requests, playwright, puppeteer, selenium) in one place.
+        # Note: by this point the original encoding should already be known, so the content can safely be converted to UTF-8 for the rest of the app.
+        # See: https://github.com/dgtlmoon/changedetection.io/issues/3952
+
+        if self.fetcher.content and isinstance(self.fetcher.content, str):
+            self.fetcher.content = self.fetcher.content.encode('utf-8', errors='replace').decode('utf-8')
+
        # After init, call run_changedetection() which will do the actual change-detection

    def get_extra_watch_config(self, filename):
--- a/changedetectionio/tests/test_encoding.py
+++ b/changedetectionio/tests/test_encoding.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 # coding=utf-8

+import hashlib
 import time
 from flask import url_for
 from .util import live_server_setup, wait_for_all_checks, extract_UUID_from_client
@@ -11,6 +12,69 @@ import os



+def test_surrogate_characters_in_content_are_sanitized():
+    """Lone surrogates can appear in requests' r.text when a server returns malformed/mixed-encoding
+    content. Without sanitization, encoding to UTF-8 raises UnicodeEncodeError.
+    See: https://github.com/dgtlmoon/changedetection.io/issues/3952
+    """
+    content_with_surrogate = '<html><body>Hello \udcad World</body></html>'
+
+    # Confirm the raw problem exists
+    with pytest.raises(UnicodeEncodeError):
+        content_with_surrogate.encode('utf-8')
+
+    # Our fix: sanitize after fetcher.run() in processors/base.py call_browser()
+    sanitized = content_with_surrogate.encode('utf-8', errors='replace').decode('utf-8')
+    assert 'Hello' in sanitized
+    assert 'World' in sanitized
+    assert '\udcad' not in sanitized
+
+    # Checksum computation (processors/base.py get_raw_document_checksum) must not crash
+    hashlib.md5(sanitized.encode('utf-8')).hexdigest()
+
+
+def test_utf8_content_without_charset_header(client, live_server, datastore_path):
+    """Server returns UTF-8 content but no charset in Content-Type header.
+    chardet can misdetect such pages as UTF-7 (Python 3.14 then produces surrogates).
+    Our fix sniffs a BOM first, then a <meta charset> declaration, and only falls back to chardet as a last resort.
+    See: https://github.com/dgtlmoon/changedetection.io/issues/3952
+    """
+    from .util import write_test_file_and_sync
+    # UTF-8 encoded content with non-ASCII chars - no charset will be in the header
+    html = '<html><body><p>Español</p><p>Français</p><p>日本語</p></body></html>'
+    write_test_file_and_sync(os.path.join(datastore_path, "endpoint-content.txt"), html.encode('utf-8'), mode='wb')
+
+    test_url = url_for('test_endpoint', content_type="text/html", _external=True)
+    client.application.config.get('DATASTORE').add_watch(url=test_url)
+    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
+    wait_for_all_checks(client)
+
+    res = client.get(url_for("ui.ui_preview.preview_page", uuid="first"), follow_redirects=True)
+    # Should decode correctly as UTF-8, not produce mojibake (EspaÃ±ol) or replacement chars
+    assert 'Español'.encode('utf-8') in res.data
+    assert 'Français'.encode('utf-8') in res.data
+    assert '日本語'.encode('utf-8') in res.data
+
+
+def test_shiftjis_with_meta_charset(client, live_server, datastore_path):
+    """Server returns Shift-JIS content with no charset in HTTP header, but the HTML
+    declares the charset via <meta http-equiv="Content-Type" content="text/html;charset=Shift-JIS">. We should use the meta tag, not chardet.
+    Real-world case: https://github.com/dgtlmoon/changedetection.io/issues/3952
+    """
+    from .util import write_test_file_and_sync
+    japanese_text = '日本語のページ'
+    html = f'<html><head><meta http-equiv="Content-Type" content="text/html;charset=Shift-JIS"></head><body><p>{japanese_text}</p></body></html>'
+    write_test_file_and_sync(os.path.join(datastore_path, "endpoint-content.txt"), html.encode('shift_jis'), mode='wb')
+
+    test_url = url_for('test_endpoint', content_type="text/html", _external=True)
+    client.application.config.get('DATASTORE').add_watch(url=test_url)
+    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
+    wait_for_all_checks(client)
+
+    res = client.get(url_for("ui.ui_preview.preview_page", uuid="first"), follow_redirects=True)
+    assert japanese_text.encode('utf-8') in res.data
+
+
 def set_html_response(datastore_path):
    test_return_data = """
 <html><body><span class="nav_second_img_text">
Author	SHA1	Message	Date
dgtlmoon	cc6170dbb6	Byte Order mark detection Some checks failed Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled Details ChangeDetection.io App Test / lint-code (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built package works basically. (push) Has been cancelled Details Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled Details ChangeDetection.io App Test / test-application-3-14 (push) Has been cancelled Details	2026-03-05 12:38:08 +01:00
dgtlmoon	1577f4eb5b	No encoding in HTTP header -> Try to get it from the document -> use chardet last resort	2026-03-05 12:35:58 +01:00
dgtlmoon	99ca16c45d	Improve logging message	2026-03-05 11:44:38 +01:00
dgtlmoon	1453119516	More non standard encoding fixes	2026-03-05 11:31:03 +01:00
dgtlmoon	8e83643c70	testing CI	2026-03-05 11:12:04 +01:00
dgtlmoon	28a70c4e2a	Always replace/upgrade broken utf8 Re #3952	2026-03-05 10:59:06 +01:00