not here

fix punctuation
be more helpful
2025-10-30 22:27:52 +00:00 · 2023-09-26 13:40:13 +02:00 · 2023-09-26 13:33:24 +02:00 · 2023-09-26 13:31:31 +02:00 · 2023-09-26 13:30:30 +02:00 · 2023-09-26 13:26:50 +02:00
4 changed files with 99 additions and 5 deletions
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -77,11 +77,13 @@ class ScreenshotUnavailable(Exception):


 class ReplyWithContentButNoText(Exception):
-    def __init__(self, status_code, url, screenshot=None):
+    def __init__(self, status_code, url, screenshot=None, has_filters=False, html_content=''):
        # Set this so we can use it in other parts of the app
        self.status_code = status_code
        self.url = url
        self.screenshot = screenshot
+        self.has_filters = has_filters
+        self.html_content = html_content
        return


--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -314,7 +314,12 @@ class perform_site_check(difference_detection_processor):
        # Treat pages with no renderable text content as a change? No by default
        empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
        if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
-            raise content_fetcher.ReplyWithContentButNoText(url=url, status_code=fetcher.get_last_status_code(), screenshot=screenshot)
+            raise content_fetcher.ReplyWithContentButNoText(url=url,
+                                                            status_code=fetcher.get_last_status_code(),
+                                                            screenshot=screenshot,
+                                                            has_filters=has_filter_rule,
+                                                            html_content=html_content
+                                                            )

        # We rely on the actual text in the html output.. many sites have random script vars etc,
        # in the future we'll implement other mechanisms.
--- a/changedetectionio/tests/test_css_selector.py
+++ b/changedetectionio/tests/test_css_selector.py
@@ -2,7 +2,7 @@

 import time
 from flask import url_for
-from . util import live_server_setup
+from .util import live_server_setup, wait_for_all_checks

 from ..html_tools import *

@@ -176,3 +176,77 @@ def test_check_multiple_filters(client, live_server):
    assert b"Blob A" in res.data # CSS was ok
    assert b"Blob B" in res.data # xPath was ok
    assert b"Blob C" not in res.data # Should not be included
+
+# The filter exists, but what it selected did not contain anything useful.
+# Mainly seen when the filter contains just an IMG; this can happen when someone selects an image in the visual-selector.
+# Tests that a "ReplyWithContentButNoText" exception can be thrown after applying the filter and extracting text.
+def test_filter_is_empty_help_suggestion(client, live_server):
+    #live_server_setup(live_server)
+
+    include_filters = "#blob-a"
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write("""<html><body>
+         <div id="blob-a">
+           <img src="something.jpg">
+         </div>
+         </body>
+         </html>
+        """)
+
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+    wait_for_all_checks(client)
+
+    # Go to the edit page and set our include filter
+    # (same test URL, fetched with the "html_requests" backend)
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"include_filters": include_filters,
+              "url": test_url,
+              "tags": "",
+              "headers": "",
+              'fetch_backend': "html_requests"},
+        follow_redirects=True
+    )
+    assert b"Updated watch." in res.data
+
+    wait_for_all_checks(client)
+
+
+    res = client.get(
+        url_for("index"),
+        follow_redirects=True
+    )
+
+    assert b'empty result or contain only an image' in res.data
+
+
+    ### Just an empty selector, no image
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write("""<html><body>
+         <div id="blob-a">
+           <!-- doo doo -->
+         </div>
+         </body>
+         </html>
+        """)
+
+    res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    wait_for_all_checks(client)
+
+    res = client.get(
+        url_for("index"),
+        follow_redirects=True
+    )
+
+    assert b'empty result or contain only an image' not in res.data
+    assert b'but contained no usable text' in res.data
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -3,7 +3,7 @@ import threading
 import queue
 import time

-from changedetectionio import content_fetcher
+from changedetectionio import content_fetcher, html_tools
 from .processors.text_json_diff import FilterNotFoundInResponse
 from .processors.restock_diff import UnableToExtractRestockData

@@ -251,7 +251,20 @@ class update_worker(threading.Thread):
                        # Totally fine, it's by choice - just continue on, nothing more to care about
                        # Page had elements/content but no renderable text
                        # Backend (not filters) gave zero output
-                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found (With {} reply code).".format(e.status_code)})
+                        extra_help = ""
+                        if e.has_filters:
+                            # Maybe it contains an image? offer a more helpful link
+                            has_img = html_tools.include_filters(include_filters='img',
+                                                                 html_content=e.html_content)
+                            if has_img:
+                                extra_help = ", it's possible that the filters you have give an empty result or contain only an image <a href=\"https://github.com/dgtlmoon/changedetection.io/wiki/Detecting-changes-in-images\">more help here</a>."
+                            else:
+                                extra_help = ", it's possible that the filters were found, but contained no usable text."
+
+                        self.datastore.update_watch(uuid=uuid, update_obj={
+                            'last_error': f"Got HTML content but no text found (With {e.status_code} reply code){extra_help}"
+                        })
+
                        if e.screenshot:
                            self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot)
                        process_changedetection_results = False
Author	SHA1	Message	Date
dgtlmoon	5db65bcafd	not here	2023-09-26 13:40:13 +02:00
dgtlmoon	86832228ed	fix punctuation	2023-09-26 13:33:24 +02:00
dgtlmoon	bd10a1f7c4	be more helpful	2023-09-26 13:31:31 +02:00
dgtlmoon	ccbfa1e20e	more testing	2023-09-26 13:30:30 +02:00
dgtlmoon	29d34bcd22	better error	2023-09-26 13:26:50 +02:00
dgtlmoon	9b4fb80bef	Add test	2023-09-26 13:18:12 +02:00
dgtlmoon	2ff65b53fb	Try to be more helpful	2023-09-26 12:45:41 +02:00