mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-11-04 08:34:57 +00:00 
			
		
		
		
	Compare commits
	
		
			7 Commits
		
	
	
		
			minor-stoc
			...
			filters-co
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					5db65bcafd | ||
| 
						 | 
					86832228ed | ||
| 
						 | 
					bd10a1f7c4 | ||
| 
						 | 
					ccbfa1e20e | ||
| 
						 | 
					29d34bcd22 | ||
| 
						 | 
					9b4fb80bef | ||
| 
						 | 
					2ff65b53fb | 
@@ -77,11 +77,13 @@ class ScreenshotUnavailable(Exception):
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ReplyWithContentButNoText(Exception):
 | 
			
		||||
    def __init__(self, status_code, url, screenshot=None):
 | 
			
		||||
    def __init__(self, status_code, url, screenshot=None, has_filters=False, html_content=''):
 | 
			
		||||
        # Set this so we can use it in other parts of the app
 | 
			
		||||
        self.status_code = status_code
 | 
			
		||||
        self.url = url
 | 
			
		||||
        self.screenshot = screenshot
 | 
			
		||||
        self.has_filters = has_filters
 | 
			
		||||
        self.html_content = html_content
 | 
			
		||||
        return
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -314,7 +314,12 @@ class perform_site_check(difference_detection_processor):
 | 
			
		||||
        # Treat pages with no renderable text content as a change? No by default
 | 
			
		||||
        empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
 | 
			
		||||
        if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
 | 
			
		||||
            raise content_fetcher.ReplyWithContentButNoText(url=url, status_code=fetcher.get_last_status_code(), screenshot=screenshot)
 | 
			
		||||
            raise content_fetcher.ReplyWithContentButNoText(url=url,
 | 
			
		||||
                                                            status_code=fetcher.get_last_status_code(),
 | 
			
		||||
                                                            screenshot=screenshot,
 | 
			
		||||
                                                            has_filters=has_filter_rule,
 | 
			
		||||
                                                            html_content=html_content
 | 
			
		||||
                                                            )
 | 
			
		||||
 | 
			
		||||
        # We rely on the actual text in the html output.. many sites have random script vars etc,
 | 
			
		||||
        # in the future we'll implement other mechanisms.
 | 
			
		||||
 
 | 
			
		||||
@@ -2,7 +2,7 @@
 | 
			
		||||
 | 
			
		||||
import time
 | 
			
		||||
from flask import url_for
 | 
			
		||||
from . util import live_server_setup
 | 
			
		||||
from .util import live_server_setup, wait_for_all_checks
 | 
			
		||||
 | 
			
		||||
from ..html_tools import *
 | 
			
		||||
 | 
			
		||||
@@ -176,3 +176,77 @@ def test_check_multiple_filters(client, live_server):
 | 
			
		||||
    assert b"Blob A" in res.data # CSS was ok
 | 
			
		||||
    assert b"Blob B" in res.data # xPath was ok
 | 
			
		||||
    assert b"Blob C" not in res.data # Should not be included
 | 
			
		||||
 | 
			
		||||
# The filter exists, but did not contain anything useful
 | 
			
		||||
# Mainly used when the filter contains just an IMG, this can happen when someone selects an image in the visual-selector
 | 
			
		||||
# Tests that the fetcher can raise a "ReplyWithContentButNoText" exception after applying the filter and extracting text
 | 
			
		||||
def test_filter_is_empty_help_suggestion(client, live_server):
 | 
			
		||||
    #live_server_setup(live_server)
 | 
			
		||||
 | 
			
		||||
    include_filters = "#blob-a"
 | 
			
		||||
 | 
			
		||||
    with open("test-datastore/endpoint-content.txt", "w") as f:
 | 
			
		||||
        f.write("""<html><body>
 | 
			
		||||
         <div id="blob-a">
 | 
			
		||||
           <img src="something.jpg">
 | 
			
		||||
         </div>
 | 
			
		||||
         </body>
 | 
			
		||||
         </html>
 | 
			
		||||
        """)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    # Add our URL to the import page
 | 
			
		||||
    test_url = url_for('test_endpoint', _external=True)
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("import_page"),
 | 
			
		||||
        data={"urls": test_url},
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"1 Imported" in res.data
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
 | 
			
		||||
    # Go to the edit page and set our include filter
 | 
			
		||||
    # Submit the edit form for the watch (same URL, html_requests fetch backend)
 | 
			
		||||
    res = client.post(
 | 
			
		||||
        url_for("edit_page", uuid="first"),
 | 
			
		||||
        data={"include_filters": include_filters,
 | 
			
		||||
              "url": test_url,
 | 
			
		||||
              "tags": "",
 | 
			
		||||
              "headers": "",
 | 
			
		||||
              'fetch_backend': "html_requests"},
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
    assert b"Updated watch." in res.data
 | 
			
		||||
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    res = client.get(
 | 
			
		||||
        url_for("index"),
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    assert b'empty result or contain only an image' in res.data
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    ### Just an empty selector, no image
 | 
			
		||||
 | 
			
		||||
    with open("test-datastore/endpoint-content.txt", "w") as f:
 | 
			
		||||
        f.write("""<html><body>
 | 
			
		||||
         <div id="blob-a">
 | 
			
		||||
           <!-- doo doo -->
 | 
			
		||||
         </div>
 | 
			
		||||
         </body>
 | 
			
		||||
         </html>
 | 
			
		||||
        """)
 | 
			
		||||
 | 
			
		||||
    res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
 | 
			
		||||
    wait_for_all_checks(client)
 | 
			
		||||
 | 
			
		||||
    res = client.get(
 | 
			
		||||
        url_for("index"),
 | 
			
		||||
        follow_redirects=True
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    assert b'empty result or contain only an image' not in res.data
 | 
			
		||||
    assert b'but contained no usable text' in res.data
 | 
			
		||||
 
 | 
			
		||||
@@ -3,7 +3,7 @@ import threading
 | 
			
		||||
import queue
 | 
			
		||||
import time
 | 
			
		||||
 | 
			
		||||
from changedetectionio import content_fetcher
 | 
			
		||||
from changedetectionio import content_fetcher, html_tools
 | 
			
		||||
from .processors.text_json_diff import FilterNotFoundInResponse
 | 
			
		||||
from .processors.restock_diff import UnableToExtractRestockData
 | 
			
		||||
 | 
			
		||||
@@ -251,7 +251,20 @@ class update_worker(threading.Thread):
 | 
			
		||||
                        # Totally fine, it's by choice - just continue on, nothing more to care about
 | 
			
		||||
                        # Page had elements/content but no renderable text
 | 
			
		||||
                        # Backend (not filters) gave zero output
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found (With {} reply code).".format(e.status_code)})
 | 
			
		||||
                        extra_help = ""
 | 
			
		||||
                        if e.has_filters:
 | 
			
		||||
                            # Maybe it contains an image? offer a more helpful link
 | 
			
		||||
                            has_img = html_tools.include_filters(include_filters='img',
 | 
			
		||||
                                                                 html_content=e.html_content)
 | 
			
		||||
                            if has_img:
 | 
			
		||||
                                extra_help = ", it's possible that the filters you have give an empty result or contain only an image <a href=\"https://github.com/dgtlmoon/changedetection.io/wiki/Detecting-changes-in-images\">more help here</a>."
 | 
			
		||||
                            else:
 | 
			
		||||
                                extra_help = ", it's possible that the filters were found, but contained no usable text."
 | 
			
		||||
 | 
			
		||||
                        self.datastore.update_watch(uuid=uuid, update_obj={
 | 
			
		||||
                            'last_error': f"Got HTML content but no text found (With {e.status_code} reply code){extra_help}"
 | 
			
		||||
                        })
 | 
			
		||||
 | 
			
		||||
                        if e.screenshot:
 | 
			
		||||
                            self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot)
 | 
			
		||||
                        process_changedetection_results = False
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user