mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-10-30 22:27:52 +00:00 
			
		
		
		
	Compare commits
	
		
			7 Commits
		
	
	
		
			openapi-me
			...
			filters-co
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | 5db65bcafd | ||
|   | 86832228ed | ||
|   | bd10a1f7c4 | ||
|   | ccbfa1e20e | ||
|   | 29d34bcd22 | ||
|   | 9b4fb80bef | ||
|   | 2ff65b53fb | 
| @@ -77,11 +77,13 @@ class ScreenshotUnavailable(Exception): | ||||
|  | ||||
|  | ||||
| class ReplyWithContentButNoText(Exception): | ||||
|     def __init__(self, status_code, url, screenshot=None): | ||||
|     def __init__(self, status_code, url, screenshot=None, has_filters=False, html_content=''): | ||||
|         # Set this so we can use it in other parts of the app | ||||
|         self.status_code = status_code | ||||
|         self.url = url | ||||
|         self.screenshot = screenshot | ||||
|         self.has_filters = has_filters | ||||
|         self.html_content = html_content | ||||
|         return | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -314,7 +314,12 @@ class perform_site_check(difference_detection_processor): | ||||
|         # Treat pages with no renderable text content as a change? No by default | ||||
|         empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False) | ||||
|         if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0: | ||||
|             raise content_fetcher.ReplyWithContentButNoText(url=url, status_code=fetcher.get_last_status_code(), screenshot=screenshot) | ||||
|             raise content_fetcher.ReplyWithContentButNoText(url=url, | ||||
|                                                             status_code=fetcher.get_last_status_code(), | ||||
|                                                             screenshot=screenshot, | ||||
|                                                             has_filters=has_filter_rule, | ||||
|                                                             html_content=html_content | ||||
|                                                             ) | ||||
|  | ||||
|         # We rely on the actual text in the html output.. many sites have random script vars etc, | ||||
|         # in the future we'll implement other mechanisms. | ||||
|   | ||||
| @@ -2,7 +2,7 @@ | ||||
|  | ||||
| import time | ||||
| from flask import url_for | ||||
| from . util import live_server_setup | ||||
| from .util import live_server_setup, wait_for_all_checks | ||||
|  | ||||
| from ..html_tools import * | ||||
|  | ||||
| @@ -176,3 +176,77 @@ def test_check_multiple_filters(client, live_server): | ||||
|     assert b"Blob A" in res.data # CSS was ok | ||||
|     assert b"Blob B" in res.data # xPath was ok | ||||
|     assert b"Blob C" not in res.data # Should not be included | ||||
|  | ||||
| # The filter exists, but what it selected did not contain anything useful. | ||||
| # This mainly happens when the filter contains just an IMG, e.g. when someone selects an image in the visual-selector. | ||||
| # Tests that the fetcher can throw a "ReplyWithContentButNoText" exception after applying the filter and extracting text. | ||||
| def test_filter_is_empty_help_suggestion(client, live_server): | ||||
|     #live_server_setup(live_server) | ||||
|  | ||||
|     include_filters = "#blob-a" | ||||
|  | ||||
|     with open("test-datastore/endpoint-content.txt", "w") as f: | ||||
|         f.write("""<html><body> | ||||
|          <div id="blob-a"> | ||||
|            <img src="something.jpg"> | ||||
|          </div> | ||||
|          </body> | ||||
|          </html> | ||||
|         """) | ||||
|  | ||||
|  | ||||
|     # Add our URL to the import page | ||||
|     test_url = url_for('test_endpoint', _external=True) | ||||
|     res = client.post( | ||||
|         url_for("import_page"), | ||||
|         data={"urls": test_url}, | ||||
|         follow_redirects=True | ||||
|     ) | ||||
|     assert b"1 Imported" in res.data | ||||
|     wait_for_all_checks(client) | ||||
|  | ||||
|     # Go to the edit page and set the include filter on the watch we just imported | ||||
|     res = client.post( | ||||
|         url_for("edit_page", uuid="first"), | ||||
|         data={"include_filters": include_filters, | ||||
|               "url": test_url, | ||||
|               "tags": "", | ||||
|               "headers": "", | ||||
|               'fetch_backend': "html_requests"}, | ||||
|         follow_redirects=True | ||||
|     ) | ||||
|     assert b"Updated watch." in res.data | ||||
|  | ||||
|     wait_for_all_checks(client) | ||||
|  | ||||
|  | ||||
|     res = client.get( | ||||
|         url_for("index"), | ||||
|         follow_redirects=True | ||||
|     ) | ||||
|  | ||||
|     assert b'empty result or contain only an image' in res.data | ||||
|  | ||||
|  | ||||
|     ### Just an empty selector, no image | ||||
|  | ||||
|     with open("test-datastore/endpoint-content.txt", "w") as f: | ||||
|         f.write("""<html><body> | ||||
|          <div id="blob-a"> | ||||
|            <!-- doo doo --> | ||||
|          </div> | ||||
|          </body> | ||||
|          </html> | ||||
|         """) | ||||
|  | ||||
|     res = client.get(url_for("form_watch_checknow"), follow_redirects=True) | ||||
|     wait_for_all_checks(client) | ||||
|  | ||||
|     res = client.get( | ||||
|         url_for("index"), | ||||
|         follow_redirects=True | ||||
|     ) | ||||
|  | ||||
|     assert b'empty result or contain only an image' not in res.data | ||||
|     assert b'but contained no usable text' in res.data | ||||
|   | ||||
| @@ -3,7 +3,7 @@ import threading | ||||
| import queue | ||||
| import time | ||||
|  | ||||
| from changedetectionio import content_fetcher | ||||
| from changedetectionio import content_fetcher, html_tools | ||||
| from .processors.text_json_diff import FilterNotFoundInResponse | ||||
| from .processors.restock_diff import UnableToExtractRestockData | ||||
|  | ||||
| @@ -251,7 +251,20 @@ class update_worker(threading.Thread): | ||||
|                         # Totally fine, it's by choice - just continue on, nothing more to care about | ||||
|                         # Page had elements/content but no renderable text | ||||
|                         # Backend (not filters) gave zero output | ||||
|                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found (With {} reply code).".format(e.status_code)}) | ||||
|                         extra_help = "" | ||||
|                         if e.has_filters: | ||||
|                             # Maybe it contains an image? offer a more helpful link | ||||
|                             has_img = html_tools.include_filters(include_filters='img', | ||||
|                                                                  html_content=e.html_content) | ||||
|                             if has_img: | ||||
|                                 extra_help = ", it's possible that the filters you have give an empty result or contain only an image <a href=\"https://github.com/dgtlmoon/changedetection.io/wiki/Detecting-changes-in-images\">more help here</a>." | ||||
|                             else: | ||||
|                                 extra_help = ", it's possible that the filters were found, but contained no usable text." | ||||
|  | ||||
|                         self.datastore.update_watch(uuid=uuid, update_obj={ | ||||
|                             'last_error': f"Got HTML content but no text found (With {e.status_code} reply code){extra_help}" | ||||
|                         }) | ||||
|  | ||||
|                         if e.screenshot: | ||||
|                             self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot) | ||||
|                         process_changedetection_results = False | ||||
|   | ||||
		Reference in New Issue
	
	Block a user