Mirror of https://github.com/dgtlmoon/changedetection.io.git (synced 2025-10-31 06:37:41 +00:00)
Compare commits: HTML-notif...revert-mul (4 commits)
Commits in this compare:

- 24b9e4dc83
- d481a6b7b1
- e38f264750
- 1ec86bd38d
```diff
@@ -436,55 +436,27 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
     return re.sub(pattern, repl, html_content)
 
 
-def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
+# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
     from inscriptis import get_text
     from inscriptis.model.config import ParserConfig
 
     """Converts html string to a string with just the text. If ignoring
     rendering anchor tag content is enable, anchor tag content are also
     included in the text
 
     :param html_content: string with html content
     :param render_anchor_tag_content: boolean flag indicating whether to extract
     hyperlinks (the anchor tag content) together with text. This refers to the
     'href' inside 'a' tags.
     Anchor tag content is rendered in the following manner:
     '[ text ](anchor tag content)'
     :return: extracted text from the HTML
     """
     #  if anchor tag content flag is set to True define a config for
     #  extracting this content
     if render_anchor_tag_content:
         parser_config = ParserConfig(
             annotation_rules={"a": ["hyperlink"]},
             display_links=True
         )
     # otherwise set config to None/default
     else:
         parser_config = None
 
     # RSS Mode - Inscriptis will treat `title` as something else.
     # Make it as a regular block display element (//item/title)
     # This is a bit of a hack - the real way it to use XSLT to convert it to HTML #1874
     if is_rss:
         html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
         html_content = re.sub(r'</title>', r'</h1>', html_content)
 
     text_content = get_text(html_content, config=parser_config)
-    conn.send(text_content)
-    conn.close()
-
-# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
-    from multiprocessing import Process, Pipe
-
-    parent_conn, child_conn = Pipe()
-    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
-    p.start()
-    text = parent_conn.recv()
-    p.join()
-    return text
+    return text_content
 
 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):
```
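This hunk reverts `html_to_text` from a fork-per-call design back to a direct call. The removed worker ran the inscriptis `get_text` conversion in a throwaway child process so that the small native-memory leak flagged in the NOTE comment was reclaimed by the OS when the child exited; the restored version keeps a `timeout` parameter in its signature. A minimal sketch of the isolation pattern the removed code used, with hypothetical names (`extract_text_isolated`, `_worker`) and a poll timeout added as an assumption rather than taken from the repo:

```python
from multiprocessing import Process, Pipe


def _worker(conn, html):
    # Stand-in for the real inscriptis/libxml call: any native memory it
    # leaks is freed by the OS when this child process exits.
    conn.send(html.upper())
    conn.close()


def extract_text_isolated(html, timeout=10):
    parent_conn, child_conn = Pipe()
    p = Process(target=_worker, args=(child_conn, html))
    p.start()
    # poll() guards against hanging forever if the child dies before
    # sending anything (an assumed safeguard, not in the original code).
    result = parent_conn.recv() if parent_conn.poll(timeout) else None
    p.join(timeout)
    if p.is_alive():
        p.terminate()
    return result


if __name__ == '__main__':
    print(extract_text_isolated("<p>hello</p>"))
```

The trade-off is process-spawn overhead on every conversion, which is presumably what motivated the revert to a plain in-process call.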
```diff
@@ -38,6 +38,9 @@ pytest tests/test_backend.py
 pytest tests/test_rss.py
 pytest tests/test_unique_lines.py
 
+# Try high concurrency
+FETCH_WORKERS=130 pytest tests/test_history_consistency.py -v -l
+
 # Check file:// will pickup a file when enabled
 echo "Hello world" > /tmp/test-file.txt
 ALLOW_FILE_URI=yes pytest tests/test_security.py
```
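The added lines document a high-concurrency test pass; `FETCH_WORKERS` is consumed by `test_history_consistency.py` in the next file. A small sketch of that env-driven sizing pattern, where the clamp is an assumption and not part of the actual test:

```python
import os


def workload_range():
    # Same pattern as the test: an env var with a default of 10 workers.
    workers = int(os.getenv("FETCH_WORKERS", 10))
    # Assumed guard against absurd values; the real test applies no clamp.
    workers = max(1, min(workers, 500))
    return range(1, 10 + workers)


if __name__ == '__main__':
    print(len(workload_range()))  # 19 by default, 139 with FETCH_WORKERS=130
```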
```diff
@@ -10,8 +10,8 @@ from urllib.parse import urlparse, parse_qs
 
 def test_consistent_history(client, live_server, measure_memory_usage):
     live_server_setup(live_server)
 
-    r = range(1, 30)
+    workers = int(os.getenv("FETCH_WORKERS", 10))
+    r = range(1, 10+workers)
 
     for one in r:
         test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)
@@ -46,9 +46,10 @@ def test_consistent_history(client, live_server, measure_memory_usage):
 
     # assert the right amount of watches was found in the JSON
     assert len(json_obj['watching']) == len(r), "Correct number of watches was found in the JSON"
 
+    i=0
     # each one should have a history.txt containing just one line
     for w in json_obj['watching'].keys():
+        i+=1
         history_txt_index_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, 'history.txt')
         assert os.path.isfile(history_txt_index_file), f"History.txt should exist where I expect it at {history_txt_index_file}"
@@ -58,8 +59,8 @@ def test_consistent_history(client, live_server, measure_memory_usage):
             assert len(tmp_history) == 1, "History.txt should contain 1 line"
 
         # Should be two files,. the history.txt , and the snapshot.txt
-        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path,
-                                                     w))
+        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w))
 
         # Find the snapshot one
         for fname in files_in_watch_dir:
             if fname != 'history.txt' and 'html' not in fname:
@@ -75,7 +76,6 @@ def test_consistent_history(client, live_server, measure_memory_usage):
 
         assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
 
-
     json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
     with open(json_db_file, 'r') as f:
         assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"
```
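These hunks size the watch count from `FETCH_WORKERS` instead of a hard-coded 30 and tidy the surrounding whitespace; the per-watch invariant itself is unchanged. A standalone sketch of that invariant, assuming the on-disk layout the test describes (a `history.txt` plus an html.br snapshot and an extracted text snapshot in each watch directory):

```python
import os


def check_watch_dir(watch_dir):
    # Each watch directory should index exactly one snapshot...
    history_file = os.path.join(watch_dir, 'history.txt')
    assert os.path.isfile(history_file), f"missing {history_file}"
    with open(history_file) as f:
        entries = [line for line in f if line.strip()]
    assert len(entries) == 1, "history.txt should contain exactly one line"
    # ...and hold exactly three files in total: history.txt, the html.br
    # snapshot, and the extracted text snapshot.
    assert len(os.listdir(watch_dir)) == 3, "expected exactly three files"
```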
```diff
@@ -126,18 +126,51 @@ def extract_UUID_from_client(client):
     uuid = m.group(1)
     return uuid.strip()
 
-def wait_for_all_checks(client):
-    # actually this is not entirely true, it can still be 'processing' but not in the queue
-    # Loop waiting until done..
-    attempt=0
-    # because sub-second rechecks are problematic in testing, use lots of delays
-    time.sleep(1)
-    while attempt < 60:
-        res = client.get(url_for("watchlist.index"))
-        if not b'Checking now' in res.data:
-            break
-        logging.getLogger().info(f"Waiting for watch-list to not say 'Checking now'.. {attempt}")
-        time.sleep(1)
+
+def wait_for_all_checks(client=None):
+    """
+    Waits until the queue is empty and remains empty for at least `required_empty_duration` seconds,
+    and also ensures no running threads have `current_uuid` set.
+    Retries for up to `max_attempts` times, sleeping `wait_between_attempts` seconds between checks.
+    """
+    from changedetectionio.flask_app import update_q as global_update_q, running_update_threads
+
+    # Configuration
+    attempt = 0
+    i=0
+    max_attempts = 60
+    wait_between_attempts = 2
+    required_empty_duration = 2
+
+    logger = logging.getLogger()
+    time.sleep(1.2)
+
+    empty_since = None
+
+    while attempt < max_attempts:
+        q_length = global_update_q.qsize()
+
+        # Check if any threads are still processing
+        time.sleep(1.2)
+        any_threads_busy = any(t.current_uuid for t in running_update_threads)
+
+
+        if q_length == 0 and not any_threads_busy:
+            if empty_since is None:
+                empty_since = time.time()
+                logger.info(f"Queue empty and no active threads at attempt {attempt}, starting empty timer...")
+            elif time.time() - empty_since >= required_empty_duration:
+                logger.info(f"Queue has been empty and threads idle for {required_empty_duration} seconds. Done waiting.")
+                break
+            else:
+                logger.info(f"Still waiting: queue empty and no active threads, but not yet {required_empty_duration} seconds...")
+        else:
+            if q_length != 0:
+                logger.info(f"Queue not empty (size={q_length}), resetting timer.")
+            if any_threads_busy:
+                busy_threads = [t.name for t in running_update_threads if t.current_uuid]
+                logger.info(f"Threads still busy: {busy_threads}, resetting timer.")
+            empty_since = None
         attempt += 1
 
     time.sleep(1)
```
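The rewritten `wait_for_all_checks` stops scraping the watch-list HTML for 'Checking now' and instead inspects the update queue and worker threads directly, requiring both to stay idle for `required_empty_duration` seconds so that a brief gap between a dequeue and a thread picking the job up is not mistaken for completion. A generic sketch of that settle-time pattern, with illustrative names:

```python
import time


def wait_until_settled(is_idle, settle=2.0, poll=1.0, max_wait=120.0):
    """Return True once is_idle() has held continuously for `settle` seconds."""
    deadline = time.time() + max_wait
    idle_since = None
    while time.time() < deadline:
        if is_idle():
            if idle_since is None:
                idle_since = time.time()  # start the settle window
            elif time.time() - idle_since >= settle:
                return True  # idle held for the full window
        else:
            idle_since = None  # any activity resets the window
        time.sleep(poll)
    return False


# Usage against the condition from the diff (update_q and
# running_update_threads as imported there):
# wait_until_settled(lambda: update_q.qsize() == 0
#                    and not any(t.current_uuid for t in running_update_threads))
```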