mirror of https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-08 18:47:32 +00:00

Compare commits: browser-no...revert-mul (4 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 24b9e4dc83 |  |
|  | d481a6b7b1 |  |
|  | e38f264750 |  |
|  | 1ec86bd38d |  |
```diff
@@ -436,55 +436,27 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
     return re.sub(pattern, repl, html_content)
 
 
-def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
+# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
     from inscriptis import get_text
     from inscriptis.model.config import ParserConfig
 
-    """Converts html string to a string with just the text. If ignoring
-    rendering anchor tag content is enable, anchor tag content are also
-    included in the text
-
-    :param html_content: string with html content
-    :param render_anchor_tag_content: boolean flag indicating whether to extract
-    hyperlinks (the anchor tag content) together with text. This refers to the
-    'href' inside 'a' tags.
-    Anchor tag content is rendered in the following manner:
-    '[ text ](anchor tag content)'
-    :return: extracted text from the HTML
-    """
-    # if anchor tag content flag is set to True define a config for
-    # extracting this content
     if render_anchor_tag_content:
         parser_config = ParserConfig(
             annotation_rules={"a": ["hyperlink"]},
             display_links=True
         )
-    # otherwise set config to None/default
     else:
         parser_config = None
 
-    # RSS Mode - Inscriptis will treat `title` as something else.
-    # Make it as a regular block display element (//item/title)
-    # This is a bit of a hack - the real way it to use XSLT to convert it to HTML #1874
     if is_rss:
         html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
         html_content = re.sub(r'</title>', r'</h1>', html_content)
 
     text_content = get_text(html_content, config=parser_config)
-    conn.send(text_content)
-    conn.close()
-
-
-# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
-    from multiprocessing import Process, Pipe
-
-    parent_conn, child_conn = Pipe()
-    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
-    p.start()
-    text = parent_conn.recv()
-    p.join()
-    return text
+    return text_content
 
 
 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):
```
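The head branch here (`revert-mul...`) folds the `Process`/`Pipe` wrapper back into a single in-process function; the NOTE comment explains why the wrapper existed at all (containing a small native-library memory leak), and the new signature carries a `timeout=10` parameter. For context, below is a minimal, self-contained sketch of that worker-isolation pattern with the `poll()` timeout guard the removed code did not have: a bare `parent_conn.recv()` blocks forever if the worker dies before sending, which is one reason such wrappers are fragile. `_worker` and `isolated_call` are illustrative names, not code from this repository.

```python
# Sketch only: worker isolation via multiprocessing, with a timeout guard.
from multiprocessing import Process, Pipe


def _worker(conn, html_content):
    # Stand-in for the leak-prone native-library call (e.g. inscriptis get_text)
    conn.send(html_content.upper())
    conn.close()


def isolated_call(html_content: str, timeout: int = 10) -> str:
    parent_conn, child_conn = Pipe()
    p = Process(target=_worker, args=(child_conn, html_content))
    p.start()
    try:
        # poll() returns False if nothing arrives within `timeout` seconds,
        # instead of hanging like a bare recv() would
        if not parent_conn.poll(timeout):
            raise TimeoutError(f"worker produced no result within {timeout}s")
        return parent_conn.recv()
    finally:
        p.join(timeout=1)
        if p.is_alive():
            p.terminate()


if __name__ == '__main__':
    print(isolated_call("<p>hello</p>"))
```

The trade-off is a fresh process (and pickling the payload) on every call, which is exactly the overhead the revert removes.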
```diff
@@ -38,6 +38,9 @@ pytest tests/test_backend.py
 pytest tests/test_rss.py
 pytest tests/test_unique_lines.py
 
+# Try high concurrency
+FETCH_WORKERS=130 pytest tests/test_history_consistency.py -v -l
+
 # Check file:// will pickup a file when enabled
 echo "Hello world" > /tmp/test-file.txt
 ALLOW_FILE_URI=yes pytest tests/test_security.py
```
```diff
@@ -10,8 +10,8 @@ from urllib.parse import urlparse, parse_qs
 
 def test_consistent_history(client, live_server, measure_memory_usage):
     live_server_setup(live_server)
-
-    r = range(1, 30)
+    workers = int(os.getenv("FETCH_WORKERS", 10))
+    r = range(1, 10+workers)
 
     for one in r:
         test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)
```
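The `FETCH_WORKERS` variable documented in the test-running hunk above is what this test consumes to scale its watch count. A small sketch of the arithmetic, using the exact expressions from the diff:

```python
import os

# Defaults to 10 when FETCH_WORKERS is unset, as in the diff above
workers = int(os.getenv("FETCH_WORKERS", 10))
r = range(1, 10 + workers)

# Default run:        workers=10  -> range(1, 20)  -> 19 watches
# FETCH_WORKERS=130:  workers=130 -> range(1, 140) -> 139 watches
print(len(r))
```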
```diff
@@ -46,9 +46,10 @@ def test_consistent_history(client, live_server, measure_memory_usage):
 
     # assert the right amount of watches was found in the JSON
     assert len(json_obj['watching']) == len(r), "Correct number of watches was found in the JSON"
+    i=0
     # each one should have a history.txt containing just one line
     for w in json_obj['watching'].keys():
-
+        i+=1
         history_txt_index_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, 'history.txt')
         assert os.path.isfile(history_txt_index_file), f"History.txt should exist where I expect it at {history_txt_index_file}"
 
```
```diff
@@ -58,8 +59,8 @@ def test_consistent_history(client, live_server, measure_memory_usage):
         assert len(tmp_history) == 1, "History.txt should contain 1 line"
 
         # Should be two files,. the history.txt , and the snapshot.txt
-        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path,
-                                                     w))
+        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w))
+
         # Find the snapshot one
         for fname in files_in_watch_dir:
             if fname != 'history.txt' and 'html' not in fname:
@@ -75,7 +76,6 @@ def test_consistent_history(client, live_server, measure_memory_usage):
 
         assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
 
-
    json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
    with open(json_db_file, 'r') as f:
        assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"
```
```diff
@@ -126,18 +126,51 @@ def extract_UUID_from_client(client):
     uuid = m.group(1)
     return uuid.strip()
 
-def wait_for_all_checks(client):
-    # actually this is not entirely true, it can still be 'processing' but not in the queue
-    # Loop waiting until done..
-    attempt=0
-    # because sub-second rechecks are problematic in testing, use lots of delays
-    time.sleep(1)
-    while attempt < 60:
-        res = client.get(url_for("watchlist.index"))
-        if not b'Checking now' in res.data:
-            break
-        logging.getLogger().info(f"Waiting for watch-list to not say 'Checking now'.. {attempt}")
-        time.sleep(1)
+def wait_for_all_checks(client=None):
+    """
+    Waits until the queue is empty and remains empty for at least `required_empty_duration` seconds,
+    and also ensures no running threads have `current_uuid` set.
+    Retries for up to `max_attempts` times, sleeping `wait_between_attempts` seconds between checks.
+    """
+    from changedetectionio.flask_app import update_q as global_update_q, running_update_threads
+
+    # Configuration
+    attempt = 0
+    i=0
+    max_attempts = 60
+    wait_between_attempts = 2
+    required_empty_duration = 2
+
+    logger = logging.getLogger()
+    time.sleep(1.2)
+
+    empty_since = None
+
+    while attempt < max_attempts:
+        q_length = global_update_q.qsize()
+
+        # Check if any threads are still processing
+        time.sleep(1.2)
+        any_threads_busy = any(t.current_uuid for t in running_update_threads)
+
+        if q_length == 0 and not any_threads_busy:
+            if empty_since is None:
+                empty_since = time.time()
+                logger.info(f"Queue empty and no active threads at attempt {attempt}, starting empty timer...")
+            elif time.time() - empty_since >= required_empty_duration:
+                logger.info(f"Queue has been empty and threads idle for {required_empty_duration} seconds. Done waiting.")
+                break
+            else:
+                logger.info(f"Still waiting: queue empty and no active threads, but not yet {required_empty_duration} seconds...")
+        else:
+            if q_length != 0:
+                logger.info(f"Queue not empty (size={q_length}), resetting timer.")
+            if any_threads_busy:
+                busy_threads = [t.name for t in running_update_threads if t.current_uuid]
+                logger.info(f"Threads still busy: {busy_threads}, resetting timer.")
+            empty_since = None
         attempt += 1
 
     time.sleep(1)
```
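The rewritten `wait_for_all_checks` swaps scraping the watch list for 'Checking now' with a debounced quiescence check: the update queue must be empty *and* every worker thread idle, continuously, for `required_empty_duration` seconds before the wait ends, and any sign of activity resets the timer. Below is a generic sketch of that same debounce pattern, independent of changedetection.io's internals; `is_busy` here is a caller-supplied probe and `wait_until_quiescent` an illustrative name, not a project API.

```python
import time


def wait_until_quiescent(is_busy, required_empty_duration=2.0,
                         max_attempts=60, wait_between_attempts=1.0):
    """Return True once is_busy() has been False continuously for
    required_empty_duration seconds; False if max_attempts is exhausted."""
    empty_since = None
    for attempt in range(max_attempts):
        if not is_busy():
            if empty_since is None:
                empty_since = time.monotonic()  # first idle sample: start the timer
            elif time.monotonic() - empty_since >= required_empty_duration:
                return True  # idle long enough, done waiting
        else:
            empty_since = None  # activity reappeared: reset the timer
        time.sleep(wait_between_attempts)
    return False


if __name__ == '__main__':
    # Simulated workload that stays busy for 3 seconds, then goes idle
    deadline = time.monotonic() + 3
    print(wait_until_quiescent(lambda: time.monotonic() < deadline))
```

The reset-on-activity step is what distinguishes this from a single snapshot check: an item can be out of the queue but still inside a worker, and only a sustained idle window rules that out.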