mirror of https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-08 18:47:32 +00:00

Compare commits: browser-no...revert-mul (4 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 24b9e4dc83 |  |
|  | d481a6b7b1 |  |
|  | e38f264750 |  |
|  | 1ec86bd38d |  |
```diff
@@ -436,55 +436,27 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
     return re.sub(pattern, repl, html_content)
 
 
-def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
+# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
     from inscriptis import get_text
     from inscriptis.model.config import ParserConfig
 
-    """Converts html string to a string with just the text. If ignoring
-    rendering anchor tag content is enable, anchor tag content are also
-    included in the text
-
-    :param html_content: string with html content
-    :param render_anchor_tag_content: boolean flag indicating whether to extract
-    hyperlinks (the anchor tag content) together with text. This refers to the
-    'href' inside 'a' tags.
-    Anchor tag content is rendered in the following manner:
-    '[ text ](anchor tag content)'
-    :return: extracted text from the HTML
-    """
-    # if anchor tag content flag is set to True define a config for
-    # extracting this content
     if render_anchor_tag_content:
         parser_config = ParserConfig(
             annotation_rules={"a": ["hyperlink"]},
             display_links=True
         )
-    # otherwise set config to None/default
     else:
         parser_config = None
 
-    # RSS Mode - Inscriptis will treat `title` as something else.
-    # Make it as a regular block display element (//item/title)
-    # This is a bit of a hack - the real way it to use XSLT to convert it to HTML #1874
     if is_rss:
         html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
         html_content = re.sub(r'</title>', r'</h1>', html_content)
 
     text_content = get_text(html_content, config=parser_config)
-    conn.send(text_content)
-    conn.close()
-
-
-# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
-    from multiprocessing import Process, Pipe
-
-    parent_conn, child_conn = Pipe()
-    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
-    p.start()
-    text = parent_conn.recv()
-    p.join()
-    return text
+    return text_content
 
 
 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):
```
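The head branch here (`revert-mul...`) folds the `Process`/`Pipe` wrapper back into a single in-process function; the NOTE comment explains why the wrapper existed at all (containing a small native-library memory leak), and the new signature carries a `timeout=10` parameter. For context, below is a minimal, self-contained sketch of that worker-isolation pattern with the `poll()` timeout guard the removed code did not have: a bare `parent_conn.recv()` blocks forever if the worker dies before sending, which is one reason such wrappers are fragile. `_worker` and `isolated_call` are illustrative names, not code from this repository.

```python
# Sketch only: worker isolation via multiprocessing, with a timeout guard.
from multiprocessing import Process, Pipe


def _worker(conn, html_content):
    # Stand-in for the leak-prone native-library call (e.g. inscriptis get_text)
    conn.send(html_content.upper())
    conn.close()


def isolated_call(html_content: str, timeout: int = 10) -> str:
    parent_conn, child_conn = Pipe()
    p = Process(target=_worker, args=(child_conn, html_content))
    p.start()
    try:
        # poll() returns False if nothing arrives within `timeout` seconds,
        # instead of hanging like a bare recv() would
        if not parent_conn.poll(timeout):
            raise TimeoutError(f"worker produced no result within {timeout}s")
        return parent_conn.recv()
    finally:
        p.join(timeout=1)
        if p.is_alive():
            p.terminate()


if __name__ == '__main__':
    print(isolated_call("<p>hello</p>"))
```

The trade-off is a fresh process (and pickling the payload) on every call, which is exactly the overhead the revert removes.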
```diff
@@ -38,6 +38,9 @@ pytest tests/test_backend.py
 pytest tests/test_rss.py
 pytest tests/test_unique_lines.py
 
+# Try high concurrency
+FETCH_WORKERS=130 pytest tests/test_history_consistency.py -v -l
+
 # Check file:// will pickup a file when enabled
 echo "Hello world" > /tmp/test-file.txt
 ALLOW_FILE_URI=yes pytest tests/test_security.py
```
```diff
@@ -10,8 +10,8 @@ from urllib.parse import urlparse, parse_qs
 
 def test_consistent_history(client, live_server, measure_memory_usage):
     live_server_setup(live_server)
-
-    r = range(1, 30)
+    workers = int(os.getenv("FETCH_WORKERS", 10))
+    r = range(1, 10+workers)
 
     for one in r:
         test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)
```
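The `FETCH_WORKERS` variable documented in the test-running hunk above is what this test consumes to scale its watch count. A small sketch of the arithmetic, using the exact expressions from the diff:

```python
import os

# Defaults to 10 when FETCH_WORKERS is unset, as in the diff above
workers = int(os.getenv("FETCH_WORKERS", 10))
r = range(1, 10 + workers)

# Default run:        workers=10  -> range(1, 20)  -> 19 watches
# FETCH_WORKERS=130:  workers=130 -> range(1, 140) -> 139 watches
print(len(r))
```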
```diff
@@ -46,9 +46,10 @@ def test_consistent_history(client, live_server, measure_memory_usage):
 
     # assert the right amount of watches was found in the JSON
     assert len(json_obj['watching']) == len(r), "Correct number of watches was found in the JSON"
+    i=0
     # each one should have a history.txt containing just one line
     for w in json_obj['watching'].keys():
-
+        i+=1
         history_txt_index_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, 'history.txt')
         assert os.path.isfile(history_txt_index_file), f"History.txt should exist where I expect it at {history_txt_index_file}"
 
```
```diff
@@ -58,8 +59,8 @@ def test_consistent_history(client, live_server, measure_memory_usage):
         assert len(tmp_history) == 1, "History.txt should contain 1 line"
 
         # Should be two files,. the history.txt , and the snapshot.txt
-        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path,
-                                                     w))
+        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w))
+
         # Find the snapshot one
         for fname in files_in_watch_dir:
             if fname != 'history.txt' and 'html' not in fname:
@@ -75,7 +76,6 @@ def test_consistent_history(client, live_server, measure_memory_usage):
 
         assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
 
-
    json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
    with open(json_db_file, 'r') as f:
        assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"
```
```diff
@@ -126,18 +126,51 @@ def extract_UUID_from_client(client):
     uuid = m.group(1)
     return uuid.strip()
 
-def wait_for_all_checks(client):
-    # actually this is not entirely true, it can still be 'processing' but not in the queue
-    # Loop waiting until done..
-    attempt=0
-    # because sub-second rechecks are problematic in testing, use lots of delays
-    time.sleep(1)
-    while attempt < 60:
-        res = client.get(url_for("watchlist.index"))
-        if not b'Checking now' in res.data:
-            break
-        logging.getLogger().info(f"Waiting for watch-list to not say 'Checking now'.. {attempt}")
-        time.sleep(1)
+def wait_for_all_checks(client=None):
+    """
+    Waits until the queue is empty and remains empty for at least `required_empty_duration` seconds,
+    and also ensures no running threads have `current_uuid` set.
+    Retries for up to `max_attempts` times, sleeping `wait_between_attempts` seconds between checks.
+    """
+    from changedetectionio.flask_app import update_q as global_update_q, running_update_threads
+
+    # Configuration
+    attempt = 0
+    i=0
+    max_attempts = 60
+    wait_between_attempts = 2
+    required_empty_duration = 2
+
+    logger = logging.getLogger()
+    time.sleep(1.2)
+
+    empty_since = None
+
+    while attempt < max_attempts:
+        q_length = global_update_q.qsize()
+
+        # Check if any threads are still processing
+        time.sleep(1.2)
+        any_threads_busy = any(t.current_uuid for t in running_update_threads)
+
+        if q_length == 0 and not any_threads_busy:
+            if empty_since is None:
+                empty_since = time.time()
+                logger.info(f"Queue empty and no active threads at attempt {attempt}, starting empty timer...")
+            elif time.time() - empty_since >= required_empty_duration:
+                logger.info(f"Queue has been empty and threads idle for {required_empty_duration} seconds. Done waiting.")
+                break
+            else:
+                logger.info(f"Still waiting: queue empty and no active threads, but not yet {required_empty_duration} seconds...")
+        else:
+            if q_length != 0:
+                logger.info(f"Queue not empty (size={q_length}), resetting timer.")
+            if any_threads_busy:
+                busy_threads = [t.name for t in running_update_threads if t.current_uuid]
+                logger.info(f"Threads still busy: {busy_threads}, resetting timer.")
+            empty_since = None
         attempt += 1
 
     time.sleep(1)
```
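The rewritten `wait_for_all_checks` swaps scraping the watch list for 'Checking now' with a debounced quiescence check: the update queue must be empty *and* every worker thread idle, continuously, for `required_empty_duration` seconds before the wait ends, and any sign of activity resets the timer. Below is a generic sketch of that same debounce pattern, independent of changedetection.io's internals; `is_busy` here is a caller-supplied probe and `wait_until_quiescent` an illustrative name, not a project API.

```python
import time


def wait_until_quiescent(is_busy, required_empty_duration=2.0,
                         max_attempts=60, wait_between_attempts=1.0):
    """Return True once is_busy() has been False continuously for
    required_empty_duration seconds; False if max_attempts is exhausted."""
    empty_since = None
    for attempt in range(max_attempts):
        if not is_busy():
            if empty_since is None:
                empty_since = time.monotonic()  # first idle sample: start the timer
            elif time.monotonic() - empty_since >= required_empty_duration:
                return True  # idle long enough, done waiting
        else:
            empty_since = None  # activity reappeared: reset the timer
        time.sleep(wait_between_attempts)
    return False


if __name__ == '__main__':
    # Simulated workload that stays busy for 3 seconds, then goes idle
    deadline = time.monotonic() + 3
    print(wait_until_quiescent(lambda: time.monotonic() < deadline))
```

The reset-on-activity step is what distinguishes this from a single snapshot check: an item can be out of the queue but still inside a worker, and only a sustained idle window rules that out.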