mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-13 04:56:39 +00:00
Compare commits
4 Commits
0.49.13
...
playwright
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3d2bc5049b | ||
|
|
186016e605 | ||
|
|
3a583a4e5d | ||
|
|
cfb4decf67 |
@@ -19,6 +19,20 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
|
||||
if tag_uuid in watch.get('tags', []) and (tag.get('include_filters') or tag.get('subtractive_selectors')):
|
||||
return True
|
||||
|
||||
def levenshtein_ratio_recent_history(watch):
|
||||
try:
|
||||
from Levenshtein import ratio, distance
|
||||
k = list(watch.history.keys())
|
||||
if len(k) >= 2:
|
||||
a = watch.get_history_snapshot(timestamp=k[0])
|
||||
b = watch.get_history_snapshot(timestamp=k[1])
|
||||
distance = distance(a, b)
|
||||
return distance
|
||||
except Exception as e:
|
||||
logger.warning("Unable to calc similarity", e)
|
||||
return "Unable to calc similarity"
|
||||
return ''
|
||||
|
||||
@edit_blueprint.route("/edit/<string:uuid>", methods=['GET', 'POST'])
|
||||
@login_optionally_required
|
||||
# https://stackoverflow.com/questions/42984453/wtforms-populate-form-with-data-if-data-exists
|
||||
@@ -247,14 +261,15 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
|
||||
'has_default_notification_urls': True if len(datastore.data['settings']['application']['notification_urls']) else False,
|
||||
'has_extra_headers_file': len(datastore.get_all_headers_in_textfile_for_watch(uuid=uuid)) > 0,
|
||||
'has_special_tag_options': _watch_has_tag_options_set(watch=watch),
|
||||
'watch_uses_webdriver': watch_uses_webdriver,
|
||||
'jq_support': jq_support,
|
||||
'lev_info': levenshtein_ratio_recent_history(watch),
|
||||
'playwright_enabled': os.getenv('PLAYWRIGHT_DRIVER_URL', False),
|
||||
'settings_application': datastore.data['settings']['application'],
|
||||
'timezone_default_config': datastore.data['settings']['application'].get('timezone'),
|
||||
'using_global_webdriver_wait': not default['webdriver_delay'],
|
||||
'uuid': uuid,
|
||||
'watch': watch
|
||||
'watch': watch,
|
||||
'watch_uses_webdriver': watch_uses_webdriver,
|
||||
}
|
||||
|
||||
included_content = None
|
||||
|
||||
@@ -96,7 +96,7 @@ def execute_ruleset_against_all_plugins(current_watch_uuid: str, application_dat
|
||||
|
||||
ruleset_settings = application_datastruct['watching'].get(current_watch_uuid)
|
||||
|
||||
if ruleset_settings.get("conditions"):
|
||||
if ruleset_settings and ruleset_settings.get("conditions"):
|
||||
logic_operator = "and" if ruleset_settings.get("conditions_match_logic", "ALL") == "ALL" else "or"
|
||||
complete_rules = filter_complete_rules(ruleset_settings['conditions'])
|
||||
if complete_rules:
|
||||
|
||||
@@ -26,9 +26,11 @@ def capture_full_page(page):
|
||||
step_size = SCREENSHOT_SIZE_STITCH_THRESHOLD # Size that won't cause GPU to overflow
|
||||
screenshot_chunks = []
|
||||
y = 0
|
||||
|
||||
# If page height is larger than current viewport, use a larger viewport for better capturing
|
||||
|
||||
if page_height > page.viewport_size['height']:
|
||||
if page_height < step_size:
|
||||
step_size = page_height # Incase page is bigger than default viewport but smaller than proposed step size
|
||||
logger.debug(f"Setting bigger viewport to step through large page width W{page.viewport_size['width']}xH{step_size} because page_height > viewport_size")
|
||||
# Set viewport to a larger size to capture more content at once
|
||||
page.set_viewport_size({'width': page.viewport_size['width'], 'height': step_size})
|
||||
|
||||
@@ -59,7 +61,10 @@ def capture_full_page(page):
|
||||
p.join()
|
||||
logger.debug(
|
||||
f"Screenshot (chunked/stitched) - Page height: {page_height} Capture height: {SCREENSHOT_MAX_TOTAL_HEIGHT} - Stitched together in {time.time() - start:.2f}s")
|
||||
|
||||
# Explicit cleanup
|
||||
del screenshot_chunks
|
||||
del p
|
||||
del parent_conn, child_conn
|
||||
screenshot_chunks = None
|
||||
return screenshot
|
||||
|
||||
@@ -286,12 +291,28 @@ class fetcher(Fetcher):
|
||||
pass
|
||||
|
||||
# Clean up resources properly
|
||||
context.close()
|
||||
context = None
|
||||
try:
|
||||
self.page.request_gc()
|
||||
except:
|
||||
pass
|
||||
|
||||
self.page.close()
|
||||
try:
|
||||
self.page.close()
|
||||
except:
|
||||
pass
|
||||
self.page = None
|
||||
|
||||
browser.close()
|
||||
borwser = None
|
||||
try:
|
||||
context.close()
|
||||
except:
|
||||
pass
|
||||
context = None
|
||||
|
||||
try:
|
||||
browser.close()
|
||||
except:
|
||||
pass
|
||||
browser = None
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -46,9 +46,10 @@ async def capture_full_page(page):
|
||||
screenshot_chunks = []
|
||||
y = 0
|
||||
if page_height > page.viewport['height']:
|
||||
if page_height < step_size:
|
||||
step_size = page_height # Incase page is bigger than default viewport but smaller than proposed step size
|
||||
await page.setViewport({'width': page.viewport['width'], 'height': step_size})
|
||||
|
||||
|
||||
while y < min(page_height, SCREENSHOT_MAX_TOTAL_HEIGHT):
|
||||
await page.evaluate(f"window.scrollTo(0, {y})")
|
||||
screenshot_chunks.append(await page.screenshot(type_='jpeg',
|
||||
|
||||
@@ -435,7 +435,9 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
|
||||
|
||||
return re.sub(pattern, repl, html_content)
|
||||
|
||||
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False) -> str:
|
||||
|
||||
def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
|
||||
|
||||
from inscriptis import get_text
|
||||
from inscriptis.model.config import ParserConfig
|
||||
|
||||
@@ -470,9 +472,19 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
|
||||
html_content = re.sub(r'</title>', r'</h1>', html_content)
|
||||
|
||||
text_content = get_text(html_content, config=parser_config)
|
||||
conn.send(text_content)
|
||||
conn.close()
|
||||
|
||||
return text_content
|
||||
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
|
||||
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
|
||||
from multiprocessing import Process, Pipe
|
||||
|
||||
parent_conn, child_conn = Pipe()
|
||||
p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
|
||||
p.start()
|
||||
text = parent_conn.recv()
|
||||
p.join()
|
||||
return text
|
||||
|
||||
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
|
||||
def has_ldjson_product_info(content):
|
||||
|
||||
@@ -443,6 +443,10 @@ Math: {{ 1 + 1 }}") }}
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<h4>Text similarity</h4>
|
||||
<p><strong>Levenshtein Distance</strong> - Last 2 snapshots: {{ lev_info }}</p>
|
||||
<p style="max-width: 80%; font-size: 80%"><strong>Levenshtein Distance</strong> Calculates the minimum number of insertions, deletions, and substitutions required to change one text into the other.</p>
|
||||
{% if watch.history_n %}
|
||||
<p>
|
||||
<a href="{{url_for('ui.ui_edit.watch_get_latest_html', uuid=uuid)}}" class="pure-button button-small">Download latest HTML snapshot</a>
|
||||
|
||||
@@ -74,6 +74,11 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
|
||||
res = client.get(url_for("ui.ui_edit.watch_get_latest_html", uuid=uuid))
|
||||
assert b'which has this one new line' in res.data
|
||||
|
||||
# Check the 'levenshtein' distance calc showed something useful
|
||||
res = client.get(url_for("ui.ui_edit.edit_page", uuid=uuid))
|
||||
assert b'Last 2 snapshots: 17' in res.data
|
||||
|
||||
|
||||
# Now something should be ready, indicated by having a 'unviewed' class
|
||||
res = client.get(url_for("watchlist.index"))
|
||||
assert b'unviewed' in res.data
|
||||
|
||||
@@ -68,6 +68,8 @@ openpyxl
|
||||
jq~=1.3; python_version >= "3.8" and sys_platform == "darwin"
|
||||
jq~=1.3; python_version >= "3.8" and sys_platform == "linux"
|
||||
|
||||
levenshtein
|
||||
|
||||
# playwright is installed at Dockerfile build time because it's not available on all platforms
|
||||
|
||||
pyppeteer-ng==2.0.0rc9
|
||||
|
||||
Reference in New Issue
Block a user