#!/usr/bin/env python3
"""Tests for subtractive selectors (element removal).

Covers the ``element_removal`` helper directly, the ``subtractive_selectors``
field of the watch edit form, and positional (nth-child / indexed XPath)
selectors applied to several table columns at once.
"""

# NOTE(review): the fixture text in this file contains no HTML tags even though
# the selectors used below target elements (header/footer/nav, table rows and
# cells). Confirm the literals against version control before relying on
# these tests.

import time
import os

from flask import url_for

from ..html_tools import *
from .util import live_server_setup, wait_for_all_checks, delete_all_watches


def _write_endpoint_content(datastore_path, content):
    """Write *content* to ``endpoint-content.txt`` inside *datastore_path*."""
    target = os.path.join(datastore_path, "endpoint-content.txt")
    with open(target, "w", encoding="utf-8") as f:
        f.write(content)


def set_response_with_multiple_index(datastore_path):
    """Write the three-column "Person" fixture used by the nth-offset test.

    Args:
        datastore_path: Directory in which ``endpoint-content.txt`` is written.
    """
    data = """
Person 1 Person 2 Person 3
Emil Tobias Linus
16 14 10
"""
    _write_endpoint_content(datastore_path, data)


def set_original_response(datastore_path):
    """Write the initial version of the endpoint fixture.

    Args:
        datastore_path: Directory in which ``endpoint-content.txt`` is written.
    """
    test_return_data = """

Header

Some initial text

Which is across multiple lines


So let's see what happens.
Some text that will change
"""
    _write_endpoint_content(datastore_path, test_return_data)


def set_modified_response(datastore_path):
    """Write the changed version of the endpoint fixture.

    Compared with ``set_original_response`` only the "Header" line and the
    final "Some text that ..." line differ.

    Args:
        datastore_path: Directory in which ``endpoint-content.txt`` is written.
    """
    test_return_data = """

Header changed

Some initial text

Which is across multiple lines


So let's see what happens.
Some text that changes
"""
    _write_endpoint_content(datastore_path, test_return_data)


def test_element_removal_output():
    """Check the text rendered after ``element_removal`` strips its selectors."""
    from inscriptis import get_text

    # Check text with sub-parts renders correctly
    content = """

Header

Some initial text

across multiple lines

Some text that changes
Some text should be matched by xPath // selector
Some text should be matched by xPath selector
Some text should be matched by xPath1 selector
"""
    # A mix of CSS selectors, a bare XPath expression, and the explicit
    # "xpath:" / "xpath1:" prefixed forms.
    selectors = [
        "header",
        "footer",
        "nav",
        "#changetext",
        "//*[contains(text(), 'xPath // selector')]",
        "xpath://*[contains(text(), 'xPath selector')]",
        "xpath1://*[contains(text(), 'xPath1 selector')]",
    ]
    html_blob = element_removal(selectors, html_content=content)
    text = get_text(html_blob)
    assert text == """Some initial text across multiple lines """


def test_element_removal_full(client, live_server, measure_memory_usage, datastore_path):
    """Changes confined to removed elements must not flag the watch as unviewed.

    The arguments are presumably pytest fixtures supplied by the project's
    conftest; ``live_server`` and ``measure_memory_usage`` are not referenced
    in the body.
    """
    set_original_response(datastore_path=datastore_path)

    # Add our URL to the import page
    test_url = url_for("test_endpoint", _external=True)
    res = client.post(
        url_for("imports.import_page"),
        data={"urls": test_url},
        follow_redirects=True,
    )
    assert b"1 Imported" in res.data
    wait_for_all_checks(client)

    # Goto the edit page, add the filter data
    # Not sure why \r needs to be added - absent of the #changetext this is not necessary
    subtractive_selectors_data = "header\r\nfooter\r\nnav\r\n#changetext"
    res = client.post(
        url_for("ui.ui_edit.edit_page", uuid="first"),
        data={
            "subtractive_selectors": subtractive_selectors_data,
            "url": test_url,
            "tags": "",
            "headers": "",
            "fetch_backend": "html_requests",
            "time_between_check_use_default": "y",
        },
        follow_redirects=True,
    )
    assert b"Updated watch." in res.data
    wait_for_all_checks(client)

    # Check it saved
    res = client.get(
        url_for("ui.ui_edit.edit_page", uuid="first"),
    )
    # str.encode() already returns bytes, no extra bytes() wrapper needed.
    assert subtractive_selectors_data.encode("utf-8") in res.data

    # Trigger a check
    res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
    assert b'Queued 1 watch for rechecking.' in res.data
    wait_for_all_checks(client)

    # so that we set the state to 'has-unread-changes' after all the edits
    client.get(url_for("ui.ui_views.diff_history_page", uuid="first"))

    # Make a change to header/footer/nav
    set_modified_response(datastore_path=datastore_path)

    # Trigger a check
    res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
    assert b'Queued 1 watch for rechecking.' in res.data

    # Give the thread time to pick it up
    wait_for_all_checks(client)

    # There should not be an unviewed change, as changes should be removed
    res = client.get(url_for("watchlist.index"))
    assert b"unviewed" not in res.data


# Re #2752
def test_element_removal_nth_offset_no_shift(client, live_server, measure_memory_usage, datastore_path):
    """Removing columns 2 and 3 by position must leave column 1 in place.

    The same six cells are targeted twice, once with CSS ``nth-child``
    selectors and once with indexed XPath expressions; each variant is
    checked against a freshly created watch.
    """
    set_response_with_multiple_index(datastore_path=datastore_path)

    # One selector per line: each entry is passed through .splitlines() below.
    subtractive_selectors_data = [
        ### css style ###
        """body > table > tr:nth-child(1) > th:nth-child(2)
body > table > tr:nth-child(2) > td:nth-child(2)
body > table > tr:nth-child(3) > td:nth-child(2)
body > table > tr:nth-child(1) > th:nth-child(3)
body > table > tr:nth-child(2) > td:nth-child(3)
body > table > tr:nth-child(3) > td:nth-child(3)""",
        ### second type, xpath ###
        """//body/table/tr[1]/th[2]
//body/table/tr[2]/td[2]
//body/table/tr[3]/td[2]
//body/table/tr[1]/th[3]
//body/table/tr[2]/td[3]
//body/table/tr[3]/td[3]""",
    ]

    test_url = url_for("test_endpoint", _external=True)

    for selector_list in subtractive_selectors_data:
        # Start from a clean slate so "first" refers to the watch added here.
        delete_all_watches(client)

        client.application.config.get('DATASTORE').add_watch(
            url=test_url,
            extras={"subtractive_selectors": selector_list.splitlines()},
        )
        client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
        wait_for_all_checks(client)

        res = client.get(
            url_for("ui.ui_views.preview_page", uuid="first"),
            follow_redirects=True,
        )

        # the filters above should have removed this but they never say to remove the "emil" column
        assert b"Tobias" not in res.data
        assert b"Linus" not in res.data
        assert b"Person 2" not in res.data
        assert b"Person 3" not in res.data

        # First column should exist
        assert b"Emil" in res.data