Compare commits

...

3 Commits

Author SHA1 Message Date
dgtlmoon
dc33d49840 best to not let it process this 2024-02-02 09:52:44 +01:00
dgtlmoon
c30f96c4cd update test 2024-02-02 09:45:09 +01:00
dgtlmoon
c8310b7e93 Updating inscriptis library, removing fixes from 2.2 2024-02-02 09:28:24 +01:00
4 changed files with 5 additions and 23 deletions

View File

@@ -409,23 +409,6 @@ def has_ldjson_product_info(content):
x=bool(pricing_data)
return x
def workarounds_for_obfuscations(content):
    """
    Strip whitespace-only HTML comments that sites inject to obfuscate text.

    Some sites are using sneaky tactics to make prices and other information
    un-renderable by Inscriptis, e.g. HomeDepot.com style
    ``<span>$<!-- -->90<!-- -->.<!-- -->74</span>``.
    See https://github.com/weblyzard/inscriptis/issues/45

    This could go into its own Pip package in the future, for faster updates.

    :param content: HTML markup as a string; may be empty or None.
    :return: ``content`` with every ``<!-- -->`` style comment (a comment whose
        body is one or more whitespace characters) removed. Falsy input is
        returned unchanged.
    """
    if not content:
        return content

    # Raw string so that "\s" reaches the regex engine as a whitespace class
    # instead of being treated as an (invalid) string escape sequence.
    # Comments with real text, and the empty "<!---->", are left alone.
    return re.sub(r'<!--\s+-->', '', content)
def get_triggered_text(content, trigger_text):
triggered_text = []
result = strip_ignore_text(content=content,

View File

@@ -151,7 +151,6 @@ class perform_site_check(difference_detection_processor):
if is_html or watch.is_source_type_url:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
html_content = self.fetcher.content
# If not JSON, and if it's not text/plain..

View File

@@ -2,7 +2,7 @@
import time
from flask import url_for
from .util import live_server_setup
from .util import live_server_setup, wait_for_all_checks
def set_original_ignore_response():
@@ -21,7 +21,7 @@ def set_original_ignore_response():
def test_obfuscations(client, live_server):
set_original_ignore_response()
live_server_setup(live_server)
time.sleep(1)
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
res = client.post(
@@ -32,12 +32,12 @@ def test_obfuscations(client, live_server):
assert b"1 Imported" in res.data
# Give the thread time to pick it up
time.sleep(3)
wait_for_all_checks(client)
# Check HTML conversion detected and worked
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
# whitespace appears but it renders https://github.com/weblyzard/inscriptis/issues/45#issuecomment-1923339265
assert b'$90.74' in res.data

View File

@@ -8,7 +8,7 @@ flask_expects_json~=1.7
flask_restful
flask_wtf~=1.2
flask~=2.3
inscriptis~=2.2
inscriptis~=2.4
pytz
timeago~=1.0
validators~=0.21