mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-04-29 22:37:09 +00:00
WIP
This commit is contained in:
@@ -10,6 +10,7 @@ import re
|
||||
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
|
||||
TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
|
||||
|
||||
PERL_STYLE_REGEX = r'^/(.*?)/([a-z]?)?$'
|
||||
# 'price' , 'lowPrice', 'highPrice' are usually under here
|
||||
# all of those may or may not appear on different websites
|
||||
LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
|
||||
@@ -17,7 +18,21 @@ LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
|
||||
class JSONNotFound(ValueError):
    """A ``ValueError`` subclass that carries a single message.

    NOTE(review): the name suggests it signals that no JSON could be located
    in some content; confirm against the raise sites, which are outside this
    view.
    """

    def __init__(self, msg):
        # Delegate to ValueError so str(exc) and exc.args carry the message.
        super().__init__(msg)
|
||||
|
||||
|
||||
|
||||
# Doesn't look like python supports forward slash auto enclosure in re.findall
|
||||
# So convert it to inline flag "(?i)foobar" type configuration
|
||||
def perl_style_slash_enclosed_regex_to_options(regex):
    """Rewrite a ``/pattern/flag`` string as a Python inline-flag regex.

    Args:
        regex: A bare pattern, or one enclosed in forward slashes with an
            optional trailing flag as described by ``PERL_STYLE_REGEX``.

    Returns:
        The pattern prefixed with ``(?<flag>)``. When the input is not
        slash-enclosed, or carries no flag, ``(?i)`` is used.
    """
    matched = re.search(PERL_STYLE_REGEX, regex, re.IGNORECASE)
    if matched is None:
        # Not slash-enclosed: keep the text as-is, case-insensitive.
        return f"(?i){regex}"

    pattern_body, flag_chars = matched.group(1), matched.group(2)
    # An empty or missing flag falls back to case-insensitive matching.
    return f"(?{flag_chars or 'i'}){pattern_body}"
|
||||
|
||||
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
|
||||
def include_filters(include_filters, html_content, append_pretty_line_formatting=False):
|
||||
soup = BeautifulSoup(html_content, "html.parser")
|
||||
@@ -195,23 +210,14 @@ def strip_ignore_text(content, wordlist, mode="content"):
|
||||
output = []
|
||||
ignore_text = []
|
||||
ignore_regex = []
|
||||
|
||||
ignored_line_numbers = []
|
||||
|
||||
for k in wordlist:
|
||||
# Is it a regex?
|
||||
x = re.search('^\/(.*)\/(.*)', k.strip())
|
||||
if x:
|
||||
# Starts with / but doesn't look like a regex
|
||||
p = x.group(1)
|
||||
try:
|
||||
# @Todo python regex options can go before the regex str, but not really many of the options apply on a per-line basis
|
||||
ignore_regex.append(re.compile(rf"{p}", re.IGNORECASE))
|
||||
except Exception as e:
|
||||
# Badly formed regex, treat as text
|
||||
ignore_text.append(k.strip())
|
||||
res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE)
|
||||
if res:
|
||||
ignore_regex.append(re.compile(perl_style_slash_enclosed_regex_to_options(k)))
|
||||
else:
|
||||
# Had a / but doesn't work as regex
|
||||
ignore_text.append(k.strip())
|
||||
|
||||
for line in content.splitlines():
|
||||
|
||||
@@ -37,19 +37,6 @@ class perform_site_check(difference_detection_processor):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.datastore = datastore
|
||||
|
||||
# Doesn't look like python supports forward slash auto enclosure in re.findall
|
||||
# So convert it to inline flag "(?i)foobar" type configuration
|
||||
def forward_slash_enclosed_regex_to_options(self, regex):
    """Translate a Perl-style ``/pattern/flags`` string into a Python regex.

    Python's ``re`` functions do not understand the slash-enclosed form, so
    the trailing flags are moved into an inline ``(?flags)`` prefix.

    Args:
        regex: Either a bare pattern or a ``/pattern/flags`` string.

    Returns:
        The pattern prefixed with an inline flag group. A bare pattern, or
        a slash-enclosed one with no trailing flags, gets ``(?i)``.
    """
    # The flags part is optional so that "/foo/" is unwrapped too, rather
    # than being matched literally with its enclosing slashes.
    res = re.search(r'^/(.*?)/(\w*)$', regex, re.IGNORECASE)
    if res is None:
        return f"(?i){regex}"

    flags = res.group(2) or 'i'
    return f"(?{flags}){res.group(1)}"
|
||||
|
||||
def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None):
|
||||
changed_detected = False
|
||||
screenshot = False # as bytes
|
||||
@@ -340,7 +327,7 @@ class perform_site_check(difference_detection_processor):
|
||||
regex_matched_output = []
|
||||
for s_re in extract_text:
|
||||
# in case they specified something in '/.../x'
|
||||
regex = self.forward_slash_enclosed_regex_to_options(s_re)
|
||||
regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re)
|
||||
result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
|
||||
|
||||
for l in result:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
import time
|
||||
from flask import url_for
|
||||
from . util import live_server_setup
|
||||
from .util import live_server_setup, wait_for_all_checks
|
||||
|
||||
|
||||
def set_original_ignore_response():
|
||||
@@ -26,13 +26,8 @@ def test_trigger_regex_functionality(client, live_server):
|
||||
|
||||
live_server_setup(live_server)
|
||||
|
||||
sleep_time_for_fetch_thread = 3
|
||||
|
||||
set_original_ignore_response()
|
||||
|
||||
# Give the endpoint time to spin up
|
||||
time.sleep(1)
|
||||
|
||||
# Add our URL to the import page
|
||||
test_url = url_for('test_endpoint', _external=True)
|
||||
res = client.post(
|
||||
@@ -43,7 +38,7 @@ def test_trigger_regex_functionality(client, live_server):
|
||||
assert b"1 Imported" in res.data
|
||||
|
||||
# Give the thread time to pick it up
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
wait_for_all_checks(client)
|
||||
|
||||
# It should report nothing found (just a new one shouldn't have anything)
|
||||
res = client.get(url_for("index"))
|
||||
@@ -57,7 +52,7 @@ def test_trigger_regex_functionality(client, live_server):
|
||||
"fetch_backend": "html_requests"},
|
||||
follow_redirects=True
|
||||
)
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
wait_for_all_checks(client)
|
||||
# so that we set the state to 'unviewed' after all the edits
|
||||
client.get(url_for("diff_history_page", uuid="first"))
|
||||
|
||||
@@ -65,7 +60,7 @@ def test_trigger_regex_functionality(client, live_server):
|
||||
f.write("some new noise")
|
||||
|
||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
wait_for_all_checks(client)
|
||||
|
||||
# It should report nothing found (nothing should match the regex)
|
||||
res = client.get(url_for("index"))
|
||||
@@ -75,7 +70,7 @@ def test_trigger_regex_functionality(client, live_server):
|
||||
f.write("regex test123<br>\nsomething 123")
|
||||
|
||||
client.get(url_for("form_watch_checknow"), follow_redirects=True)
|
||||
time.sleep(sleep_time_for_fetch_thread)
|
||||
wait_for_all_checks(client)
|
||||
res = client.get(url_for("index"))
|
||||
assert b'unviewed' in res.data
|
||||
|
||||
|
||||
Reference in New Issue
Block a user