Compare commits

..

3 Commits

Author SHA1 Message Date
dgtlmoon
9111710c83 undo change 2023-09-23 17:21:35 +02:00
dgtlmoon
ce1d4b039c Also make user-agent detect string case insensitive 2023-09-23 16:36:14 +02:00
dgtlmoon
e1f19a3265 Dont set user agent default if none is set 2023-09-23 14:34:56 +02:00
20 changed files with 115 additions and 227 deletions

View File

@@ -30,11 +30,11 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v4
uses: actions/checkout@v2
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v2
uses: github/codeql-action/init@v1
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
@@ -45,7 +45,7 @@ jobs:
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
uses: github/codeql-action/autobuild@v2
uses: github/codeql-action/autobuild@v1
# Command-line programs to run using the OS shell.
# 📚 https://git.io/JvXDl
@@ -59,4 +59,4 @@ jobs:
# make release
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v2
uses: github/codeql-action/analyze@v1

View File

@@ -39,9 +39,9 @@ jobs:
# Or if we are in a tagged release scenario.
if: ${{ github.event.workflow_run.conclusion == 'success' }} || ${{ github.event.release.tag_name }} != ''
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v2
- name: Set up Python 3.9
uses: actions/setup-python@v4
uses: actions/setup-python@v2
with:
python-version: 3.9
@@ -58,27 +58,27 @@ jobs:
echo ${{ github.ref }} > changedetectionio/tag.txt
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
uses: docker/setup-qemu-action@v1
with:
image: tonistiigi/binfmt:latest
platforms: all
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
uses: docker/login-action@v1
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Login to Docker Hub Container Registry
uses: docker/login-action@v3
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKER_HUB_USERNAME }}
password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@v1
with:
install: true
version: latest
@@ -88,7 +88,7 @@ jobs:
- name: Build and push :dev
id: docker_build
if: ${{ github.ref }} == "refs/heads/master"
uses: docker/build-push-action@v5
uses: docker/build-push-action@v2
with:
context: ./
file: ./Dockerfile
@@ -105,7 +105,7 @@ jobs:
- name: Build and push :tag
id: docker_build_tag_release
if: github.event_name == 'release' && startsWith(github.event.release.tag_name, '0.')
uses: docker/build-push-action@v5
uses: docker/build-push-action@v2
with:
context: ./
file: ./Dockerfile
@@ -125,7 +125,7 @@ jobs:
run: echo step SHA ${{ steps.vars.outputs.sha_short }} tag ${{steps.vars.outputs.tag}} branch ${{steps.vars.outputs.branch}} digest ${{ steps.docker_build.outputs.digest }}
- name: Cache Docker layers
uses: actions/cache@v3
uses: actions/cache@v2
with:
path: /tmp/.buildx-cache
key: ${{ runner.os }}-buildx-${{ github.sha }}

View File

@@ -24,22 +24,22 @@ jobs:
test-container-build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v2
- name: Set up Python 3.9
uses: actions/setup-python@v4
uses: actions/setup-python@v2
with:
python-version: 3.9
# Just test that the build works, some libraries won't compile on ARM/rPi etc
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
uses: docker/setup-qemu-action@v1
with:
image: tonistiigi/binfmt:latest
platforms: all
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@v1
with:
install: true
version: latest
@@ -49,7 +49,7 @@ jobs:
# Check we can still build under alpine/musl
- name: Test that the docker containers can build (musl via alpine check)
id: docker_build_musl
uses: docker/build-push-action@v5
uses: docker/build-push-action@v2
with:
context: ./
file: ./.github/test/Dockerfile-alpine
@@ -57,7 +57,7 @@ jobs:
- name: Test that the docker containers can build
id: docker_build
uses: docker/build-push-action@v5
uses: docker/build-push-action@v2
# https://github.com/docker/build-push-action#customizing
with:
context: ./

View File

@@ -7,11 +7,11 @@ jobs:
test-application:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v2
# Mainly just for link/flake8
- name: Set up Python 3.10
uses: actions/setup-python@v4
uses: actions/setup-python@v2
with:
python-version: '3.10'

View File

@@ -11,10 +11,10 @@ jobs:
test-pip-build-basics:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v2
- name: Set up Python 3.9
uses: actions/setup-python@v4
uses: actions/setup-python@v2
with:
python-version: 3.9

View File

@@ -57,11 +57,9 @@ def construct_blueprint(datastore: ChangeDetectionStore):
status.update({'status': 'ERROR OTHER', 'length': len(contents), 'text': f"Got empty reply with code {e.status_code} - Access denied"})
else:
status.update({'status': 'ERROR OTHER', 'length': len(contents) if contents else 0, 'text': f"Empty reply with code {e.status_code}, needs chrome?"})
except content_fetcher.ReplyWithContentButNoText as e:
txt = f"Got reply but with no content - Status code {e.status_code} - It's possible that the filters were found, but contained no usable text (or contained only an image)."
status.update({'status': 'ERROR', 'text': txt})
except Exception as e:
status.update({'status': 'ERROR OTHER', 'length': len(contents) if contents else 0, 'text': 'Error: '+type(e).__name__+str(e)})
status.update({'status': 'ERROR OTHER', 'length': len(contents) if contents else 0, 'text': 'Error: '+str(e)})
else:
status.update({'status': 'OK', 'length': len(contents), 'text': ''})

View File

@@ -77,13 +77,11 @@ class ScreenshotUnavailable(Exception):
class ReplyWithContentButNoText(Exception):
def __init__(self, status_code, url, screenshot=None, has_filters=False, html_content=''):
def __init__(self, status_code, url, screenshot=None):
# Set this so we can use it in other parts of the app
self.status_code = status_code
self.url = url
self.screenshot = screenshot
self.has_filters = has_filters
self.html_content = html_content
return

View File

@@ -229,19 +229,16 @@ class ValidateJinja2Template(object):
def __call__(self, form, field):
from changedetectionio import notification
from jinja2 import Environment, BaseLoader, TemplateSyntaxError, UndefinedError
from jinja2 import Environment, BaseLoader, TemplateSyntaxError
from jinja2.meta import find_undeclared_variables
try:
jinja2_env = Environment(loader=BaseLoader)
jinja2_env.globals.update(notification.valid_tokens)
rendered = jinja2_env.from_string(field.data).render()
except TemplateSyntaxError as e:
raise ValidationError(f"This is not a valid Jinja2 template: {e}") from e
except UndefinedError as e:
raise ValidationError(f"A variable or function is not defined: {e}") from e
ast = jinja2_env.parse(field.data)
undefined = ", ".join(find_undeclared_variables(ast))

View File

@@ -10,7 +10,6 @@ import re
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'
# 'price' , 'lowPrice', 'highPrice' are usually under here
# all of those may or may not appear on different websites
LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
@@ -18,23 +17,7 @@ LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
class JSONNotFound(ValueError):
def __init__(self, msg):
ValueError.__init__(self, msg)
# Doesn't look like python supports forward slash auto enclosure in re.findall
# So convert it to inline flag "(?i)foobar" type configuration
def perl_style_slash_enclosed_regex_to_options(regex):
res = re.search(PERL_STYLE_REGEX, regex, re.IGNORECASE)
if res:
flags = res.group(2) if res.group(2) else 'i'
regex = f"(?{flags}){res.group(1)}"
else:
# Fall back to just ignorecase as an option
regex = f"(?i){regex}"
return regex
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
def include_filters(include_filters, html_content, append_pretty_line_formatting=False):
soup = BeautifulSoup(html_content, "html.parser")
@@ -212,14 +195,23 @@ def strip_ignore_text(content, wordlist, mode="content"):
output = []
ignore_text = []
ignore_regex = []
ignored_line_numbers = []
for k in wordlist:
# Is it a regex?
res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE)
if res:
ignore_regex.append(re.compile(perl_style_slash_enclosed_regex_to_options(k)))
x = re.search('^\/(.*)\/(.*)', k.strip())
if x:
# Starts with / but doesn't look like a regex
p = x.group(1)
try:
# @Todo python regex options can go before the regex str, but not really many of the options apply on a per-line basis
ignore_regex.append(re.compile(rf"{p}", re.IGNORECASE))
except Exception as e:
# Badly formed regex, treat as text
ignore_text.append(k.strip())
else:
# Had a / but doesn't work as regex
ignore_text.append(k.strip())
for line in content.splitlines():

View File

@@ -11,19 +11,17 @@ from changedetectionio import content_fetcher, html_tools
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
from copy import deepcopy
from . import difference_detection_processor
from ..html_tools import PERL_STYLE_REGEX
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
name = 'Webpage Text/HTML, JSON and PDF changes'
description = 'Detects all text changes where possible'
name = 'Webpage Text/HTML, JSON and PDF changes'
description = 'Detects all text changes where possible'
class FilterNotFoundInResponse(ValueError):
def __init__(self, msg):
ValueError.__init__(self, msg)
class PDFToHTMLToolNotFound(ValueError):
def __init__(self, msg):
ValueError.__init__(self, msg)
@@ -39,6 +37,19 @@ class perform_site_check(difference_detection_processor):
super().__init__(*args, **kwargs)
self.datastore = datastore
# Doesn't look like python supports forward slash auto enclosure in re.findall
# So convert it to inline flag "foobar(?i)" type configuration
def forward_slash_enclosed_regex_to_options(self, regex):
res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE)
if res:
regex = res.group(1)
regex += '(?{})'.format(res.group(2))
else:
regex += '(?{})'.format('i')
return regex
def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None):
changed_detected = False
screenshot = False # as bytes
@@ -124,8 +135,7 @@ class perform_site_check(difference_detection_processor):
# requests for PDF's, images etc should be passwd the is_binary flag
is_binary = watch.is_pdf
fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'),
is_binary=is_binary)
fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), is_binary=is_binary)
fetcher.quit()
self.screenshot = fetcher.screenshot
@@ -141,6 +151,7 @@ class perform_site_check(difference_detection_processor):
if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
raise content_fetcher.checksumFromPreviousCheckWasTheSame()
# Fetching complete, now filters
# @todo move to class / maybe inside of fetcher abstract base?
@@ -220,6 +231,8 @@ class perform_site_check(difference_detection_processor):
stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
is_html = False
if is_html or is_source:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
@@ -270,6 +283,7 @@ class perform_site_check(difference_detection_processor):
# Re #340 - return the content before the 'ignore text' was applied
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
# @todo whitespace coming from missing rtrim()?
# stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
# Rewrite's the processing text based on only what diff result they want to see
@@ -279,13 +293,13 @@ class perform_site_check(difference_detection_processor):
# needs to not include (added) etc or it may get used twice
# Replace the processed text with the preferred result
rendered_diff = diff.render_diff(previous_version_file_contents=watch.get_last_fetched_before_filters(),
newest_version_file_contents=stripped_text_from_html,
include_equal=False, # not the same lines
include_added=watch.get('filter_text_added', True),
include_removed=watch.get('filter_text_removed', True),
include_replaced=watch.get('filter_text_replaced', True),
line_feed_sep="\n",
include_change_type_prefix=False)
newest_version_file_contents=stripped_text_from_html,
include_equal=False, # not the same lines
include_added=watch.get('filter_text_added', True),
include_removed=watch.get('filter_text_removed', True),
include_replaced=watch.get('filter_text_replaced', True),
line_feed_sep="\n",
include_change_type_prefix=False)
watch.save_last_fetched_before_filters(text_content_before_ignored_filter)
@@ -300,12 +314,7 @@ class perform_site_check(difference_detection_processor):
# Treat pages with no renderable text content as a change? No by default
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
raise content_fetcher.ReplyWithContentButNoText(url=url,
status_code=fetcher.get_last_status_code(),
screenshot=screenshot,
has_filters=has_filter_rule,
html_content=html_content
)
raise content_fetcher.ReplyWithContentButNoText(url=url, status_code=fetcher.get_last_status_code(), screenshot=screenshot)
# We rely on the actual text in the html output.. many sites have random script vars etc,
# in the future we'll implement other mechanisms.
@@ -326,25 +335,16 @@ class perform_site_check(difference_detection_processor):
regex_matched_output = []
for s_re in extract_text:
# incase they specified something in '/.../x'
if re.search(PERL_STYLE_REGEX, s_re, re.IGNORECASE):
regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re)
result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
regex = self.forward_slash_enclosed_regex_to_options(s_re)
result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
for l in result:
if type(l) is tuple:
# @todo - some formatter option default (between groups)
regex_matched_output += list(l) + [b'\n']
else:
# @todo - some formatter option default (between each ungrouped result)
regex_matched_output += [l] + [b'\n']
else:
# Doesnt look like regex, just hunt for plaintext and return that which matches
# `stripped_text_from_html` will be bytes, so we must encode s_re also to bytes
r = re.compile(re.escape(s_re.encode('utf-8')), re.IGNORECASE)
res = r.findall(stripped_text_from_html)
if res:
for match in res:
regex_matched_output += [match] + [b'\n']
for l in result:
if type(l) is tuple:
# @todo - some formatter option default (between groups)
regex_matched_output += list(l) + [b'\n']
else:
# @todo - some formatter option default (between each ungrouped result)
regex_matched_output += [l] + [b'\n']
# Now we will only show what the regex matched
stripped_text_from_html = b''

View File

@@ -5,19 +5,14 @@ function isItemInStock() {
'agotado',
'artikel zurzeit vergriffen',
'as soon as stock is available',
'ausverkauft', // sold out
'available for back order',
'back-order or out of stock',
'backordered',
'benachrichtigt mich', // notify me
'brak na stanie',
'brak w magazynie',
'coming soon',
'currently have any tickets for this',
'currently unavailable',
'dostępne wkrótce',
'en rupture de stock',
'ist derzeit nicht auf lager',
'item is no longer available',
'message if back in stock',
'nachricht bei',
@@ -42,7 +37,6 @@ function isItemInStock() {
'unavailable tickets',
'we do not currently have an estimate of when this product will be back in stock.',
'zur zeit nicht an lager',
'已售完',
];

View File

@@ -208,7 +208,7 @@ $(document).ready(function () {
console.log(x);
if (x && first_available.length) {
// @todo will it let you click shit that has a layer ontop? probably not.
if (x['tagtype'] === 'text' || x['tagtype'] === 'number' || x['tagtype'] === 'email' || x['tagName'] === 'textarea' || x['tagtype'] === 'password' || x['tagtype'] === 'search') {
if (x['tagtype'] === 'text' || x['tagtype'] === 'email' || x['tagName'] === 'textarea' || x['tagtype'] === 'password' || x['tagtype'] === 'search') {
$('select', first_available).val('Enter text in field').change();
$('input[type=text]', first_available).first().val(x['xpath']);
$('input[placeholder="Value"]', first_available).addClass('ok').click().focus();

View File

@@ -44,7 +44,7 @@
#browser-steps .flex-wrapper {
display: flex;
flex-flow: row;
height: 70vh;
height: 600px; /*@todo make this dynamic */
}
/* this is duplicate :( */

View File

@@ -50,7 +50,8 @@
#browser-steps .flex-wrapper {
display: flex;
flex-flow: row;
height: 70vh; }
height: 600px;
/*@todo make this dynamic */ }
/* this is duplicate :( */
#browsersteps-selector-wrapper {

View File

@@ -378,16 +378,15 @@ Unavailable") }}
{{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }}
<span class="pure-form-message-inline">
<ul>
<li>Extracts text in the final output (line by line) after other filters using regular expressions or string match;
<li>Extracts text in the final output (line by line) after other filters using regular expressions;
<ul>
<li>Regular expression &dash; example <code>/reports.+?2022/i</code></li>
<li>Don't forget to consider the white-space at the start of a line <code>/.+?reports.+?2022/i</code></li>
<li>Use <code>//(?aiLmsux))</code> type flags (more <a href="https://docs.python.org/3/library/re.html#index-15">information here</a>)<br></li>
<li>Keyword example &dash; example <code>Out of stock</code></li>
<li>Use groups to extract just that text &dash; example <code>/reports.+?(\d+)/i</code> returns a list of years only</li>
</ul>
</li>
<li>One line per regular-expression/string match</li>
<li>One line per regular-expression/ string match</li>
</ul>
</span>
</div>

View File

@@ -119,9 +119,6 @@
<a href="{{ url_for('settings_page', uuid=watch.uuid) }}#proxies">Try adding external proxies/locations</a>
{% endif %}
{% if 'empty result or contain only an image' in watch.last_error %}
<a href="https://github.com/dgtlmoon/changedetection.io/wiki/Detecting-changes-in-images">more help here</a>.
{% endif %}
</div>
{% endif %}
{% if watch.last_notification_error is defined and watch.last_notification_error != False %}

View File

@@ -2,7 +2,7 @@
import time
from flask import url_for
from .util import live_server_setup, wait_for_all_checks
from . util import live_server_setup
from ..html_tools import *
@@ -176,77 +176,3 @@ def test_check_multiple_filters(client, live_server):
assert b"Blob A" in res.data # CSS was ok
assert b"Blob B" in res.data # xPath was ok
assert b"Blob C" not in res.data # Should not be included
# The filter exists, but did not contain anything useful
# Mainly used when the filter contains just an IMG, this can happen when someone selects an image in the visual-selector
# Tests fetcher can throw a "ReplyWithContentButNoText" exception after applying filter and extracting text
def test_filter_is_empty_help_suggestion(client, live_server):
#live_server_setup(live_server)
include_filters = "#blob-a"
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write("""<html><body>
<div id="blob-a">
<img src="something.jpg">
</div>
</body>
</html>
""")
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
# Goto the edit page, add our ignore text
# Add our URL to the import page
res = client.post(
url_for("edit_page", uuid="first"),
data={"include_filters": include_filters,
"url": test_url,
"tags": "",
"headers": "",
'fetch_backend': "html_requests"},
follow_redirects=True
)
assert b"Updated watch." in res.data
wait_for_all_checks(client)
res = client.get(
url_for("index"),
follow_redirects=True
)
assert b'empty result or contain only an image' in res.data
### Just an empty selector, no image
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write("""<html><body>
<div id="blob-a">
<!-- doo doo -->
</div>
</body>
</html>
""")
res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.get(
url_for("index"),
follow_redirects=True
)
assert b'empty result or contain only an image' not in res.data
assert b'but contained no usable text' in res.data

View File

@@ -2,7 +2,7 @@
import time
from flask import url_for
from .util import live_server_setup, wait_for_all_checks
from .util import live_server_setup
from ..html_tools import *
@@ -55,8 +55,6 @@ def set_multiline_response():
</p>
<div>aaand something lines</div>
<br>
<div>and this should be</div>
</body>
</html>
"""
@@ -68,10 +66,11 @@ def set_multiline_response():
def test_setup(client, live_server):
live_server_setup(live_server)
def test_check_filter_multiline(client, live_server):
#live_server_setup(live_server)
set_multiline_response()
# Add our URL to the import page
@@ -83,15 +82,14 @@ def test_check_filter_multiline(client, live_server):
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
time.sleep(3)
# Goto the edit page, add our ignore text
# Add our URL to the import page
res = client.post(
url_for("edit_page", uuid="first"),
data={"include_filters": '',
# Test a regex and a plaintext
'extract_text': '/something.+?6 billion.+?lines/si\r\nand this should be',
'extract_text': '/something.+?6 billion.+?lines/si',
"url": test_url,
"tags": "",
"headers": "",
@@ -101,19 +99,13 @@ def test_check_filter_multiline(client, live_server):
)
assert b"Updated watch." in res.data
wait_for_all_checks(client)
res = client.get(url_for("index"))
# Issue 1828
assert b'not at the start of the expression' not in res.data
time.sleep(3)
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
# Plaintext that doesnt look like a regex should match also
assert b'and this should be' in res.data
assert b'<div class="">Something' in res.data
assert b'<div class="">across 6 billion multiple' in res.data
@@ -123,11 +115,14 @@ def test_check_filter_multiline(client, live_server):
assert b'aaand something lines' not in res.data
def test_check_filter_and_regex_extract(client, live_server):
sleep_time_for_fetch_thread = 3
include_filters = ".changetext"
set_original_response()
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
res = client.post(
@@ -137,15 +132,19 @@ def test_check_filter_and_regex_extract(client, live_server):
)
assert b"1 Imported" in res.data
time.sleep(1)
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# Goto the edit page, add our ignore text
# Add our URL to the import page
res = client.post(
url_for("edit_page", uuid="first"),
data={"include_filters": include_filters,
'extract_text': '/\d+ online/\r\n/\d+ guests/\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i\r\n/issue1828.+?2022/i',
'extract_text': '\d+ online\r\n\d+ guests\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i',
"url": test_url,
"tags": "",
"headers": "",
@@ -156,13 +155,8 @@ def test_check_filter_and_regex_extract(client, live_server):
assert b"Updated watch." in res.data
# Give the thread time to pick it up
wait_for_all_checks(client)
res = client.get(url_for("index"))
#issue 1828
assert b'not at the start of the expression' not in res.data
time.sleep(sleep_time_for_fetch_thread)
# Make a change
set_modified_response()
@@ -170,7 +164,7 @@ def test_check_filter_and_regex_extract(client, live_server):
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# It should have 'unviewed' still
# Because it should be looking at only that 'sametext' id

View File

@@ -2,7 +2,7 @@
import time
from flask import url_for
from .util import live_server_setup, wait_for_all_checks
from . util import live_server_setup
def set_original_ignore_response():
@@ -26,8 +26,13 @@ def test_trigger_regex_functionality(client, live_server):
live_server_setup(live_server)
sleep_time_for_fetch_thread = 3
set_original_ignore_response()
# Give the endpoint time to spin up
time.sleep(1)
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
res = client.post(
@@ -38,7 +43,7 @@ def test_trigger_regex_functionality(client, live_server):
assert b"1 Imported" in res.data
# Give the thread time to pick it up
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (just a new one shouldnt have anything)
res = client.get(url_for("index"))
@@ -52,7 +57,7 @@ def test_trigger_regex_functionality(client, live_server):
"fetch_backend": "html_requests"},
follow_redirects=True
)
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# so that we set the state to 'unviewed' after all the edits
client.get(url_for("diff_history_page", uuid="first"))
@@ -60,7 +65,7 @@ def test_trigger_regex_functionality(client, live_server):
f.write("some new noise")
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (nothing should match the regex)
res = client.get(url_for("index"))
@@ -70,7 +75,7 @@ def test_trigger_regex_functionality(client, live_server):
f.write("regex test123<br>\nsomething 123")
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
time.sleep(sleep_time_for_fetch_thread)
res = client.get(url_for("index"))
assert b'unviewed' in res.data

View File

@@ -3,7 +3,7 @@ import threading
import queue
import time
from changedetectionio import content_fetcher, html_tools
from changedetectionio import content_fetcher
from .processors.text_json_diff import FilterNotFoundInResponse
from .processors.restock_diff import UnableToExtractRestockData
@@ -251,20 +251,7 @@ class update_worker(threading.Thread):
# Totally fine, it's by choice - just continue on, nothing more to care about
# Page had elements/content but no renderable text
# Backend (not filters) gave zero output
extra_help = ""
if e.has_filters:
# Maybe it contains an image? offer a more helpful link
has_img = html_tools.include_filters(include_filters='img',
html_content=e.html_content)
if has_img:
extra_help = ", it's possible that the filters you have give an empty result or contain only an image."
else:
extra_help = ", it's possible that the filters were found, but contained no usable text."
self.datastore.update_watch(uuid=uuid, update_obj={
'last_error': f"Got HTML content but no text found (With {e.status_code} reply code){extra_help}"
})
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found (With {} reply code).".format(e.status_code)})
if e.screenshot:
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot)
process_changedetection_results = False