mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2026-02-21 05:36:06 +00:00
Compare commits
4 Commits
722-docker
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7a51f1e4bf | ||
|
|
91dee697f9 | ||
|
|
4128acf95a | ||
|
|
7c8d59c795 |
@@ -2,7 +2,7 @@
|
||||
|
||||
# Read more https://github.com/dgtlmoon/changedetection.io/wiki
|
||||
# Semver means never use .01, or 00. Should be .1.
|
||||
__version__ = '0.53.4'
|
||||
__version__ = '0.53.5'
|
||||
|
||||
from changedetectionio.strtobool import strtobool
|
||||
from json.decoder import JSONDecodeError
|
||||
|
||||
@@ -27,7 +27,6 @@ from flask import (
|
||||
session,
|
||||
url_for,
|
||||
)
|
||||
from flask_compress import Compress as FlaskCompress
|
||||
from flask_restful import abort, Api
|
||||
from flask_cors import CORS
|
||||
|
||||
@@ -74,14 +73,14 @@ CORS(app)
|
||||
# There's also a bug between flask compress and socketio that causes some kind of slow memory leak
|
||||
# It's better to use compression on your reverse proxy (nginx etc) instead.
|
||||
if strtobool(os.getenv("FLASK_ENABLE_COMPRESSION")):
|
||||
from flask_compress import Compress as FlaskCompress
|
||||
app.config['COMPRESS_MIN_SIZE'] = 2096
|
||||
app.config['COMPRESS_MIMETYPES'] = ['text/html', 'text/css', 'text/javascript', 'application/json', 'application/javascript', 'image/svg+xml']
|
||||
# Use gzip only - smaller memory footprint than zstd/brotli (4-8KB vs 200-500KB contexts)
|
||||
app.config['COMPRESS_ALGORITHM'] = ['gzip']
|
||||
compress = FlaskCompress()
|
||||
compress.init_app(app)
|
||||
|
||||
compress = FlaskCompress()
|
||||
|
||||
compress.init_app(app)
|
||||
app.config['TEMPLATES_AUTO_RELOAD'] = False
|
||||
|
||||
|
||||
|
||||
@@ -561,31 +561,33 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
|
||||
)
|
||||
else:
|
||||
parser_config = None
|
||||
|
||||
if is_rss:
|
||||
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
|
||||
html_content = re.sub(r'</title>', r'</h1>', html_content)
|
||||
else:
|
||||
# Strip bloat in one pass, SPA's often dump 10Mb+ into the <head> for styles, which is not needed
|
||||
# Causing inscriptis to silently exit when more than ~10MB is found.
|
||||
# All we are doing here is converting the HTML to text, no CSS layout etc
|
||||
# Use backreference (\1) to ensure opening/closing tags match (prevents <style> matching </svg> in CSS data URIs)
|
||||
html_content = re.sub(r'<(style|script|svg|noscript)[^>]*>.*?</\1>|<(?:link|meta)[^>]*/?>|<!--.*?-->',
|
||||
'', html_content, flags=re.DOTALL | re.IGNORECASE)
|
||||
# Use BS4 html.parser to strip bloat — SPA's often dump 10MB+ of CSS/JS into <head>,
|
||||
# causing inscriptis to silently give up. Regex-based stripping is unsafe because tags
|
||||
# can appear inside JSON data attributes with JS-escaped closing tags (e.g. <\/script>),
|
||||
# causing the regex to scan past the intended close and eat real page content.
|
||||
from bs4 import BeautifulSoup
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
# Strip tags that inscriptis cannot render as meaningful text and which can be very large.
|
||||
# svg/math: produce path-data/MathML garbage; canvas/iframe/template: no inscriptis handlers.
|
||||
# video/audio/picture are kept — they may contain meaningful fallback text or captions.
|
||||
for tag in soup.find_all(['head', 'script', 'style', 'noscript', 'svg',
|
||||
'math', 'canvas', 'iframe', 'template']):
|
||||
tag.decompose()
|
||||
|
||||
# SPAs often use <body style="display:none"> to hide content until JS loads
|
||||
# inscriptis respects CSS display rules, so we need to remove these hiding styles
|
||||
# to extract the actual page content
|
||||
body_style_pattern = r'(<body[^>]*)\s+style\s*=\s*["\']([^"\']*\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b[^"\']*)["\']'
|
||||
|
||||
# Check if body has hiding styles that need to be fixed
|
||||
body_match = re.search(body_style_pattern, html_content, flags=re.IGNORECASE)
|
||||
if body_match:
|
||||
from loguru import logger
|
||||
logger.debug(f"html_to_text: Removing hiding styles from body tag (found: '{body_match.group(2)}')")
|
||||
|
||||
html_content = re.sub(body_style_pattern, r'\1', html_content, flags=re.IGNORECASE)
|
||||
# SPAs often use <body style="display:none"> to hide content until JS loads.
|
||||
# inscriptis respects CSS display rules, so strip hiding styles from the body tag.
|
||||
body_tag = soup.find('body')
|
||||
if body_tag and body_tag.get('style'):
|
||||
style = body_tag['style']
|
||||
if re.search(r'\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b', style, re.IGNORECASE):
|
||||
logger.debug(f"html_to_text: Removing hiding styles from body tag (found: '{style}')")
|
||||
del body_tag['style']
|
||||
|
||||
html_content = str(soup)
|
||||
|
||||
text_content = get_text(html_content, config=parser_config)
|
||||
return text_content
|
||||
|
||||
@@ -199,11 +199,25 @@ def handle_watch_update(socketio, **kwargs):
|
||||
logger.error(f"Socket.IO error in handle_watch_update: {str(e)}")
|
||||
|
||||
|
||||
def _patch_flask_request_context_session():
|
||||
"""Flask 3.1 removed the session setter from RequestContext, but Flask-SocketIO 5.6.0
|
||||
still assigns to it directly (ctx.session = ...). Restore a setter that writes the
|
||||
private _session attribute so the two libraries work together.
|
||||
"""
|
||||
from flask.ctx import RequestContext
|
||||
if getattr(RequestContext.session, 'fset', None) is not None:
|
||||
return # Already has a setter (future Flask version restored it)
|
||||
original_prop = RequestContext.session
|
||||
RequestContext.session = original_prop.setter(lambda self, value: setattr(self, '_session', value))
|
||||
|
||||
|
||||
def init_socketio(app, datastore):
|
||||
"""Initialize SocketIO with the main Flask app"""
|
||||
import platform
|
||||
import sys
|
||||
|
||||
_patch_flask_request_context_session()
|
||||
|
||||
# Platform-specific async_mode selection for better stability
|
||||
system = platform.system().lower()
|
||||
python_version = sys.version_info
|
||||
|
||||
@@ -453,6 +453,175 @@ class TestHtmlToText(unittest.TestCase):
|
||||
|
||||
|
||||
|
||||
def test_script_with_closing_tag_in_string_does_not_eat_content(self):
|
||||
"""
|
||||
Script tag containing </script> inside a JS string must not prematurely end the block.
|
||||
|
||||
This is the classic regex failure mode: the old pattern would find the first </script>
|
||||
inside the JS string literal and stop there, leaving the tail of the script block
|
||||
(plus any following content) exposed as raw text. BS4 parses the HTML correctly.
|
||||
"""
|
||||
html = '''<html><body>
|
||||
<p>Before script</p>
|
||||
<script>
|
||||
var html = "<div>foo<\\/script><p>bar</p>";
|
||||
var also = 1;
|
||||
</script>
|
||||
<p>AFTER SCRIPT</p>
|
||||
</body></html>'''
|
||||
|
||||
text = html_to_text(html)
|
||||
assert 'Before script' in text
|
||||
assert 'AFTER SCRIPT' in text
|
||||
# Script internals must not leak
|
||||
assert 'var html' not in text
|
||||
assert 'var also' not in text
|
||||
|
||||
def test_content_sandwiched_between_multiple_body_scripts(self):
|
||||
"""Content between multiple script/style blocks in the body must all survive."""
|
||||
html = '''<html><body>
|
||||
<script>var a = 1;</script>
|
||||
<p>CONTENT A</p>
|
||||
<style>.x { color: red; }</style>
|
||||
<p>CONTENT B</p>
|
||||
<script>var b = 2;</script>
|
||||
<p>CONTENT C</p>
|
||||
<style>.y { color: blue; }</style>
|
||||
<p>CONTENT D</p>
|
||||
</body></html>'''
|
||||
|
||||
text = html_to_text(html)
|
||||
for label in ['CONTENT A', 'CONTENT B', 'CONTENT C', 'CONTENT D']:
|
||||
assert label in text, f"'{label}' was eaten by script/style stripping"
|
||||
assert 'var a' not in text
|
||||
assert 'var b' not in text
|
||||
assert 'color: red' not in text
|
||||
assert 'color: blue' not in text
|
||||
|
||||
def test_unicode_and_international_content_preserved(self):
|
||||
"""Non-ASCII content (umlauts, CJK, soft hyphens) must survive stripping."""
|
||||
html = '''<html><body>
|
||||
<style>.x{color:red}</style>
|
||||
<p>German: Aus\xadge\xadbucht! — ANMELDUNG — Fan\xadday 2026</p>
|
||||
<p>Chinese: \u6ce8\u518c</p>
|
||||
<p>Japanese: \u767b\u9332</p>
|
||||
<p>Korean: \ub4f1\ub85d</p>
|
||||
<p>Emoji: \U0001f4e2</p>
|
||||
<script>var x = 1;</script>
|
||||
</body></html>'''
|
||||
|
||||
text = html_to_text(html)
|
||||
assert 'ANMELDUNG' in text
|
||||
assert '\u6ce8\u518c' in text # Chinese
|
||||
assert '\u767b\u9332' in text # Japanese
|
||||
assert '\ub4f1\ub85d' in text # Korean
|
||||
|
||||
def test_style_with_type_attribute_is_stripped(self):
|
||||
"""<style type="text/css"> (with type attribute) must be stripped just like bare <style>."""
|
||||
html = '''<html><body>
|
||||
<style type="text/css">.important { display: none; }</style>
|
||||
<p>VISIBLE CONTENT</p>
|
||||
</body></html>'''
|
||||
|
||||
text = html_to_text(html)
|
||||
assert 'VISIBLE CONTENT' in text
|
||||
assert '.important' not in text
|
||||
assert 'display: none' not in text
|
||||
|
||||
def test_ldjson_script_is_stripped(self):
|
||||
"""<script type="application/ld+json"> must be stripped — raw JSON must not appear as text."""
|
||||
html = '''<html><body>
|
||||
<script type="application/ld+json">
|
||||
{"@type": "Product", "name": "Widget", "price": "9.99"}
|
||||
</script>
|
||||
<p>PRODUCT PAGE</p>
|
||||
</body></html>'''
|
||||
|
||||
text = html_to_text(html)
|
||||
assert 'PRODUCT PAGE' in text
|
||||
assert '@type' not in text
|
||||
assert '"price"' not in text
|
||||
|
||||
def test_inline_svg_is_stripped_entirely(self):
|
||||
"""
|
||||
Inline SVG elements in the body are stripped by BS4 before passing to inscriptis.
|
||||
SVGs can be huge (icon libraries, data visualisations) and produce garbage path-data
|
||||
text. The old regex code explicitly stripped <svg>; the BS4 path must do the same.
|
||||
"""
|
||||
html = '''<html><body>
|
||||
<p>Before SVG</p>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
|
||||
<path d="M14 5L7 12L14 19Z" fill="none"/>
|
||||
<circle cx="12" cy="12" r="10"/>
|
||||
</svg>
|
||||
<p>After SVG</p>
|
||||
</body></html>'''
|
||||
|
||||
text = html_to_text(html)
|
||||
assert 'Before SVG' in text
|
||||
assert 'After SVG' in text
|
||||
assert 'M14 5L7' not in text, "SVG path data should not appear in text output"
|
||||
assert 'viewBox' not in text, "SVG attributes should not appear in text output"
|
||||
|
||||
def test_tag_inside_json_data_attribute_does_not_eat_content(self):
|
||||
"""
|
||||
Tags inside JSON data attributes with JS-escaped closing tags must not eat real content.
|
||||
|
||||
Real-world case: Elementor/JetEngine WordPress widgets embed HTML (including SVG icons)
|
||||
inside JSON data attributes like data-slider-atts. The HTML inside is JS-escaped, so
|
||||
closing tags appear as <\\/svg> rather than </svg>.
|
||||
|
||||
The old regex approach would find <svg> inside the attribute value, then fail to find
|
||||
<\/svg> as a matching close tag, and scan forward to the next real </svg> in the DOM —
|
||||
eating tens of kilobytes of actual page content in the process.
|
||||
"""
|
||||
html = '''<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Test</title></head>
|
||||
<body>
|
||||
<div class="slider" data-slider-atts="{"prevArrow":"<i class=\\"icon\\"><svg width=\\"24\\" height=\\"24\\" viewBox=\\"0 0 24 24\\" xmlns=\\"http:\\/\\/www.w3.org\\/2000\\/svg\\"><path d=\\"M14 5L7 12L14 19\\"\\/><\\/svg><\\/i>"}">
|
||||
</div>
|
||||
<div class="content">
|
||||
<h1>IMPORTANT CONTENT</h1>
|
||||
<p>This text must not be eaten by the tag-stripping logic.</p>
|
||||
</div>
|
||||
<svg><circle cx="50" cy="50" r="40"/></svg>
|
||||
</body>
|
||||
</html>'''
|
||||
|
||||
text = html_to_text(html)
|
||||
|
||||
assert 'IMPORTANT CONTENT' in text, (
|
||||
"Content after a JS-escaped tag in a data attribute was incorrectly stripped. "
|
||||
"The tag-stripping logic is matching <tag> inside attribute values and scanning "
|
||||
"forward to the next real closing tag in the DOM."
|
||||
)
|
||||
assert 'This text must not be eaten' in text
|
||||
|
||||
def test_script_inside_json_data_attribute_does_not_eat_content(self):
|
||||
"""Same issue as above but with <script> embedded in a data attribute with JS-escaped closing tag."""
|
||||
html = '''<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Test</title></head>
|
||||
<body>
|
||||
<div data-config="{"template":"<script type=\\"text\\/javascript\\">var x=1;<\\/script>"}">
|
||||
</div>
|
||||
<div>
|
||||
<h1>MUST SURVIVE</h1>
|
||||
<p>Real content after the data attribute with embedded script tag.</p>
|
||||
</div>
|
||||
<script>var real = 1;</script>
|
||||
</body>
|
||||
</html>'''
|
||||
|
||||
text = html_to_text(html)
|
||||
|
||||
assert 'MUST SURVIVE' in text, (
|
||||
"Content after a JS-escaped <script> in a data attribute was incorrectly stripped."
|
||||
)
|
||||
assert 'Real content after the data attribute' in text
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Can run this file directly for quick testing
|
||||
unittest.main()
|
||||
|
||||
@@ -9,10 +9,15 @@ flask_restful
|
||||
flask_cors # For the Chrome extension to operate
|
||||
# janus # No longer needed - using pure threading.Queue for multi-loop support
|
||||
flask_wtf~=1.2
|
||||
flask~=3.1
|
||||
flask-socketio~=5.6.0
|
||||
python-socketio~=5.16.1
|
||||
python-engineio~=4.13.1
|
||||
# Flask 3.1 removed the session setter on RequestContext; the patch in
|
||||
# changedetectionio/realtime/socket_server.py restores it so Flask-SocketIO works.
|
||||
# Require >=3.1 so the patch is always needed; <4 guards against unknown breaking changes.
|
||||
flask>=3.1,<4
|
||||
# Flask-SocketIO 5.x still does ctx.session = ... directly; the patch above handles it.
|
||||
# >=5.5.0 ensures the threading async_mode we rely on is available.
|
||||
flask-socketio>=5.5.0,<6
|
||||
python-socketio>=5.11.0,<6
|
||||
python-engineio>=4.9.0,<5
|
||||
inscriptis~=2.2
|
||||
pytz
|
||||
timeago~=1.0
|
||||
|
||||
Reference in New Issue
Block a user