mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-11-19 16:06:10 +00:00
Process text/* non-HTML in their original format keeping line breaks, auto-detect attachments/downloads for text or HTML, WARNING - Will trigger false changes for some existing text file watches #3434 (#3435)
Some checks failed
Build and push containers / metadata (push) Has been cancelled
Build and push containers / build-push-containers (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled
ChangeDetection.io App Test / lint-code (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built 📦 package works basically. (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled
Some checks failed
Build and push containers / metadata (push) Has been cancelled
Build and push containers / build-push-containers (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Build distribution 📦 (push) Has been cancelled
ChangeDetection.io App Test / lint-code (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Test the built 📦 package works basically. (push) Has been cancelled
Publish Python 🐍distribution 📦 to PyPI and TestPyPI / Publish Python 🐍 distribution 📦 to PyPI (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-10 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-11 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-12 (push) Has been cancelled
ChangeDetection.io App Test / test-application-3-13 (push) Has been cancelled
This commit is contained in:
@@ -153,12 +153,26 @@ class perform_site_check(difference_detection_processor):
|
||||
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
||||
self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
|
||||
html_content = self.fetcher.content
|
||||
content_type = self.fetcher.get_all_headers().get('content-type', '').lower()
|
||||
is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower()
|
||||
|
||||
# If not JSON, and if it's not text/plain..
|
||||
if 'text/plain' in self.fetcher.get_all_headers().get('content-type', '').lower():
|
||||
# Try to detect better mime types if its a download or not announced as HTML
|
||||
if is_attachment or 'octet-stream' in content_type or not 'html' in content_type:
|
||||
logger.debug(f"Got a reply that may be a download or possibly a text attachment, checking..")
|
||||
try:
|
||||
import magic
|
||||
mime = magic.from_buffer(html_content, mime=True)
|
||||
logger.debug(f"Guessing mime type, original content_type '{content_type}', mime type detected '{mime}'")
|
||||
if mime and "/" in mime: # looks valid and is a valid mime type
|
||||
content_type = mime
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}")
|
||||
|
||||
if 'text/' in content_type and not 'html' in content_type:
|
||||
# Don't run get_text or xpath/css filters on plaintext
|
||||
stripped_text_from_html = html_content
|
||||
else:
|
||||
# If not JSON, and if it's not text/plain..
|
||||
# Does it have some ld+json price data? used for easier monitoring
|
||||
update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(self.fetcher.content)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user