# HTML to TEXT/JSON DIFFERENCE self.fetcher
import hashlib
import json
import os
import re
import urllib3
from changedetectionio.conditions import execute_ruleset_against_all_plugins
from changedetectionio.processors import difference_detection_processor
from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text, TRANSLATE_WHITESPACE_TABLE
from changedetectionio import html_tools, content_fetchers
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
from loguru import logger
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
name = 'Webpage Text/HTML, JSON and PDF changes'
description = 'Detects all text changes where possible'
json_filter_prefixes = ['json:', 'jq:', 'jqraw:']
# Assume it's this type if the server says nothing on content-type
DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER = 'text/html'
# When to apply the 'cdata to real HTML' hack
# @todo Some heuristic check instead? first and last bytes? maybe some new def that gets header+first 200 bytes? then we can unittest
RSS_XML_CONTENT_TYPES = [
"application/rss+xml",
"application/rdf+xml",
"text/xml",
"application/xml",
"application/atom+xml",
"text/rss+xml", # rare, non-standard
"application/x-rss+xml", # legacy (older feed software)
"application/x-atom+xml", # legacy (older Atom)
]
# JSON Content-types
# @todo Some heuristic check instead? first and last bytes? maybe some new def that gets header+first 200 bytes? then we can unittest
JSON_CONTENT_TYPES = [
"application/activity+json",
"application/feed+json",
"application/json",
"application/ld+json",
"application/vnd.api+json",
]
class FilterNotFoundInResponse(ValueError):
def __init__(self, msg, screenshot=None, xpath_data=None):
self.screenshot = screenshot
self.xpath_data = xpath_data
ValueError.__init__(self, msg)
class PDFToHTMLToolNotFound(ValueError):
def __init__(self, msg):
ValueError.__init__(self, msg)
# Some common stuff here that can be moved to a base class
# (set_proxy_from_list)
class perform_site_check(difference_detection_processor):
def run_changedetection(self, watch):
changed_detected = False
html_content = ""
screenshot = False # as bytes
stripped_text_from_html = ""
if not watch:
raise Exception("Watch no longer exists.")
ctype_header = self.fetcher.get_all_headers().get('content-type', DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER).lower()
# Try to detect better mime types if its a download or not announced as HTML
#@todo also goes into our heurestic detcet class
if 'attachment' in self.fetcher.get_all_headers().get('content-disposition','').lower() or 'octet-stream' in ctype_header:
logger.debug(f"Got a reply that may be a download or possibly a text attachment, checking..")
try:
import magic
mime = magic.from_buffer(self.fetcher.content, mime=True)
logger.debug(f"Guessing mime type, original content_type '{ctype_header}', mime type detected '{mime}'")
if mime and "/" in mime: # looks valid and is a valid mime type
ctype_header = mime
except Exception as e:
logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}")
# Unset any existing notification error
update_obj = {'last_notification_error': False, 'last_error': False}
url = watch.link
self.screenshot = self.fetcher.screenshot
self.xpath_data = self.fetcher.xpath_data
# Track the content type
update_obj['content_type'] = ctype_header
# Watches added automatically in the queue manager will skip if its the same checksum as the previous run
# Saves a lot of CPU
update_obj['previous_md5_before_filters'] = hashlib.md5(self.fetcher.content.encode('utf-8')).hexdigest()
# Fetching complete, now filters
# @note: I feel like the following should be in a more obvious chain system
# - Check filter text
# - Is the checksum different?
# - Do we convert to JSON?
# https://stackoverflow.com/questions/41817578/basic-method-chaining ?
# return content().textfilter().jsonextract().checksumcompare() ?
is_json = any(s in ctype_header for s in JSON_CONTENT_TYPES)
is_html = not is_json
is_rss = False
# Go into RSS preprocess for converting CDATA/comment to usable text
if '', metadata + '