# HTML to TEXT/JSON DIFFERENCE self.fetcher import hashlib import json import os import re import urllib3 from changedetectionio.conditions import execute_ruleset_against_all_plugins from changedetectionio.processors import difference_detection_processor from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text, TRANSLATE_WHITESPACE_TABLE from changedetectionio import html_tools, content_fetchers from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT from loguru import logger from changedetectionio.processors.magic import guess_stream_type urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) name = 'Webpage Text/HTML, JSON and PDF changes' description = 'Detects all text changes where possible' json_filter_prefixes = ['json:', 'jq:', 'jqraw:'] # Assume it's this type if the server says nothing on content-type DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER = 'text/html' class FilterNotFoundInResponse(ValueError): def __init__(self, msg, screenshot=None, xpath_data=None): self.screenshot = screenshot self.xpath_data = xpath_data ValueError.__init__(self, msg) class PDFToHTMLToolNotFound(ValueError): def __init__(self, msg): ValueError.__init__(self, msg) # Some common stuff here that can be moved to a base class # (set_proxy_from_list) class perform_site_check(difference_detection_processor): def run_changedetection(self, watch): changed_detected = False html_content = "" screenshot = False # as bytes stripped_text_from_html = "" if not watch: raise Exception("Watch no longer exists.") ctype_header = self.fetcher.get_all_headers().get('content-type', DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER).lower() stream_content_type = guess_stream_type(http_content_header=ctype_header, content=self.fetcher.content) # Unset any existing notification error update_obj = {'last_notification_error': False, 'last_error': False} url = watch.link self.screenshot = self.fetcher.screenshot self.xpath_data = self.fetcher.xpath_data # Track the content type update_obj['content_type'] = ctype_header # Watches added automatically in the queue manager will skip if its the same checksum as the previous run # Saves a lot of CPU update_obj['previous_md5_before_filters'] = hashlib.md5(self.fetcher.content.encode('utf-8')).hexdigest() # Fetching complete, now filters # @note: I feel like the following should be in a more obvious chain system # - Check filter text # - Is the checksum different? # - Do we convert to JSON? # https://stackoverflow.com/questions/41817578/basic-method-chaining ? # return content().textfilter().jsonextract().checksumcompare() ? # Go into RSS preprocess for converting CDATA/comment to usable text if stream_content_type.is_rss: self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content) if watch.is_pdf or stream_content_type.is_pdf: from shutil import which tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml") if not which(tool): raise PDFToHTMLToolNotFound("Command-line `{}` tool was not found in system PATH, was it installed?".format(tool)) import subprocess proc = subprocess.Popen( [tool, '-stdout', '-', '-s', 'out.pdf', '-i'], stdout=subprocess.PIPE, stdin=subprocess.PIPE) proc.stdin.write(self.fetcher.raw_content) proc.stdin.close() self.fetcher.content = proc.stdout.read().decode('utf-8') proc.wait(timeout=60) # Add a little metadata so we know if the file changes (like if an image changes, but the text is the same # @todo may cause problems with non-UTF8? metadata = "
Added by changedetection.io: Document checksum - {} Filesize - {} bytes
".format( hashlib.md5(self.fetcher.raw_content).hexdigest().upper(), len(self.fetcher.content)) self.fetcher.content = self.fetcher.content.replace('