diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py index 7845c897..60ec7753 100644 --- a/changedetectionio/processors/text_json_diff/processor.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -227,7 +227,7 @@ class ContentProcessor: """Convert CDATA/comments in RSS to usable text.""" return cdata_in_document_to_text(html_content=content) - def preprocess_pdf(self, content, raw_content): + def preprocess_pdf(self, raw_content): """Convert PDF to HTML using external tool.""" from shutil import which tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml") @@ -251,7 +251,7 @@ class ContentProcessor: metadata = ( f"

Added by changedetection.io: Document checksum - " f"{hashlib.md5(raw_content).hexdigest().upper()} " - f"Filesize - {len(html_content)} bytes

" + f"Original file size - {len(raw_content)} bytes

" ) return html_content.replace('', metadata + '') @@ -384,7 +384,8 @@ class perform_site_check(difference_detection_processor): # PDF preprocessing if watch.is_pdf or stream_content_type.is_pdf: - content = content_processor.preprocess_pdf(content, self.fetcher.raw_content) + content = content_processor.preprocess_pdf(raw_content=self.fetcher.raw_content) + stream_content_type.is_html = True # JSON preprocessing if stream_content_type.is_json: @@ -414,6 +415,9 @@ class perform_site_check(difference_detection_processor): if watch.is_source_type_url: # For source URLs, keep raw content stripped_text = html_content + elif stream_content_type.is_plaintext: + # For plaintext, keep as-is without HTML-to-text conversion + stripped_text = html_content else: # Extract text from HTML/RSS content (not generic XML) if stream_content_type.is_html or stream_content_type.is_rss: diff --git a/changedetectionio/tests/test_pdf.py b/changedetectionio/tests/test_pdf.py index 74858cfa..64212bae 100644 --- a/changedetectionio/tests/test_pdf.py +++ b/changedetectionio/tests/test_pdf.py @@ -8,25 +8,30 @@ from .util import set_original_response, set_modified_response, live_server_setu # `subtractive_selectors` should still work in `source:` type requests def test_fetch_pdf(client, live_server, measure_memory_usage): import shutil - shutil.copy("tests/test.pdf", "test-datastore/endpoint-test.pdf") + import os + + shutil.copy("tests/test.pdf", "test-datastore/endpoint-test.pdf") + first_version_size = os.path.getsize("test-datastore/endpoint-test.pdf") - # live_server_setup(live_server) # Setup on conftest per function test_url = url_for('test_pdf_endpoint', _external=True) - # Add our URL to the import page uuid = client.application.config.get('DATASTORE').add_watch(url=test_url) client.get(url_for("ui.form_watch_checknow"), follow_redirects=True) - wait_for_all_checks(client) - res = client.get( - url_for("ui.ui_views.preview_page", uuid="first"), - follow_redirects=True - ) + watch = live_server.app.config['DATASTORE'].data['watching'][uuid] + dates = list(watch.history.keys()) + snapshot_contents = watch.get_history_snapshot(dates[0]) # PDF header should not be there (it was converted to text) - assert b'PDF' not in res.data[:10] - assert b'hello world' in res.data + assert 'PDF' not in snapshot_contents + # Was converted away from HTML + assert 'pdftohtml' not in snapshot_contents.lower() # Generator tag shouldnt be there + assert f'Original file size - {first_version_size}' in snapshot_contents + assert 'html' not in snapshot_contents.lower() # is converted from html + assert 'body' not in snapshot_contents.lower() # is converted from html + # And our text content was there + assert 'hello world' in snapshot_contents # So we know if the file changes in other ways import hashlib @@ -34,8 +39,7 @@ def test_fetch_pdf(client, live_server, measure_memory_usage): # We should have one assert len(original_md5) >0 # And it's going to be in the document - assert b'Document checksum - '+bytes(str(original_md5).encode('utf-8')) in res.data - + assert f'Document checksum - {original_md5}' in snapshot_contents shutil.copy("tests/test2.pdf", "test-datastore/endpoint-test.pdf") changed_md5 = hashlib.md5(open("test-datastore/endpoint-test.pdf", 'rb').read()).hexdigest().upper() @@ -58,7 +62,6 @@ def test_fetch_pdf(client, live_server, measure_memory_usage): assert original_md5.encode('utf-8') not in res.data assert changed_md5.encode('utf-8') in res.data - res = client.get( url_for("ui.ui_views.diff_history_page", uuid="first"), follow_redirects=True @@ -66,6 +69,16 @@ def test_fetch_pdf(client, live_server, measure_memory_usage): assert original_md5.encode('utf-8') in res.data assert changed_md5.encode('utf-8') in res.data - assert b'here is a change' in res.data + + + dates = list(watch.history.keys()) + # new snapshot was also OK, no HTML + snapshot_contents = watch.get_history_snapshot(dates[1]) + assert 'html' not in snapshot_contents.lower() + assert f'Original file size - {os.path.getsize("test-datastore/endpoint-test.pdf")}' in snapshot_contents + assert f'here is a change' in snapshot_contents + assert os.path.getsize("test-datastore/endpoint-test.pdf") != first_version_size # And the disk change worked + + \ No newline at end of file