mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-10-31 14:47:21 +00:00 
			
		
		
		
	Compare commits
	
		
			1 Commits
		
	
	
		
			fixing-pos
			...
			1874-rss-t
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|   | d2c09cfc7d | 
| @@ -98,10 +98,7 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False | ||||
|         elif type(element) == etree._ElementUnicodeResult: | ||||
|             html_block += str(element) | ||||
|         else: | ||||
|             if not is_rss: | ||||
|                 html_block += etree.tostring(element, pretty_print=True).decode('utf-8') | ||||
|             else: | ||||
|                 html_block += f"<div>{element.text}</div>\n" | ||||
|             html_block += etree.tostring(element, pretty_print=True).decode('utf-8') | ||||
|  | ||||
|     return html_block | ||||
|  | ||||
| @@ -274,7 +271,7 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False | ||||
|     pattern = '<!\[CDATA\[(\s*(?:.(?<!\]\]>)\s*)*)\]\]>' | ||||
|     def repl(m): | ||||
|         text = m.group(1) | ||||
|         return xml_escape(html_to_text(html_content=text)) | ||||
|         return xml_escape(html_to_text(html_content=text)).strip() | ||||
|  | ||||
|     return re.sub(pattern, repl, html_content) | ||||
|  | ||||
| @@ -295,7 +292,8 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals | ||||
|     #  extracting this content | ||||
|     if render_anchor_tag_content: | ||||
|         parser_config = ParserConfig( | ||||
|             annotation_rules={"a": ["hyperlink"]}, display_links=True | ||||
|             annotation_rules={"a": ["hyperlink"]}, | ||||
|             display_links=True | ||||
|         ) | ||||
|     # otherwise set config to None/default | ||||
|     else: | ||||
| @@ -303,13 +301,12 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals | ||||
|  | ||||
|     # RSS Mode - Inscriptis will treat `title` as something else. | ||||
|     # Make it as a regular block display element (//item/title) | ||||
|     # This is a bit of a hack - the real way is to use XSLT to convert it to HTML #1874 | ||||
|     if is_rss: | ||||
|         css = CSS_PROFILES['strict'].copy() | ||||
|         css['title'] = HtmlElement(display=Display.block) | ||||
|         text_content = get_text(html_content, ParserConfig(css=css)) | ||||
|     else: | ||||
|         # get text and annotations via inscriptis | ||||
|         text_content = get_text(html_content, config=parser_config) | ||||
|         html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content) | ||||
|         html_content = re.sub(r'</title>', r'</h1>', html_content) | ||||
|  | ||||
|     text_content = get_text(html_content, config=parser_config) | ||||
|  | ||||
|     return text_content | ||||
|  | ||||
|   | ||||
| @@ -274,7 +274,7 @@ class perform_site_check(difference_detection_processor): | ||||
|                         html_tools.html_to_text( | ||||
|                             html_content=html_content, | ||||
|                             render_anchor_tag_content=do_anchor, | ||||
|                             is_rss=is_rss | ||||
|                             is_rss=is_rss # #1874 activate the <title workaround hack | ||||
|                         ) | ||||
|  | ||||
|         # Re #340 - return the content before the 'ignore text' was applied | ||||
|   | ||||
| @@ -118,7 +118,7 @@ def test_basic_cdata_rss_markup(client, live_server): | ||||
|     res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) | ||||
|  | ||||
| def test_rss_xpath_filtering(client, live_server): | ||||
| #    live_server_setup(live_server) | ||||
|     #live_server_setup(live_server) | ||||
|  | ||||
|     set_original_cdata_xml() | ||||
|  | ||||
| @@ -154,6 +154,9 @@ def test_rss_xpath_filtering(client, live_server): | ||||
|     ) | ||||
|     assert b'CDATA' not in res.data | ||||
|     assert b'<![' not in res.data | ||||
|     # #1874  All but the first <title was getting selected | ||||
|     # Convert any HTML with just a top level <title> to <h1> to be sure title renders | ||||
|  | ||||
|     assert b'Hackers can access your computer' in res.data # Should ONLY be selected by the xpath | ||||
|     assert b'Some other title' in res.data  # Should ONLY be selected by the xpath | ||||
|     assert b'The days of Terminator' not in res.data # Should NOT be selected by the xpath | ||||
|   | ||||
		Reference in New Issue
	
	Block a user