mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-11-03 16:17:51 +00:00 
			
		
		
		
	Compare commits
	
		
			1 Commits
		
	
	
		
			notificato
			...
			1874-rss-t
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					d2c09cfc7d | 
@@ -98,10 +98,7 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False
 | 
			
		||||
        elif type(element) == etree._ElementUnicodeResult:
 | 
			
		||||
            html_block += str(element)
 | 
			
		||||
        else:
 | 
			
		||||
            if not is_rss:
 | 
			
		||||
                html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
 | 
			
		||||
            else:
 | 
			
		||||
                html_block += f"<div>{element.text}</div>\n"
 | 
			
		||||
            html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
 | 
			
		||||
 | 
			
		||||
    return html_block
 | 
			
		||||
 | 
			
		||||
@@ -274,7 +271,7 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
 | 
			
		||||
    pattern = '<!\[CDATA\[(\s*(?:.(?<!\]\]>)\s*)*)\]\]>'
 | 
			
		||||
    def repl(m):
 | 
			
		||||
        text = m.group(1)
 | 
			
		||||
        return xml_escape(html_to_text(html_content=text))
 | 
			
		||||
        return xml_escape(html_to_text(html_content=text)).strip()
 | 
			
		||||
 | 
			
		||||
    return re.sub(pattern, repl, html_content)
 | 
			
		||||
 | 
			
		||||
@@ -295,7 +292,8 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
 | 
			
		||||
    #  extracting this content
 | 
			
		||||
    if render_anchor_tag_content:
 | 
			
		||||
        parser_config = ParserConfig(
 | 
			
		||||
            annotation_rules={"a": ["hyperlink"]}, display_links=True
 | 
			
		||||
            annotation_rules={"a": ["hyperlink"]},
 | 
			
		||||
            display_links=True
 | 
			
		||||
        )
 | 
			
		||||
    # otherwise set config to None/default
 | 
			
		||||
    else:
 | 
			
		||||
@@ -303,13 +301,12 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
 | 
			
		||||
 | 
			
		||||
    # RSS Mode - Inscriptis will treat `title` as something else.
 | 
			
		||||
    # Make it as a regular block display element (//item/title)
 | 
			
		||||
    # This is a bit of a hack - the real way is to use XSLT to convert it to HTML #1874
 | 
			
		||||
    if is_rss:
 | 
			
		||||
        css = CSS_PROFILES['strict'].copy()
 | 
			
		||||
        css['title'] = HtmlElement(display=Display.block)
 | 
			
		||||
        text_content = get_text(html_content, ParserConfig(css=css))
 | 
			
		||||
    else:
 | 
			
		||||
        # get text and annotations via inscriptis
 | 
			
		||||
        text_content = get_text(html_content, config=parser_config)
 | 
			
		||||
        html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
 | 
			
		||||
        html_content = re.sub(r'</title>', r'</h1>', html_content)
 | 
			
		||||
 | 
			
		||||
    text_content = get_text(html_content, config=parser_config)
 | 
			
		||||
 | 
			
		||||
    return text_content
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -274,7 +274,7 @@ class perform_site_check(difference_detection_processor):
 | 
			
		||||
                        html_tools.html_to_text(
 | 
			
		||||
                            html_content=html_content,
 | 
			
		||||
                            render_anchor_tag_content=do_anchor,
 | 
			
		||||
                            is_rss=is_rss
 | 
			
		||||
                            is_rss=is_rss # #1874 activate the <title workaround hack
 | 
			
		||||
                        )
 | 
			
		||||
 | 
			
		||||
        # Re #340 - return the content before the 'ignore text' was applied
 | 
			
		||||
 
 | 
			
		||||
@@ -118,7 +118,7 @@ def test_basic_cdata_rss_markup(client, live_server):
 | 
			
		||||
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
 | 
			
		||||
 | 
			
		||||
def test_rss_xpath_filtering(client, live_server):
 | 
			
		||||
#    live_server_setup(live_server)
 | 
			
		||||
    #live_server_setup(live_server)
 | 
			
		||||
 | 
			
		||||
    set_original_cdata_xml()
 | 
			
		||||
 | 
			
		||||
@@ -154,6 +154,9 @@ def test_rss_xpath_filtering(client, live_server):
 | 
			
		||||
    )
 | 
			
		||||
    assert b'CDATA' not in res.data
 | 
			
		||||
    assert b'<![' not in res.data
 | 
			
		||||
    # #1874  All but the first <title was getting selected
 | 
			
		||||
    # Convert any HTML with just a top level <title> to <h1> to be sure title renders
 | 
			
		||||
 | 
			
		||||
    assert b'Hackers can access your computer' in res.data # Should ONLY be selected by the xpath
 | 
			
		||||
    assert b'Some other title' in res.data  # Should ONLY be selected by the xpath
 | 
			
		||||
    assert b'The days of Terminator' not in res.data # Should NOT be selected by the xpath
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user