Filters - Add support for also removing HTML elements using XPath selectors (#2632)

This commit is contained in:
Michael McMillan
2024-09-17 22:43:04 +02:00
committed by GitHub
parent 8c1527c1ad
commit dc936a2e8a
7 changed files with 50 additions and 17 deletions

View File

@@ -1,4 +1,5 @@
from typing import List
from lxml import etree
import json
import re
@@ -57,11 +58,26 @@ def subtractive_css_selector(css_selector, html_content):
item.decompose()
return str(soup)
def subtractive_xpath_selector(xpath_selector, html_content):
    """Remove every element matched by an XPath expression from HTML.

    Args:
        xpath_selector: XPath expression evaluated with lxml's ``xpath()``.
        html_content: HTML markup to filter.

    Returns:
        The document re-serialised by lxml (``method="html"``) with the
        matched elements removed. If lxml yields no tree for the input,
        ``html_content`` is returned unchanged.
    """
    html_tree = etree.HTML(html_content)
    if html_tree is None:
        # NOTE(review): lxml appears to return None for input without any
        # parseable markup (e.g. an empty string); nothing to remove then.
        return html_content

    matches = html_tree.xpath(xpath_selector)
    if not isinstance(matches, list):
        # Expressions such as count(//a) evaluate to a scalar, not a node set.
        matches = []

    for match in matches:
        # text() / @attr results are strings, not removable elements.
        if not etree.iselement(match):
            continue

        parent = match.getparent()
        if parent is None:
            # The root element has no parent and cannot be detached.
            continue

        # lxml stores the text that follows an element on the element itself
        # (``tail``), so a plain remove() would silently drop that text too.
        # Re-attach it to the preceding sibling or to the parent first.
        tail = match.tail
        if tail:
            previous = match.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or "") + tail
            else:
                parent.text = (parent.text or "") + tail

        parent.remove(match)

    return etree.tostring(html_tree, method="html").decode("utf-8")
def element_removal(selectors: List[str], html_content):
    """Remove elements that match a list of CSS or XPath selectors.

    Selectors are applied one at a time, in order, each to the output of the
    previous one. A selector starting with ``xpath:``, ``xpath1:`` or ``//``
    is handled by ``subtractive_xpath_selector`` (with the ``xpath:`` /
    ``xpath1:`` prefix stripped); anything else is handled by
    ``subtractive_css_selector``.

    Args:
        selectors: CSS and/or XPath selectors. Surrounding whitespace is
            ignored and blank entries are skipped.
        html_content: HTML markup to filter.

    Returns:
        The HTML after all selectors have been applied.
    """
    modified_html = html_content
    for selector in selectors:
        # Tolerate stray whitespace / empty lines coming from user input so a
        # leading space does not misroute an XPath expression to the CSS path.
        selector = selector.strip()
        if not selector:
            continue

        if selector.startswith(('xpath:', 'xpath1:', '//')):
            xpath_selector = selector.removeprefix('xpath:').removeprefix('xpath1:')
            modified_html = subtractive_xpath_selector(xpath_selector, modified_html)
        else:
            modified_html = subtractive_css_selector(selector, modified_html)
    return modified_html
def elementpath_tostring(obj):
"""