from lxml import html, etree import re from playwright.async_api import Page from api.backend.models import CapturedElement from api.backend.job.scraping.scraping_utils import clean_format_characters def convert_to_markdown(html_str: str): parser = html.HTMLParser() tree = html.fromstring(html_str, parser=parser) root = tree.getroottree() def format_attributes(el: etree._Element) -> str: """Convert element attributes into a string.""" return " ".join(f'{k}="{v}"' for k, v in el.attrib.items()) def is_visible(el: etree._Element) -> bool: style = el.attrib.get("style", "").lower() class_ = el.attrib.get("class", "").lower() # Check for visibility styles if "display: none" in style or "visibility: hidden" in style: return False if "opacity: 0" in style or "opacity:0" in style: return False if "height: 0" in style or "width: 0" in style: return False # Check for common hidden classes if any( hidden in class_ for hidden in ["hidden", "invisible", "truncate", "collapse"] ): return False # Check for hidden attributes if el.attrib.get("hidden") is not None: return False if el.attrib.get("aria-hidden") == "true": return False # Check for empty or whitespace-only content if not el.text and len(el) == 0: return False return True def is_layout_or_decorative(el: etree._Element) -> bool: tag = el.tag.lower() # Layout elements if tag in {"nav", "footer", "header", "aside", "main", "section"}: return True # Decorative elements if tag in {"svg", "path", "circle", "rect", "line", "polygon", "polyline"}: return True # Check id and class for layout/decorative keywords id_class = " ".join( [el.attrib.get("id", ""), el.attrib.get("class", "")] ).lower() layout_keywords = { "sidebar", "nav", "header", "footer", "menu", "advert", "ads", "breadcrumb", "container", "wrapper", "layout", "grid", "flex", "row", "column", "section", "banner", "hero", "card", "modal", "popup", "tooltip", "dropdown", "overlay", } return any(keyword in id_class for keyword in layout_keywords) # Tags to ignore in the final markdown output included_tags = { "div", "span", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "img", "button", "input", "textarea", "ul", "ol", "li", "table", "tr", "td", "th", "input", "textarea", "select", "option", "optgroup", "fieldset", "legend", } special_elements = [] normal_elements = [] for el in tree.iter(): if el.tag is etree.Comment: continue tag = el.tag.lower() if tag not in included_tags: continue if not is_visible(el): continue if is_layout_or_decorative(el): continue path = root.getpath(el) attrs = format_attributes(el) attrs_str = f" {attrs}" if attrs else "" text = el.text.strip() if el.text else "" if not text and not attrs: continue # input elements if tag == "button": prefix = "🔘 **