wip: add in job rerunning

2025-12-14 03:36:01 +00:00 · 2024-07-06 16:56:56 -05:00
parent 70bdd01d9d
commit 8808b493e6
13 changed files with 607 additions and 70 deletions
--- a/api/backend/scraping.py
+++ b/api/backend/scraping.py
@@ -1,3 +1,6 @@
+# STL
+import logging
+
 # PDM
 from bs4 import BeautifulSoup
 from lxml import etree
@@ -14,10 +17,26 @@ from selenium.webdriver.chrome.service import Service
 # LOCAL
 from api.backend.models import Element, CapturedElement

+LOG = logging.getLogger(__name__)
+

 # Empty subclass of `_Element` (adds no members of its own); `sxpath` uses it
 # as the element type in its return annotation.
 class HtmlElement(_Element): ...


+def clean_xpath(xpath: str) -> str:
+    """Loosen an XPath so every step separator is the descendant axis `//`.
+
+    Each run of one or more `/` characters outside a quoted string literal is
+    rewritten to exactly `//`, so `/html/body/div` and `//html/body//div`
+    both become `//html//body//div`. This lets a recorded path keep matching
+    when intermediate wrapper elements differ from the recorded structure.
+
+    Text inside single- or double-quoted literals (e.g. `@href='/a/b'`) is
+    copied through untouched. Quotes are deliberately NOT backslash-escaped:
+    XPath 1.0 has no escape syntax, so `\\'` would make the expression
+    invalid for the XPath engine.
+
+    Args:
+        xpath: The XPath expression to normalise.
+
+    Returns:
+        The normalised expression. An expression containing no `/` outside
+        of literals (including the empty string) is returned unchanged.
+    """
+    pieces: list[str] = []
+    open_quote = ""  # quote character of the literal being scanned, "" if none
+
+    for char in xpath:
+        if open_quote:
+            # Inside a literal: copy verbatim until the matching quote closes it.
+            pieces.append(char)
+            if char == open_quote:
+                open_quote = ""
+        elif char in ('"', "'"):
+            open_quote = char
+            pieces.append(char)
+        elif char == "/":
+            # Collapse a run of slashes into a single `//` separator.
+            if not pieces or pieces[-1] != "//":
+                pieces.append("//")
+        else:
+            pieces.append(char)
+
+    return "".join(pieces)
+
+
 def sxpath(context: _Element, xpath: str) -> list[HtmlElement]:
     """Evaluate `xpath` against `context` and return the result.

     Thin typed wrapper around `context.xpath`: the expression is passed
     through unchanged and the result is returned as-is. The
     `list[HtmlElement]` annotation is not checked at runtime; the
     type-checker suppression on the return line covers the mismatch with
     the declared return type of `context.xpath`.
     """
     return context.xpath(xpath)  # type: ignore [reportReturnType]

@@ -44,7 +63,7 @@ async def make_site_request(url: str) -> str:
    finally:
        driver.quit()

-    print(page_source)
+    LOG.debug(f"Page source for url: {url}\n{page_source}")
    return page_source


@@ -55,7 +74,7 @@ async def collect_scraped_elements(page: str, xpaths: list[Element]):
    elements: dict[str, list[CapturedElement]] = dict()

    for elem in xpaths:
-        el = sxpath(root, elem.xpath)
+        el = sxpath(root, clean_xpath(elem.xpath))
        text = ["".join(str(e) for e in e.itertext()) for e in el]
        captured_element = CapturedElement(
            xpath=elem.xpath, text=",".join(text), name=elem.name