From fe5bc8859c94b9f9bda0f1e1c8e9afe095a46c0e Mon Sep 17 00:00:00 2001
From: Jayden Pyles
Date: Wed, 6 Nov 2024 19:06:50 -0600
Subject: [PATCH] feat: add page scrolling to sites

---
 api/backend/scraping.py | 19 +++++++++++++++++--
 docker-compose.dev.yml  |  2 +-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/api/backend/scraping.py b/api/backend/scraping.py
index 1238c82..485f0d1 100644
--- a/api/backend/scraping.py
+++ b/api/backend/scraping.py
@@ -39,6 +39,7 @@ def clean_xpath(xpath: str) -> str:
         clean_parts.append(part)
     clean_xpath = "//".join(clean_parts).replace("////", "//")
     clean_xpath = clean_xpath.replace("'", "\\'")
+    LOG.info(f"Cleaned xpath: {clean_xpath}")
     return clean_xpath
 
 
@@ -100,8 +101,22 @@ async def make_site_request(
             _ = WebDriverWait(driver, 10).until(
                 EC.presence_of_element_located((By.TAG_NAME, "body"))
             )
-            time.sleep(5)
+
+            last_height = driver.execute_script("return document.body.scrollHeight")
+            while True:
+                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+
+                time.sleep(2)  # Wait for the page to load
+                new_height = driver.execute_script("return document.body.scrollHeight")
+
+                if new_height == last_height:
+                    break
+
+                last_height = new_height
+
+            driver.execute_script("return document.body.scrollHeight")
             page_source = driver.page_source
+            LOG.debug(f"Page source for url: {url}\n{page_source}")
 
             pages.add((page_source, final_url))
         finally:
@@ -138,7 +153,7 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
     elements: dict[str, list[CapturedElement]] = dict()
 
     for elem in xpaths:
-        el = sxpath(root, clean_xpath(elem.xpath))
+        el = sxpath(root, elem.xpath)
 
         for e in el:
             text = "\t".join(str(t) for t in e.itertext())
diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml
index 0c47476..dcb3b3b 100644
--- a/docker-compose.dev.yml
+++ b/docker-compose.dev.yml
@@ -19,4 +19,4 @@ services:
     ports:
      - "8000:8000"
     volumes:
-      - "$PWD/api:/project/api"
+      - "$PWD/api:/project/app/api"