Files
Scraperr/api/backend/job/scraping/scraping_utils.py
Jayden Pyles 031572325f
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
Unit Tests / cypress-tests (push) Has been cancelled
Unit Tests / success-message (push) Has been cancelled
Fix/UI and backend fixes (#67)
* chore: wip

* chore: wip

* chore: wip

* fix: cypress test

* chore: cleanup code
2025-05-11 17:33:29 -05:00

33 lines
849 B
Python

import asyncio
from typing import Optional, Set, Tuple

from playwright.async_api import Page

from api.backend.job.scraping.collect_media import collect_media as collect_media_utils
from api.backend.utils import LOG
async def scrape_content(
    page: Page,
    pages: Set[Tuple[str, str]],
    collect_media: bool,
    *,
    scroll_pause: float = 3,
    max_scrolls: Optional[int] = None,
) -> str:
    """Scroll ``page`` to the bottom until it stops growing, then capture it.

    The page is repeatedly scrolled to ``document.body.scrollHeight`` and
    given ``scroll_pause`` seconds before the height is read again. Scrolling
    stops when two consecutive height readings are equal, or when
    ``max_scrolls`` scroll steps have been performed.

    Args:
        page: Page to scroll and read; it must already be navigated.
        pages: Set that receives the ``(html, url)`` pair for this page.
            It is mutated in place.
        collect_media: When true, ``collect_media_utils(page)`` is awaited
            after the HTML has been captured.
        scroll_pause: Seconds to wait after each scroll before re-reading the
            page height. Defaults to the previously hard-coded 3 seconds.
        max_scrolls: Upper bound on the number of scroll steps. ``None``
            (the default) keeps the original unbounded behaviour, which never
            returns on a page that grows on every scroll. Values ``<= 0``
            skip scrolling entirely.

    Returns:
        The HTML returned by ``page.content()`` after scrolling finished.
    """
    last_height = await page.evaluate("document.body.scrollHeight")
    scrolls_done = 0

    while max_scrolls is None or scrolls_done < max_scrolls:
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        # Pause so content triggered by the scroll can change the page height
        # before it is measured again.
        await asyncio.sleep(scroll_pause)
        scrolls_done += 1

        new_height = await page.evaluate("document.body.scrollHeight")
        if new_height == last_height:
            # Height is unchanged since the previous reading: stop scrolling.
            break
        last_height = new_height

    html = await page.content()
    pages.add((html, page.url))

    if collect_media:
        LOG.info("Collecting media")
        await collect_media_utils(page)

    return html