mirror of
https://github.com/jaypyles/Scraperr.git
synced 2025-11-19 23:56:23 +00:00
33 lines
849 B
Python
33 lines
849 B
Python
import asyncio
|
|
from typing import Set, Tuple
|
|
from playwright.async_api import Page
|
|
|
|
from api.backend.utils import LOG
|
|
|
|
from api.backend.job.scraping.collect_media import collect_media as collect_media_utils
|
|
|
|
|
|
async def scrape_content(
    page: Page,
    pages: Set[Tuple[str, str]],
    collect_media: bool,
    *,
    scroll_pause: float = 3,
    max_scrolls: int = 0,
) -> str:
    """Scroll *page* to the bottom until it stops growing, then capture its HTML.

    The page is repeatedly scrolled to ``document.body.scrollHeight`` and the
    height re-read after a pause; scrolling stops as soon as two consecutive
    readings are equal, or once ``max_scrolls`` scroll steps have been made.
    The resulting HTML is recorded in *pages* together with the page URL.

    Args:
        page: Page to scroll and read.
        pages: Set that receives the ``(html, url)`` pair; mutated in place.
        collect_media: When true, ``collect_media_utils(page)`` is awaited
            after the HTML has been captured.
        scroll_pause: Seconds to wait after each scroll before re-reading the
            page height. Defaults to 3.
        max_scrolls: Upper bound on the number of scroll steps. ``0`` (the
            default) or a negative value means no limit, so a page whose
            height keeps changing will be scrolled indefinitely.

    Returns:
        The HTML content of the page after scrolling.
    """
    last_height = await page.evaluate("document.body.scrollHeight")

    scrolls_done = 0
    # A non-positive max_scrolls disables the cap (original behaviour).
    while max_scrolls <= 0 or scrolls_done < max_scrolls:
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        # Pause before measuring again so the height reflects any change
        # triggered by the scroll.
        await asyncio.sleep(scroll_pause)
        scrolls_done += 1

        new_height = await page.evaluate("document.body.scrollHeight")
        if new_height == last_height:
            # Height unchanged since the previous reading: stop scrolling.
            break

        last_height = new_height

    html = await page.content()
    pages.add((html, page.url))

    if collect_media:
        LOG.info("Collecting media")
        await collect_media_utils(page)

    return html
|