mirror of
https://github.com/jaypyles/Scraperr.git
synced 2025-12-14 11:46:17 +00:00
* feat: add media viewer + other fixes * chore: remove logging [skip ci] * chore: remove logging [skip ci] * feat: add unit test for media * feat: add unit test for media * feat: add unit test for media [skip ci] * feat: add unit test for media [skip ci] * feat: add unit test for media [skip ci] * feat: add unit test for media [skip ci] * chore: update docs [skip ci]
33 lines
862 B
Python
33 lines
862 B
Python
import asyncio
from typing import Optional, Set, Tuple

from playwright.async_api import Page

from api.backend.job.scraping.collect_media import collect_media as collect_media_utils
from api.backend.utils import LOG
|
|
|
|
|
|
async def scrape_content(
    id: str,
    page: Page,
    pages: Set[Tuple[str, str]],
    collect_media: bool,
    *,
    scroll_pause: float = 3,
    max_scrolls: Optional[int] = None,
) -> str:
    """Scroll ``page`` to the bottom until it stops growing, then capture its HTML.

    The page is repeatedly scrolled to ``document.body.scrollHeight``; after
    each scroll the function waits ``scroll_pause`` seconds and re-reads the
    height. Scrolling stops as soon as the height is unchanged, or once
    ``max_scrolls`` scrolls have been performed.

    Args:
        id: Identifier forwarded unchanged to ``collect_media_utils`` when
            ``collect_media`` is true.
        page: Playwright page to scroll and read.
        pages: Set that receives the ``(html, page.url)`` pair for this page;
            it is mutated in place.
        collect_media: When true, ``collect_media_utils(id, page)`` is awaited
            after the HTML has been captured.
        scroll_pause: Seconds to wait after each scroll before re-measuring
            the page height. Defaults to the previously hard-coded 3 seconds.
        max_scrolls: Upper bound on the number of scrolls. ``None`` (default)
            keeps scrolling until the height stabilises, which never happens
            on a page that grows on every scroll; pass a positive integer to
            bound the loop. A value of 0 or less skips scrolling entirely.

    Returns:
        The HTML returned by ``page.content()`` after scrolling finished.
    """
    last_height = await page.evaluate("document.body.scrollHeight")
    scrolls_done = 0

    # Without a cap, a page that keeps growing on every scroll would keep
    # this loop running forever; max_scrolls lets the caller bound it.
    while max_scrolls is None or scrolls_done < max_scrolls:
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        # Pause before re-measuring so the height read below is taken
        # scroll_pause seconds after the scroll.
        await asyncio.sleep(scroll_pause)
        scrolls_done += 1

        new_height = await page.evaluate("document.body.scrollHeight")
        if new_height == last_height:
            # Height unchanged since the previous measurement: bottom reached.
            break
        last_height = new_height

    html = await page.content()
    pages.add((html, page.url))

    if collect_media:
        LOG.info("Collecting media")
        await collect_media_utils(id, page)

    return html
|