mirror of
https://github.com/jaypyles/Scraperr.git
synced 2025-12-13 19:25:58 +00:00
* chore: wip agent mode * wip: add agent mode frontend * wip: add agent mode frontend * chore: cleanup code * chore: cleanup code * chore: cleanup code
46 lines
1.2 KiB
Python
46 lines
1.2 KiB
Python
import asyncio
from typing import Optional, Set, Tuple

from playwright.async_api import Page

from api.backend.job.scraping.collect_media import collect_media as collect_media_utils
from api.backend.utils import LOG
|
|
|
|
|
|
async def scrape_content(
    id: str,
    page: Page,
    pages: Set[Tuple[str, str]],
    collect_media: bool,
    *,
    scroll_pause: float = 3,
    max_scrolls: Optional[int] = None,
) -> str:
    """Scroll *page* to the bottom until it stops growing, then capture its HTML.

    The page is scrolled to ``document.body.scrollHeight`` repeatedly, pausing
    after each scroll, until the reported height is unchanged between two
    consecutive measurements (or ``max_scrolls`` is reached).

    Args:
        id: Identifier forwarded unchanged to the media collector
            (presumably the job id -- confirm against callers).
        page: Playwright page to scroll and read.
        pages: Set that receives an ``(html, url)`` tuple for this page;
            mutated in place.
        collect_media: When true, the media collector is awaited for this
            page after the HTML has been captured.
        scroll_pause: Seconds to wait after each scroll before measuring the
            height again. Defaults to the previously hard-coded 3 seconds.
        max_scrolls: Upper bound on scroll iterations. ``None`` (the default)
            keeps the original unbounded behaviour, which never terminates on
            a page whose height keeps growing.

    Returns:
        The page HTML as returned by ``page.content()`` after scrolling.
    """
    last_height = await page.evaluate("document.body.scrollHeight")
    scrolls_done = 0

    while max_scrolls is None or scrolls_done < max_scrolls:
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        # Pause before re-measuring so the height check below sees any growth
        # triggered by the scroll.
        await asyncio.sleep(scroll_pause)
        scrolls_done += 1

        new_height = await page.evaluate("document.body.scrollHeight")
        if new_height == last_height:
            # Height did not change: nothing more was appended.
            break
        last_height = new_height
    else:
        # Only reached when the loop ended because the cap was hit, not
        # because the page height stabilised.
        LOG.info(f"Stopped scrolling after {scrolls_done} scrolls (max_scrolls reached)")

    html = await page.content()
    pages.add((html, page.url))

    if collect_media:
        LOG.info("Collecting media")
        await collect_media_utils(id, page)

    return html
|
|
|
|
|
|
# Maps newline, tab, carriage return, form feed, vertical tab, backspace and
# bell to a single space each.
_FORMAT_CHARS_TO_SPACE = str.maketrans({ch: " " for ch in "\n\t\r\f\v\b\a"})


def clean_format_characters(text: str) -> str:
    """Return *text* stripped, with format/control characters turned into spaces.

    Leading and trailing whitespace is removed first; afterwards every
    remaining newline, tab, carriage return, form feed, vertical tab,
    backspace and bell character is replaced by one space. Runs of such
    characters are not collapsed, and a backspace or bell at either end
    (which ``str.strip`` does not treat as whitespace) becomes a space.
    """
    return text.strip().translate(_FORMAT_CHARS_TO_SPACE)
|