Files
Scraperr/api/backend/job/scraping/scraping_utils.py
Jayden Pyles 5ebd96b62b feat: add agent mode (#81)
* chore: wip agent mode

* wip: add agent mode frontend

* wip: add agent mode frontend

* chore: cleanup code

* chore: cleanup code

* chore: cleanup code
2025-05-19 20:44:41 -05:00

46 lines
1.2 KiB
Python

import asyncio
from typing import Optional, Set, Tuple

from playwright.async_api import Page

from api.backend.utils import LOG
from api.backend.job.scraping.collect_media import collect_media as collect_media_utils
async def scrape_content(
    id: str,
    page: Page,
    pages: Set[Tuple[str, str]],
    collect_media: bool,
    *,
    scroll_pause: float = 3.0,
    max_scrolls: Optional[int] = None,
) -> str:
    """Scroll ``page`` to the bottom until it stops growing, then capture its HTML.

    The page is repeatedly scrolled to ``document.body.scrollHeight``; after
    each scroll the coroutine waits ``scroll_pause`` seconds and re-measures
    the height. Scrolling stops as soon as the height is unchanged, or after
    ``max_scrolls`` scrolls when a limit is given.

    Args:
        id: Identifier forwarded unchanged to the media collector
            (presumably the job id -- confirm against the caller).
        page: Playwright page that has already been navigated.
        pages: Set mutated in place; receives one ``(html, url)`` tuple for
            this page.
        collect_media: When true, the imported ``collect_media`` helper is
            awaited with ``(id, page)`` after the HTML has been captured.
        scroll_pause: Seconds to wait after each scroll before re-measuring.
            Defaults to the previously hard-coded 3 seconds.
        max_scrolls: Upper bound on the number of scrolls. ``None`` (the
            default) keeps the original unbounded behaviour, which never
            returns on a page that keeps growing; ``0`` or a negative value
            skips scrolling entirely.

    Returns:
        The page's HTML as returned by ``page.content()`` after scrolling.
    """
    height_script = "document.body.scrollHeight"
    last_height = await page.evaluate(height_script)
    scrolls_done = 0

    while max_scrolls is None or scrolls_done < max_scrolls:
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        # Give content triggered by the scroll time to load before measuring.
        await asyncio.sleep(scroll_pause)
        scrolls_done += 1

        new_height = await page.evaluate(height_script)
        if new_height == last_height:
            # Height is stable: nothing more was appended by the last scroll.
            break
        last_height = new_height

    html = await page.content()
    pages.add((html, page.url))

    if collect_media:
        LOG.info("Collecting media")
        await collect_media_utils(id, page)

    return html
def clean_format_characters(text: str) -> str:
    """Return ``text`` trimmed, with format/control characters turned into spaces.

    Leading and trailing whitespace is stripped first. Every remaining
    newline, tab, carriage return, form feed, vertical tab, backspace and
    bell character is then replaced by a single space. Runs are not
    collapsed, so ``"a\\r\\nb"`` becomes ``"a  b"``. Backspace and bell are
    not whitespace, so at the edges they survive the strip and end up as
    spaces.
    """
    format_chars = "\n\t\r\f\v\b\a"
    # One translation table maps every format character to a plain space.
    to_space = str.maketrans(format_chars, " " * len(format_chars))
    return text.strip().translate(to_space)