Mirror of https://github.com/jaypyles/Scraperr.git, synced 2025-12-12 02:35:43 +00:00
feat: add agent mode (#81)
* chore: wip agent mode
* wip: add agent mode frontend
* wip: add agent mode frontend
* chore: cleanup code
* chore: cleanup code
* chore: cleanup code
api/backend/ai/agent/agent.py  (new file, 94 lines added)
@@ -0,0 +1,94 @@
import random
from typing import Any

from camoufox import AsyncCamoufox
from playwright.async_api import Page

from api.backend.ai.agent.utils import (
    capture_elements,
    convert_to_markdown,
    parse_response,
)

from api.backend.ai.clients import ask_open_ai, ask_ollama, open_ai_key

from api.backend.ai.agent.prompts import (
    ELEMENT_EXTRACTION_PROMPT,
    EXTRACT_ELEMENTS_PROMPT,
)

from api.backend.job.scraping.collect_media import collect_media
from api.backend.worker.logger import LOG

from api.backend.job.scraping.add_custom import add_custom_items

from api.backend.models import CapturedElement


ask_ai = ask_open_ai if open_ai_key else ask_ollama


async def scrape_with_agent(agent_job: dict[str, Any]):
    LOG.info(f"Starting work for agent job: {agent_job}")
    pages = set()

    if agent_job["job_options"]["proxies"]:
        proxy = random.choice(agent_job["job_options"]["proxies"])
        LOG.info(f"Using proxy: {proxy}")

    async with AsyncCamoufox(headless=True) as browser:
        page: Page = await browser.new_page()

        await add_custom_items(
            agent_job["url"],
            page,
            agent_job["job_options"]["custom_cookies"],
            agent_job["job_options"]["custom_headers"],
        )

        try:
            await page.set_viewport_size({"width": 1920, "height": 1080})
            await page.goto(agent_job["url"], timeout=60000)

            if agent_job["job_options"]["collect_media"]:
                await collect_media(agent_job["id"], page)

            html_content = await page.content()
            markdown_content = convert_to_markdown(html_content)

            response = await ask_ai(
                ELEMENT_EXTRACTION_PROMPT.format(
                    extraction_prompt=EXTRACT_ELEMENTS_PROMPT,
                    webpage=markdown_content,
                    prompt=agent_job["prompt"],
                )
            )

            xpaths = parse_response(response)

            captured_elements = await capture_elements(page, xpaths)

            final_url = page.url

            pages.add((html_content, final_url))
        finally:
            await page.close()
            await browser.close()

    name_to_elements = {}

    for page in pages:
        for element in captured_elements:
            if element.name not in name_to_elements:
                name_to_elements[element.name] = []

            name_to_elements[element.name].append(element)

    scraped_elements: list[dict[str, dict[str, list[CapturedElement]]]] = [
        {
            page[1]: name_to_elements,
        }
        for page in pages
    ]

    return scraped_elements
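For context, here is a minimal invocation sketch (not part of this commit). The agent_job shape below is inferred from the keys scrape_with_agent reads; the real payload is assembled elsewhere in the worker, and the example id, URL, prompt, and the value types for custom_cookies and custom_headers are assumptions. In this version the randomly chosen proxy is only logged, and the function returns a list mapping each page's final URL to the name-to-elements dictionary.

import asyncio

from api.backend.ai.agent.agent import scrape_with_agent

# Hypothetical job payload; keys mirror what scrape_with_agent accesses above.
example_job = {
    "id": "job-123",  # assumed id format
    "url": "https://example.com/products",
    "prompt": "Extract every product name and price on the page",
    "job_options": {
        "proxies": [],           # a proxy is picked at random (and logged) if any are given
        "custom_cookies": [],    # passed through to add_custom_items
        "custom_headers": {},    # passed through to add_custom_items
        "collect_media": False,  # when True, collect_media(job id, page) runs before extraction
    },
}

# Returns a list of {final_url: {element_name: [CapturedElement, ...]}} entries.
results = asyncio.run(scrape_with_agent(example_job))
print(results)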