diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 6328f95..117ec00 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -30,7 +30,7 @@ jobs:
run: pdm run playwright install
- name: Run tests
- run: PYTHONPATH=. pdm run pytest api/backend/tests
+ run: PYTHONPATH=. pdm run pytest -v -ra api/backend/tests
cypress-tests:
runs-on: ubuntu-latest
diff --git a/api/backend/ai/agent/actions.py b/api/backend/ai/agent/actions.py
new file mode 100644
index 0000000..7b81ef9
--- /dev/null
+++ b/api/backend/ai/agent/actions.py
@@ -0,0 +1,6 @@
+from typing_extensions import TypedDict
+
+
+class Action(TypedDict):
+    type: str
+    url: str
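For orientation, a minimal usage sketch of the new `Action` shape. The `"navigate"` type and URL below are hypothetical; the concrete action types are defined by the agent code that consumes this dict:

```python
from api.backend.ai.agent.actions import Action

# Hypothetical action instance; TypedDict gives static checkers the shape.
action: Action = {"type": "navigate", "url": "https://example.com/page/2"}
```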
diff --git a/api/backend/ai/agent/agent.py b/api/backend/ai/agent/agent.py
new file mode 100644
index 0000000..50a2863
--- /dev/null
+++ b/api/backend/ai/agent/agent.py
@@ -0,0 +1,94 @@
+import random
+from typing import Any
+
+from camoufox import AsyncCamoufox
+from playwright.async_api import Page
+
+from api.backend.ai.agent.prompts import (
+    ELEMENT_EXTRACTION_PROMPT,
+    EXTRACT_ELEMENTS_PROMPT,
+)
+from api.backend.ai.agent.utils import (
+    capture_elements,
+    convert_to_markdown,
+    parse_response,
+)
+from api.backend.ai.clients import ask_ollama, ask_open_ai, open_ai_key
+from api.backend.job.scraping.add_custom import add_custom_items
+from api.backend.job.scraping.collect_media import collect_media
+from api.backend.models import CapturedElement
+from api.backend.worker.logger import LOG
+
+# Prefer OpenAI when an API key is configured; otherwise fall back to Ollama.
+ask_ai = ask_open_ai if open_ai_key else ask_ollama
+
+
+async def scrape_with_agent(agent_job: dict[str, Any]):
+    LOG.info(f"Starting work for agent job: {agent_job}")
+
+    pages: set[tuple[str, str]] = set()
+    captured_elements: list[CapturedElement] = []
+
+    if agent_job["job_options"]["proxies"]:
+        # The selected proxy is logged for visibility; it is not applied to
+        # the browser launched below.
+        proxy = random.choice(agent_job["job_options"]["proxies"])
+        LOG.info(f"Using proxy: {proxy}")
+
+    async with AsyncCamoufox(headless=True) as browser:
+        page: Page = await browser.new_page()
+
+        await add_custom_items(
+            agent_job["url"],
+            page,
+            agent_job["job_options"]["custom_cookies"],
+            agent_job["job_options"]["custom_headers"],
+        )
+
+        try:
+            await page.set_viewport_size({"width": 1920, "height": 1080})
+            await page.goto(agent_job["url"], timeout=60000)
+
+            if agent_job["job_options"]["collect_media"]:
+                await collect_media(agent_job["id"], page)
+
+            html_content = await page.content()
+            markdown_content = convert_to_markdown(html_content)
+
+            # Ask the model for XPaths that match the user's prompt.
+            response = await ask_ai(
+                ELEMENT_EXTRACTION_PROMPT.format(
+                    extraction_prompt=EXTRACT_ELEMENTS_PROMPT,
+                    webpage=markdown_content,
+                    prompt=agent_job["prompt"],
+                )
+            )
+
+            xpaths = parse_response(response)
+            captured_elements = await capture_elements(page, xpaths)
+
+            final_url = page.url
+            pages.add((html_content, final_url))
+        finally:
+            await page.close()
+            await browser.close()
+
+    # Group captured elements by name, then key the groups by final URL.
+    name_to_elements: dict[str, list[CapturedElement]] = {}
+    for element in captured_elements:
+        name_to_elements.setdefault(element.name, []).append(element)
+
+    scraped_elements: list[dict[str, dict[str, list[CapturedElement]]]] = [
+        {url: name_to_elements} for _, url in pages
+    ]
+
+    return scraped_elements
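For context, a minimal invocation sketch of the new entry point. The field names mirror the keys `scrape_with_agent` reads above; the concrete `job_options` schema and the shape of the returned data are assumptions based on this file alone:

```python
import asyncio

from api.backend.ai.agent.agent import scrape_with_agent

# Hypothetical job payload; keys mirror those accessed in scrape_with_agent.
agent_job = {
    "id": "job-123",
    "url": "https://example.com/products",
    "prompt": "Extract all product names and prices",
    "job_options": {
        "proxies": [],
        "custom_cookies": [],
        "custom_headers": {},
        "collect_media": False,
    },
}

scraped = asyncio.run(scrape_with_agent(agent_job))
# scraped is a list of {final_url: {element_name: [CapturedElement, ...]}}
```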
diff --git a/api/backend/ai/agent/prompts.py b/api/backend/ai/agent/prompts.py
new file mode 100644
index 0000000..d158e88
--- /dev/null
+++ b/api/backend/ai/agent/prompts.py
@@ -0,0 +1,58 @@
+EXTRACT_ELEMENTS_PROMPT = """
+You are an assistant that extracts XPath expressions from webpages.
+
+You will receive HTML content in markdown format. Each element in the
+markdown has its XPath shown directly above it.
+
+Respond only with a list of general XPath expressions inside `...` tags.
+
+You will also decide what to do next. If there is no decision to make, return nothing for that section.
+"""
+
+ELEMENT_EXTRACTION_PROMPT = """
+{extraction_prompt}
+
+**Guidelines:**
+- Prefer shorter, more general XPaths like `//div[...]` or `//span[...]`.
+- Avoid overly specific or deep paths like `//div[3]/ul/li[2]/a`.
+- Do **not** chain multiple elements deeply (e.g., `//div/span/a`).
+- Use XPaths further down the tree when possible.
+- Do not include any extra explanation or text.
+- One XPath is acceptable if that's all that's needed.
+- Try to limit the response to 1-3 XPaths.
+- Include a name for each XPath.
+
+- USE THE SIMPLEST XPATHS POSSIBLE.
+- USE THE MOST GENERAL XPATHS POSSIBLE.
+
+**Example Format:**
+```xml
+- <name>: <xpath>
+- <name>: <xpath>
+- <name>: <xpath>
+- etc.
+
+- //a[@href='next_page_url']
+```
+
+**Input webpage:**
+{webpage}
+
+**Target content:**
+{prompt}
+"""
diff --git a/api/backend/ai/agent/utils.py b/api/backend/ai/agent/utils.py
new file mode 100644
index 0000000..048ed5b
--- /dev/null
+++ b/api/backend/ai/agent/utils.py
@@ -0,0 +1,252 @@
+import re
+
+from lxml import etree, html
+from playwright.async_api import Page
+
+from api.backend.job.scraping.scraping_utils import clean_format_characters
+from api.backend.models import CapturedElement
+
+def convert_to_markdown(html_str: str):
+    parser = html.HTMLParser()
+    tree = html.fromstring(html_str, parser=parser)
+    root = tree.getroottree()
+
+    def format_attributes(el: etree._Element) -> str:
+        """Convert element attributes into a string."""
+        return " ".join(f'{k}="{v}"' for k, v in el.attrib.items())
+
+    def is_visible(el: etree._Element) -> bool:
+        style = el.attrib.get("style", "").lower()
+        class_ = el.attrib.get("class", "").lower()
+
+        # Check for visibility styles
+        if "display: none" in style or "visibility: hidden" in style:
+            return False
+        if "opacity: 0" in style or "opacity:0" in style:
+            return False
+        if "height: 0" in style or "width: 0" in style:
+            return False
+
+        # Check for common hidden classes
+        if any(
+            hidden in class_
+            for hidden in ["hidden", "invisible", "truncate", "collapse"]
+        ):
+            return False
+
+        # Check for hidden attributes
+        if el.attrib.get("hidden") is not None:
+            return False
+        if el.attrib.get("aria-hidden") == "true":
+            return False
+
+        # Treat elements with no text and no children as invisible
+        if not el.text and len(el) == 0:
+            return False
+
+        return True
+
+    def is_layout_or_decorative(el: etree._Element) -> bool:
+        tag = el.tag.lower()
+
+        # Layout elements
+        if tag in {"nav", "footer", "header", "aside", "main", "section"}:
+            return True
+
+        # Decorative elements
+        if tag in {"svg", "path", "circle", "rect", "line", "polygon", "polyline"}:
+            return True
+
+        # Check id and class for layout/decorative keywords
+        id_class = " ".join(
+            [el.attrib.get("id", ""), el.attrib.get("class", "")]
+        ).lower()
+
+        layout_keywords = {
+            "sidebar",
+            "nav",
+            "header",
+            "footer",
+            "menu",
+            "advert",
+            "ads",
+            "breadcrumb",
+            "container",
+            "wrapper",
+            "layout",
+            "grid",
+            "flex",
+            "row",
+            "column",
+            "section",
+            "banner",
+            "hero",
+            "card",
+            "modal",
+            "popup",
+            "tooltip",
+            "dropdown",
+            "overlay",
+        }
+
+        return any(keyword in id_class for keyword in layout_keywords)
+
+    # Tags to include in the final markdown output
+    included_tags = {
+        "div",
+        "span",
+        "a",
+        "p",
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "h5",
+        "h6",
+        "img",
+        "button",
+        "input",
+        "textarea",
+        "ul",
+        "ol",
+        "li",
+        "table",
+        "tr",
+        "td",
+        "th",
+        "select",
+        "option",
+        "optgroup",
+        "fieldset",
+        "legend",
+    }
+
+    special_elements = []
+    normal_elements = []
+
+    for el in tree.iter():
+        if el.tag is etree.Comment:
+            continue
+
+        tag = el.tag.lower()
+
+        if tag not in included_tags:
+            continue
+
+        if not is_visible(el):
+            continue
+
+        if is_layout_or_decorative(el):
+            continue
+
+        path = root.getpath(el)
+        attrs = format_attributes(el)
+        attrs_str = f" {attrs}" if attrs else ""
+        text = el.text.strip() if el.text else ""
+
+        if not text and not attrs:
+            continue
+
+        # button elements
+        if tag == "button":
+            prefix = "🔘 **