From 5ebd96b62be25ada8035d3e0c768af699689c274 Mon Sep 17 00:00:00 2001
From: Jayden Pyles <111098627+jaypyles@users.noreply.github.com>
Date: Mon, 19 May 2025 20:44:41 -0500
Subject: [PATCH] feat: add agent mode (#81)

* chore: wip agent mode

* wip: add agent mode frontend

* wip: add agent mode frontend

* chore: cleanup code

* chore: cleanup code

* chore: cleanup code
---
 .github/workflows/unit-tests.yml              |   2 +-
 api/backend/ai/agent/actions.py               |   6 +
 api/backend/ai/agent/agent.py                 |  94 +++++++
 api/backend/ai/agent/prompts.py               |  58 ++++
 api/backend/ai/agent/utils.py                 | 252 ++++++++++++++++++
 api/backend/ai/ai_router.py                   |  23 +-
 api/backend/ai/clients.py                     |  38 +++
 api/backend/database/queries/queries.py       |   4 +-
 api/backend/database/schema/schema.py         |   3 +
 api/backend/database/startup.py               |  16 +-
 api/backend/job/job.py                        |   2 +
 api/backend/job/scraping/scraping_utils.py    |  13 +
 api/backend/job/site_mapping/site_mapping.py  |   2 -
 api/backend/models.py                         |   2 +
 api/backend/scraping.py                       |  14 +-
 api/backend/worker/job_worker.py              |  28 +-
 pdm.lock                                      |  13 +-
 pyproject.toml                                |   1 +
 .../advanced-job-options.tsx                  |   3 +
 .../dialog/advanced-job-options-dialog.tsx    |  11 +-
 src/components/common/disabled/disabled.tsx   |  29 ++
 src/components/common/disabled/index.ts       |   1 +
 .../common/nav-drawer/nav-items/nav-items.tsx |   8 +-
 src/components/jobs/JobQueue.tsx              |  26 +-
 src/components/pages/agent/agent.tsx          | 228 ++++++++++++++++
 src/components/pages/agent/index.ts           |   1 +
 .../submit/job-submitter/job-submitter.tsx    |   2 -
 src/lib/helpers/parse-job-options.ts          |   4 +-
 .../hooks/use-advanced-job-options/index.ts   |   1 +
 .../use-advanced-job-options.ts               |  29 ++
 src/pages/agent.tsx                           |   1 +
 .../api-service/functions/submit-job.ts      |   6 +-
 src/types/job.ts                              |   2 +
 33 files changed, 869 insertions(+), 54 deletions(-)
 create mode 100644 api/backend/ai/agent/actions.py
 create mode 100644 api/backend/ai/agent/agent.py
 create mode 100644 api/backend/ai/agent/prompts.py
 create mode 100644 api/backend/ai/agent/utils.py
 create mode 100644 api/backend/ai/clients.py
 create mode 100644 src/components/common/disabled/disabled.tsx
 create mode 100644 src/components/common/disabled/index.ts
 create mode 100644 src/components/pages/agent/agent.tsx
 create mode 100644 src/components/pages/agent/index.ts
 create mode 100644 src/lib/hooks/use-advanced-job-options/index.ts
 create mode 100644 src/lib/hooks/use-advanced-job-options/use-advanced-job-options.ts
 create mode 100644 src/pages/agent.tsx

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 6328f95..117ec00 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -30,7 +30,7 @@ jobs:
         run: pdm run playwright install
 
       - name: Run tests
-        run: PYTHONPATH=. pdm run pytest api/backend/tests
+        run: PYTHONPATH=. pdm run pytest -v -ra api/backend/tests
 
   cypress-tests:
     runs-on: ubuntu-latest
diff --git a/api/backend/ai/agent/actions.py b/api/backend/ai/agent/actions.py
new file mode 100644
index 0000000..7b81ef9
--- /dev/null
+++ b/api/backend/ai/agent/actions.py
@@ -0,0 +1,6 @@
+from typing_extensions import TypedDict
+
+
+class Action(TypedDict):
+    type: str
+    url: str
diff --git a/api/backend/ai/agent/agent.py b/api/backend/ai/agent/agent.py
new file mode 100644
index 0000000..50a2863
--- /dev/null
+++ b/api/backend/ai/agent/agent.py
@@ -0,0 +1,94 @@
+import random
+from typing import Any
+
+from camoufox import AsyncCamoufox
+from playwright.async_api import Page
+
+from api.backend.ai.agent.utils import (
+    capture_elements,
+    convert_to_markdown,
+    parse_response,
+)
+
+from api.backend.ai.clients import ask_open_ai, ask_ollama, open_ai_key
+
+from api.backend.ai.agent.prompts import (
+    ELEMENT_EXTRACTION_PROMPT,
+    EXTRACT_ELEMENTS_PROMPT,
+)
+
+from api.backend.job.scraping.collect_media import collect_media
+from api.backend.worker.logger import LOG
+
+from api.backend.job.scraping.add_custom import add_custom_items
+
+from api.backend.models import CapturedElement
+
+
+# Fall back to Ollama when no OpenAI key is configured.
+ask_ai = ask_open_ai if open_ai_key else ask_ollama
+
+
+async def scrape_with_agent(agent_job: dict[str, Any]):
+    LOG.info(f"Starting work for agent job: {agent_job}")
+    pages = set()
+
+    proxy = None
+    if agent_job["job_options"]["proxies"]:
+        proxy = random.choice(agent_job["job_options"]["proxies"])
+        LOG.info(f"Using proxy: {proxy}")
+
+    # Pass the selected proxy through to the browser instead of dropping it.
+    async with AsyncCamoufox(
+        headless=True, proxy={"server": proxy} if proxy else None
+    ) as browser:
+        page: Page = await browser.new_page()
+
+        await add_custom_items(
+            agent_job["url"],
+            page,
+            agent_job["job_options"]["custom_cookies"],
+            agent_job["job_options"]["custom_headers"],
+        )
+
+        # Keep this defined even if navigation or extraction raises.
+        captured_elements: list[CapturedElement] = []
+
+        try:
+            await page.set_viewport_size({"width": 1920, "height": 1080})
+            await page.goto(agent_job["url"], timeout=60000)
+
+            if agent_job["job_options"]["collect_media"]:
+                await collect_media(agent_job["id"], page)
+
+            html_content = await page.content()
+            markdown_content = convert_to_markdown(html_content)
+
+            response = await ask_ai(
+                ELEMENT_EXTRACTION_PROMPT.format(
+                    extraction_prompt=EXTRACT_ELEMENTS_PROMPT,
+                    webpage=markdown_content,
+                    prompt=agent_job["prompt"],
+                )
+            )
+
+            xpaths = parse_response(response)
+
+            captured_elements = await capture_elements(page, xpaths)
+
+            final_url = page.url
+
+            pages.add((html_content, final_url))
+        finally:
+            await page.close()
+            await browser.close()
+
+    # Group captured elements by name once; the grouping is shared by
+    # every visited page.
+    name_to_elements: dict[str, list[CapturedElement]] = {}
+
+    for element in captured_elements:
+        name_to_elements.setdefault(element.name, []).append(element)
+
+    scraped_elements: list[dict[str, dict[str, list[CapturedElement]]]] = [
+        {page_url: name_to_elements}
+        for _, page_url in pages
+    ]
+
+    return scraped_elements
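Reviewer note: the worker drives this coroutine directly. As a quick orientation, here is a minimal sketch of calling it and of the shape it returns; the job dict keys mirror the lookups above (`id`, `url`, `prompt`, `job_options`), but the payload the real `job_worker.py` passes may carry more fields.

```python
import asyncio

from api.backend.ai.agent.agent import scrape_with_agent

# Hypothetical job payload; keys mirror the dict accesses in scrape_with_agent.
agent_job = {
    "id": "job-123",
    "url": "https://www.example.com",
    "prompt": "Collect all the links on the page",
    "job_options": {
        "proxies": [],
        "custom_cookies": [],
        "custom_headers": {},
        "collect_media": False,
    },
}

# Result shape: [{final_url: {element_name: [CapturedElement, ...]}}]
results = asyncio.run(scrape_with_agent(agent_job))
for page_result in results:
    for page_url, elements_by_name in page_result.items():
        print(page_url, {name: len(els) for name, els in elements_by_name.items()})
```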
+""" + +ELEMENT_EXTRACTION_PROMPT = """ +{extraction_prompt} + +**Guidelines:** +- Prefer shorter, more general XPaths like `//div[...]` or `//span[...]`. +- Avoid overly specific or deep paths like `//div[3]/ul/li[2]/a`. +- Do **not** chain multiple elements deeply (e.g., `//div/span/a`). +- Use XPaths further down the tree when possible. +- Do not include any extra explanation or text. +- One XPath is acceptable if that's all that's needed. +- Try and limit it down to 1 - 3 xpaths. +- Include a name for each xpath. + + +- USE THE MOST SIMPLE XPATHS POSSIBLE. +- USE THE MOST GENERAL XPATHS POSSIBLE. +- USE THE MOST SPECIFIC XPATHS POSSIBLE. +- USE THE MOST GENERAL XPATHS POSSIBLE. + + +**Example Format:** +```xml + +- : +- : +- : +- : +- : +- etc + + + + + - //a[@href='next_page_url'] + + +``` + +**Input webpage:** +{webpage} + +**Target content:** +{prompt} + +""" diff --git a/api/backend/ai/agent/utils.py b/api/backend/ai/agent/utils.py new file mode 100644 index 0000000..048ed5b --- /dev/null +++ b/api/backend/ai/agent/utils.py @@ -0,0 +1,252 @@ +from lxml import html, etree +import re +from playwright.async_api import Page + +from api.backend.models import CapturedElement + +from api.backend.job.scraping.scraping_utils import clean_format_characters + + +def convert_to_markdown(html_str: str): + parser = html.HTMLParser() + tree = html.fromstring(html_str, parser=parser) + root = tree.getroottree() + + def format_attributes(el: etree._Element) -> str: + """Convert element attributes into a string.""" + return " ".join(f'{k}="{v}"' for k, v in el.attrib.items()) + + def is_visible(el: etree._Element) -> bool: + style = el.attrib.get("style", "").lower() + class_ = el.attrib.get("class", "").lower() + + # Check for visibility styles + if "display: none" in style or "visibility: hidden" in style: + return False + if "opacity: 0" in style or "opacity:0" in style: + return False + if "height: 0" in style or "width: 0" in style: + return False + + # Check for common hidden classes + if any( + hidden in class_ + for hidden in ["hidden", "invisible", "truncate", "collapse"] + ): + return False + + # Check for hidden attributes + if el.attrib.get("hidden") is not None: + return False + if el.attrib.get("aria-hidden") == "true": + return False + + # Check for empty or whitespace-only content + if not el.text and len(el) == 0: + return False + + return True + + def is_layout_or_decorative(el: etree._Element) -> bool: + tag = el.tag.lower() + + # Layout elements + if tag in {"nav", "footer", "header", "aside", "main", "section"}: + return True + + # Decorative elements + if tag in {"svg", "path", "circle", "rect", "line", "polygon", "polyline"}: + return True + + # Check id and class for layout/decorative keywords + id_class = " ".join( + [el.attrib.get("id", ""), el.attrib.get("class", "")] + ).lower() + + layout_keywords = { + "sidebar", + "nav", + "header", + "footer", + "menu", + "advert", + "ads", + "breadcrumb", + "container", + "wrapper", + "layout", + "grid", + "flex", + "row", + "column", + "section", + "banner", + "hero", + "card", + "modal", + "popup", + "tooltip", + "dropdown", + "overlay", + } + + return any(keyword in id_class for keyword in layout_keywords) + + # Tags to ignore in the final markdown output + included_tags = { + "div", + "span", + "a", + "p", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "img", + "button", + "input", + "textarea", + "ul", + "ol", + "li", + "table", + "tr", + "td", + "th", + "input", + "textarea", + "select", + "option", + "optgroup", + 
"fieldset", + "legend", + } + + special_elements = [] + normal_elements = [] + + for el in tree.iter(): + if el.tag is etree.Comment: + continue + + tag = el.tag.lower() + + if tag not in included_tags: + continue + + if not is_visible(el): + continue + + if is_layout_or_decorative(el): + continue + + path = root.getpath(el) + attrs = format_attributes(el) + attrs_str = f" {attrs}" if attrs else "" + text = el.text.strip() if el.text else "" + + if not text and not attrs: + continue + + # input elements + if tag == "button": + prefix = "🔘 ** + ); + + return ( + + + {snackbarMessage} + + + ); + }; + + const handleSubmit = async () => { + if (!validateURL(url)) { + setUrlError("Please enter a valid URL."); + return; + } + + setUrlError(null); + + await ApiService.submitJob( + url, + [], + "", + { + collect_media: jobOptions.collect_media, + multi_page_scrape: jobOptions.multi_page_scrape, + }, + jobOptions.custom_headers, + jobOptions.custom_cookies, + null, + true, + prompt + ) + .then(async (response) => { + if (!response.ok) { + return response.json().then((error) => { + throw new Error(error.error); + }); + } + return response.json(); + }) + .then((data) => { + setSnackbarMessage( + `Agent job: ${data.id} submitted successfully.` || + "Agent job submitted successfully." + ); + setSnackbarSeverity("info"); + setSnackbarOpen(true); + }) + .catch((error) => { + setSnackbarMessage(error || "An error occurred."); + setSnackbarSeverity("error"); + setSnackbarOpen(true); + }); + }; + + if (!aiEnabled) { + return ( + + ); + } + + return ( + + + + Agent Mode + + + Use AI to scrape a website + + + + Website URL + + setUrl(e.target.value)} + error={!!urlError} + helperText={urlError} + autoComplete="agent-url" + fullWidth + placeholder="https://www.example.com" + variant="outlined" + size="small" + /> + + Prompt + + setPrompt(e.target.value)} + autoComplete="agent-prompt" + fullWidth + placeholder="Collect all the links on the page" + variant="outlined" + size="small" + /> + + + + + {snackbarSeverity === "info" ? 
+
+
+  );
+};
diff --git a/src/components/pages/agent/index.ts b/src/components/pages/agent/index.ts
new file mode 100644
index 0000000..ed4e7c1
--- /dev/null
+++ b/src/components/pages/agent/index.ts
@@ -0,0 +1 @@
+export * from "./agent";
diff --git a/src/components/submit/job-submitter/job-submitter.tsx b/src/components/submit/job-submitter/job-submitter.tsx
index 62ab126..8cbaa18 100644
--- a/src/components/submit/job-submitter/job-submitter.tsx
+++ b/src/components/submit/job-submitter/job-submitter.tsx
@@ -41,8 +41,6 @@ export const JobSubmitter = () => {
 
   const [jobOptions, setJobOptions] =
     useState<RawJobOptions>(initialJobOptions);
 
-  console.log(jobOptions);
-
   const handleSubmit = async () => {
     if (!validateURL(submittedURL)) {
       setIsValidUrl(false);
diff --git a/src/lib/helpers/parse-job-options.ts b/src/lib/helpers/parse-job-options.ts
index a13561e..ed40661 100644
--- a/src/lib/helpers/parse-job-options.ts
+++ b/src/lib/helpers/parse-job-options.ts
@@ -5,7 +5,7 @@ import { RawJobOptions, SiteMap } from "@/types";
 export const parseJobOptions = (
   job_options: string,
   setJobOptions: Dispatch<SetStateAction<RawJobOptions>>,
-  setSiteMap: Dispatch<SetStateAction<SiteMap | null>>
+  setSiteMap?: Dispatch<SetStateAction<SiteMap | null>>
 ) => {
   if (job_options) {
     const jsonOptions = JSON.parse(job_options as string);
@@ -38,7 +38,7 @@ export const parseJobOptions = (
       newJobOptions.proxies = jsonOptions.proxies.join(",");
     }
 
-    if (jsonOptions.site_map) {
+    if (jsonOptions.site_map && setSiteMap) {
       setSiteMap(jsonOptions.site_map);
     }
 
diff --git a/src/lib/hooks/use-advanced-job-options/index.ts b/src/lib/hooks/use-advanced-job-options/index.ts
new file mode 100644
index 0000000..c7ff244
--- /dev/null
+++ b/src/lib/hooks/use-advanced-job-options/index.ts
@@ -0,0 +1 @@
+export * from "./use-advanced-job-options";
diff --git a/src/lib/hooks/use-advanced-job-options/use-advanced-job-options.ts b/src/lib/hooks/use-advanced-job-options/use-advanced-job-options.ts
new file mode 100644
index 0000000..8ce8199
--- /dev/null
+++ b/src/lib/hooks/use-advanced-job-options/use-advanced-job-options.ts
@@ -0,0 +1,29 @@
+import { useEffect, useState } from "react";
+
+import { RawJobOptions } from "@/types";
+import { parseJobOptions } from "@/lib/helpers/parse-job-options";
+import { useRouter } from "next/router";
+
+export const useAdvancedJobOptions = () => {
+  const initialJobOptions: RawJobOptions = {
+    multi_page_scrape: false,
+    custom_headers: null,
+    proxies: null,
+    collect_media: false,
+    custom_cookies: null,
+  };
+
+  const router = useRouter();
+  const { job_options } = router.query;
+
+  const [jobOptions, setJobOptions] =
+    useState<RawJobOptions>(initialJobOptions);
+
+  useEffect(() => {
+    if (job_options) {
+      parseJobOptions(job_options as string, setJobOptions);
+    }
+  }, [job_options]);
+
+  return { jobOptions, setJobOptions };
+};
diff --git a/src/pages/agent.tsx b/src/pages/agent.tsx
new file mode 100644
index 0000000..705bb11
--- /dev/null
+++ b/src/pages/agent.tsx
@@ -0,0 +1 @@
+export { Agent as default } from "@/components/pages/agent";
diff --git a/src/services/api-service/functions/submit-job.ts b/src/services/api-service/functions/submit-job.ts
index 9b0a967..bd2eb24 100644
--- a/src/services/api-service/functions/submit-job.ts
+++ b/src/services/api-service/functions/submit-job.ts
@@ -7,7 +7,9 @@ export const submitJob = async (
   jobOptions: any,
   customHeaders: any,
   customCookies: any,
-  siteMap: SiteMap | null
+  siteMap: SiteMap | null,
+  agentMode: boolean = false,
+  prompt?: string
 ) => {
   return await fetch(`/api/submit-scrape-job`, {
     method: "POST",
@@ -26,6 +28,8 @@ export const submitJob = async (
         site_map: siteMap,
         custom_cookies: customCookies || [],
       },
+      agent_mode: agentMode,
+      prompt: prompt || "",
     },
   }),
 });
diff --git a/src/types/job.ts b/src/types/job.ts
index ca7a21e..d1d7c65 100644
--- a/src/types/job.ts
+++ b/src/types/job.ts
@@ -10,6 +10,8 @@ export interface Job {
   job_options: RawJobOptions;
   favorite: boolean;
   chat?: Message[];
+  agent_mode?: boolean;
+  prompt?: string;
 }
 
 export type JobOptions = {
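Reviewer note: end to end, the UI changes reduce to two extra fields in the POST body. For exercising agent mode without the frontend, here is a sketch of the equivalent request; the payload nesting (`data`, with `agent_mode` and `prompt` as siblings of `job_options`) follows `submit-job.ts` above, while the host/port, the `url`/`elements` keys, and the option values are assumptions.

```python
import requests

response = requests.post(
    "http://localhost:3000/api/submit-scrape-job",  # placeholder host
    json={
        "data": {
            "url": "https://www.example.com",
            "elements": [],  # agent mode supplies no selectors up front
            "job_options": {
                "collect_media": False,
                "multi_page_scrape": False,
                "custom_headers": {},
                "custom_cookies": [],
                "proxies": [],
                "site_map": None,
            },
            "agent_mode": True,
            "prompt": "Collect all the links on the page",
        }
    },
    timeout=30,
)
response.raise_for_status()
print(response.json())  # expected to report the new job's id
```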