diff --git a/.github/actions/run-cypress-tests/action.yaml b/.github/actions/run-cypress-tests/action.yaml index ba8ddf4..0061f6b 100644 --- a/.github/actions/run-cypress-tests/action.yaml +++ b/.github/actions/run-cypress-tests/action.yaml @@ -73,5 +73,8 @@ runs: - name: Run Cypress tests shell: bash - run: npm run cy:run + run: | + set -e + npm run cy:run + diff --git a/.github/workflows/cypress-tests.yml b/.github/workflows/cypress-tests.yml index f77b39c..ecd34b8 100644 --- a/.github/workflows/cypress-tests.yml +++ b/.github/workflows/cypress-tests.yml @@ -18,15 +18,14 @@ jobs: uses: ./.github/actions/run-cypress-tests with: openai_key: ${{ secrets.openai_key }} - continue-on-error: true - name: Check container logs on failure - if: steps.run-tests.outcome == 'failure' + if: steps.run-tests.conclusion == 'failure' run: | echo "Cypress tests failed. Dumping container logs..." docker logs scraperr_api || true - name: Fail job if Cypress failed - if: steps.run-tests.outcome == 'failure' + if: steps.run-tests.conclusion == 'failure' run: exit 1 - - uses: actions/checkout@v4 + diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0af91e7..789e2d8 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -22,7 +22,6 @@ jobs: uses: ./.github/actions/run-cypress-tests with: openai_key: ${{ secrets.openai_key }} - continue-on-error: true success-message: runs-on: ubuntu-latest diff --git a/api/backend/ai/agent/agent.py b/api/backend/ai/agent/agent.py index d788e30..c594c4c 100644 --- a/api/backend/ai/agent/agent.py +++ b/api/backend/ai/agent/agent.py @@ -63,7 +63,9 @@ async def scrape_with_agent(agent_job: dict[str, Any]): xpaths = parse_response(response) - captured_elements = await capture_elements(page, xpaths) + captured_elements = await capture_elements( + page, xpaths, agent_job["job_options"]["return_html"] + ) final_url = page.url diff --git a/api/backend/ai/agent/utils.py b/api/backend/ai/agent/utils.py index 5413a6f..1bc84bb 100644 --- a/api/backend/ai/agent/utils.py +++ b/api/backend/ai/agent/utils.py @@ -206,7 +206,7 @@ def parse_next_page(text: str) -> str | None: async def capture_elements( - page: Page, xpaths: list[dict[str, str]] + page: Page, xpaths: list[dict[str, str]], return_html: bool ) -> list[CapturedElement]: captured_elements = [] seen_texts = set() @@ -217,6 +217,23 @@ async def capture_elements( count = await locator.count() for i in range(count): + if return_html: + element_text = ( + await page.locator(f"xpath={xpath['xpath']}") + .nth(i) + .inner_html() + ) + + seen_texts.add(element_text) + captured_elements.append( + CapturedElement( + name=xpath["name"], + text=element_text, + xpath=xpath["xpath"], + ) + ) + continue + element_text = "" element_handle = await locator.nth(i).element_handle() diff --git a/api/backend/job/models/job_options.py b/api/backend/job/models/job_options.py index 9e415c5..f3547de 100644 --- a/api/backend/job/models/job_options.py +++ b/api/backend/job/models/job_options.py @@ -25,3 +25,4 @@ class JobOptions(BaseModel): site_map: Optional[SiteMap] = None collect_media: bool = False custom_cookies: list[dict[str, Any]] = [] + return_html: bool = False diff --git a/api/backend/job/scraping/scraping.py b/api/backend/job/scraping/scraping.py index 743f9b9..bfc5ead 100644 --- a/api/backend/job/scraping/scraping.py +++ b/api/backend/job/scraping/scraping.py @@ -110,7 +110,9 @@ async def make_site_request( ) -async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element]): +async def collect_scraped_elements( + page: tuple[str, str], xpaths: list[Element], return_html: bool +): soup = BeautifulSoup(page[0], "lxml") root = etree.HTML(str(soup)) @@ -120,6 +122,16 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element]) el = sxpath(root, elem.xpath) for e in el: # type: ignore + if return_html: + elements[elem.name] = [ + CapturedElement( + xpath=elem.xpath, + text=page[0], + name=elem.name, + ) + ] + continue + text = ( " ".join(str(t) for t in e.itertext()) if isinstance(e, etree._Element) @@ -161,6 +173,8 @@ async def scrape( elements: list[dict[str, dict[str, list[CapturedElement]]]] = [] for page in pages: - elements.append(await collect_scraped_elements(page, xpaths)) + elements.append( + await collect_scraped_elements(page, xpaths, job_options["return_html"]) + ) return elements diff --git a/cypress/utilities/job.utilities.ts b/cypress/utilities/job.utilities.ts index d32f0cb..7d153ce 100644 --- a/cypress/utilities/job.utilities.ts +++ b/cypress/utilities/job.utilities.ts @@ -100,13 +100,13 @@ export const waitForJobCompletion = (url: string) => { }; export const enableMultiPageScraping = () => { - cy.get("button").contains("Advanced Job Options").click(); + cy.get("button").contains("Advanced Options").click(); cy.get('[data-cy="multi-page-toggle"]').click(); cy.get("body").type("{esc}"); }; export const addCustomHeaders = (headers: Record) => { - cy.get("button").contains("Advanced Job Options").click(); + cy.get("button").contains("Advanced Options").click(); cy.get('[name="custom_headers"]').type(JSON.stringify(headers), { parseSpecialCharSequences: false, }); @@ -114,13 +114,13 @@ export const addCustomHeaders = (headers: Record) => { }; export const addCustomCookies = (cookies: Record) => { - cy.get("button").contains("Advanced Job Options").click(); + cy.get("button").contains("Advanced Options").click(); cy.get('[name="custom_cookies"]').type(JSON.stringify(cookies)); cy.get("body").type("{esc}"); }; export const openAdvancedJobOptions = () => { - cy.get("button").contains("Advanced Job Options").click(); + cy.get("button").contains("Advanced Options").click(); }; export const selectJobFromSelector = () => { diff --git a/docs/main_page.png b/docs/main_page.png index 1e351a3..5e61aa7 100644 Binary files a/docs/main_page.png and b/docs/main_page.png differ diff --git a/src/components/common/advanced-job-options/advanced-job-options.tsx b/src/components/common/advanced-job-options/advanced-job-options.tsx index 6156a89..f563691 100644 --- a/src/components/common/advanced-job-options/advanced-job-options.tsx +++ b/src/components/common/advanced-job-options/advanced-job-options.tsx @@ -1,7 +1,8 @@ -import { Box, Link, Typography } from "@mui/material"; -import { SetStateAction, Dispatch, useState } from "react"; -import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog"; import { RawJobOptions } from "@/types"; +import SettingsIcon from "@mui/icons-material/Settings"; +import { Box, Button, Typography } from "@mui/material"; +import { Dispatch, SetStateAction, useState } from "react"; +import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog"; export type AdvancedJobOptionsProps = { jobOptions: RawJobOptions; @@ -17,26 +18,27 @@ export const AdvancedJobOptions = ({ const [open, setOpen] = useState(false); return ( - - + (jobOptions); - // Update local state when prop changes useEffect(() => { setLocalJobOptions(jobOptions); }, [jobOptions]); - const handleMultiPageScrapeChange = () => { + const handleCheckboxChange = (key: keyof RawJobOptions) => { setLocalJobOptions((prevJobOptions) => ({ ...prevJobOptions, - multi_page_scrape: !prevJobOptions.multi_page_scrape, + [key]: !prevJobOptions[key], })); }; @@ -65,15 +64,7 @@ export const AdvancedJobOptionsDialog = ({ })); }; - const handleCollectMediaChange = () => { - setLocalJobOptions((prevJobOptions) => ({ - ...prevJobOptions, - collect_media: !prevJobOptions.collect_media, - })); - }; - const handleClose = () => { - // Save the local state back to the parent before closing setJobOptions(localJobOptions); onClose(); }; @@ -137,7 +128,7 @@ export const AdvancedJobOptionsDialog = ({ control={ handleCheckboxChange("multi_page_scrape")} disabled={!multiPageScrapeEnabled} /> } @@ -158,11 +149,12 @@ export const AdvancedJobOptionsDialog = ({ } /> + handleCheckboxChange("collect_media")} data-cy="collect-media-checkbox" /> } @@ -177,6 +169,26 @@ export const AdvancedJobOptionsDialog = ({ } /> + + handleCheckboxChange("return_html")} + data-cy="return-html-checkbox" + /> + } + label={ + + Return HTML + + + + + + + } + /> diff --git a/src/components/jobs/job-queue.tsx b/src/components/jobs/job-queue.tsx index 3fe205a..edddae0 100644 --- a/src/components/jobs/job-queue.tsx +++ b/src/components/jobs/job-queue.tsx @@ -213,6 +213,7 @@ export const JobQueue = ({ query: { url: row.url, prompt: row.prompt, + job_options: JSON.stringify(row.job_options), }, }); } else { diff --git a/src/components/pages/home/home.tsx b/src/components/pages/home/home.tsx index 8b33cd3..7d6b92e 100644 --- a/src/components/pages/home/home.tsx +++ b/src/components/pages/home/home.tsx @@ -1,14 +1,14 @@ "use client"; -import React, { useEffect, useRef } from "react"; -import { Container, Box } from "@mui/material"; -import { useRouter } from "next/router"; -import { ElementTable, JobSubmitter } from "@/components/submit/job-submitter"; -import { useJobSubmitterProvider } from "@/components/submit/job-submitter/provider"; import { ErrorSnackbar, JobNotifySnackbar, } from "@/components/common/snackbars"; +import { ElementTable, JobSubmitter } from "@/components/submit/job-submitter"; +import { useJobSubmitterProvider } from "@/components/submit/job-submitter/provider"; +import { Box, Container } from "@mui/material"; +import { useRouter } from "next/router"; +import { useEffect, useRef } from "react"; export const Home = () => { const { @@ -50,19 +50,18 @@ export const Home = () => { flexDirection="column" justifyContent="center" alignItems="center" - height="100%" + minHeight="100vh" py={4} > - - - - {submittedURL.length > 0 ? ( + + + - ) : null} + {snackbarSeverity === "info" ? ( diff --git a/src/components/submit/job-submitter/element-table/element-table.tsx b/src/components/submit/job-submitter/element-table/element-table.tsx index d693346..7a08aed 100644 --- a/src/components/submit/job-submitter/element-table/element-table.tsx +++ b/src/components/submit/job-submitter/element-table/element-table.tsx @@ -1,24 +1,24 @@ "use client"; -import React, { useState, Dispatch, SetStateAction } from "react"; +import { Element } from "@/types"; +import AddIcon from "@mui/icons-material/Add"; +import DeleteIcon from "@mui/icons-material/Delete"; import { - Typography, - TextField, - Button, + Box, + Divider, + IconButton, + Paper, Table, TableBody, - TableContainer, TableCell, + TableContainer, TableHead, TableRow, - Box, - IconButton, + TextField, Tooltip, - useTheme, - Divider, + Typography, } from "@mui/material"; -import AddIcon from "@mui/icons-material/Add"; -import { Element } from "@/types"; +import { Dispatch, SetStateAction, useState } from "react"; import { SiteMap } from "../site-map"; interface Props { @@ -28,7 +28,6 @@ interface Props { } export const ElementTable = ({ rows, setRows, submittedURL }: Props) => { - const theme = useTheme(); const [newRow, setNewRow] = useState({ name: "", xpath: "", @@ -42,142 +41,219 @@ export const ElementTable = ({ rows, setRows, submittedURL }: Props) => { }; const handleDeleteRow = (elementName: string) => { - setRows( - rows.filter((r) => { - return elementName !== r.name; - }) - ); + setRows(rows.filter((r) => elementName !== r.name)); }; return ( - - - - Elements to Scrape - + + + + + Elements to Scrape + + + Add elements to scrape from the target URL using XPath selectors + + + -
- - - - - Name - - - XPath - - - Actions - - - - - - - - setNewRow({ ...newRow, name: e.target.value }) - } - /> - - - - setNewRow({ ...newRow, xpath: e.target.value }) - } - /> - - - 0 && newRow.name.length > 0 - ? "Add Element" - : "Fill out all fields to add an element" - } - placement="top" - > - - 0 && newRow.name.length > 0) - } - > - - - - - - - {rows.map((row, index) => ( - - - {row.name} - - - {row.xpath} - - -
+ + + Name + XPath + + Actions + + + + + + + + setNewRow({ ...newRow, name: e.target.value }) + } + sx={{ + "& .MuiOutlinedInput-root": { + borderRadius: 2, + bgcolor: "background.default", + "&:hover": { + "& .MuiOutlinedInput-notchedOutline": { + borderColor: "primary.main", + }, + }, + }, + }} + /> + + + + setNewRow({ ...newRow, xpath: e.target.value }) + } + sx={{ + "& .MuiOutlinedInput-root": { + borderRadius: 2, + bgcolor: "background.default", + "&:hover": { + "& .MuiOutlinedInput-notchedOutline": { + borderColor: "primary.main", + }, + }, + }, + }} + /> + + + 0 && newRow.name.length > 0 + ? "Add Element" + : "Fill out all fields to add an element" + } + placement="top" + > + + 0 && newRow.name.length > 0) + } + sx={{ + bgcolor: "primary.main", + color: "primary.contrastText", + borderRadius: 2, + "&:hover": { + bgcolor: "primary.dark", + transform: "translateY(-1px)", + }, + "&.Mui-disabled": { + bgcolor: "action.disabledBackground", + color: "action.disabled", + }, + }} > - Delete - - - - ))} - -
-
+ + + + + + + {rows.map((row, index) => ( + + + + {row.name} + + + + + {row.xpath} + + + + handleDeleteRow(row.name)} + size="small" + color="error" + sx={{ + "&:hover": { + bgcolor: "error.main", + color: "error.contrastText", + transform: "translateY(-1px)", + }, + }} + > + + + + + ))} + +
+ + +
- - -
+ ); }; diff --git a/src/components/submit/job-submitter/job-submitter-header/job-submitter-header.module.css b/src/components/submit/job-submitter/job-submitter-header/job-submitter-header.module.css index 89762a2..7bd179e 100644 --- a/src/components/submit/job-submitter/job-submitter-header/job-submitter-header.module.css +++ b/src/components/submit/job-submitter/job-submitter-header/job-submitter-header.module.css @@ -2,3 +2,14 @@ margin-bottom: 1rem; text-align: center; } + +.container { + text-align: left; + margin-bottom: 8px; +} + +.title { + font-weight: 600; + color: var(--mui-palette-text-primary); + margin-bottom: 8px; +} diff --git a/src/components/submit/job-submitter/job-submitter-header/job-submitter-header.tsx b/src/components/submit/job-submitter/job-submitter-header/job-submitter-header.tsx index 7a2fc87..76f47db 100644 --- a/src/components/submit/job-submitter/job-submitter-header/job-submitter-header.tsx +++ b/src/components/submit/job-submitter/job-submitter-header/job-submitter-header.tsx @@ -1,6 +1,6 @@ +import { Box, Typography } from "@mui/material"; import React, { ReactNode } from "react"; -import { Typography } from "@mui/material"; -import classes from "./job-submitter-header.module.css"; +import styles from "./job-submitter-header.module.css"; interface JobSubmitterHeaderProps { title?: string; @@ -8,13 +8,15 @@ interface JobSubmitterHeaderProps { } export const JobSubmitterHeader: React.FC = ({ - title = "Scraping Made Easy", + title = "Scrape Webpage", children, }) => { return ( -
- {title} + + + {title} + {children} -
+
); }; diff --git a/src/components/submit/job-submitter/job-submitter-input/job-submitter-input.module.css b/src/components/submit/job-submitter/job-submitter-input/job-submitter-input.module.css index e69de29..18b3cc0 100644 --- a/src/components/submit/job-submitter/job-submitter-input/job-submitter-input.module.css +++ b/src/components/submit/job-submitter/job-submitter-input/job-submitter-input.module.css @@ -0,0 +1,52 @@ +.container { + display: flex; + flex-direction: column; + gap: 16px; + align-items: stretch; +} + +@media (min-width: 600px) { + .container { + flex-direction: row; + align-items: center; + } +} + +.input { + width: 100%; +} + +.input :global(.MuiOutlinedInput-root) { + border-radius: 16px; + transition: all 0.2s ease-in-out; +} + +.input + :global(.MuiOutlinedInput-root:hover) + :global(.MuiOutlinedInput-notchedOutline) { + border-color: var(--mui-palette-primary-main); +} + +.submitButton { + height: 48px !important; + border-radius: 16px; + font-size: 1rem !important; + font-weight: 500 !important; +} + +.submitButton:hover { + transform: translateY(-1px); + box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15); +} + +.submitButton:disabled { + transform: none; + box-shadow: none; +} + +@media (min-width: 600px) { + .submitButton { + min-width: 120px; + height: 56px; + } +} diff --git a/src/components/submit/job-submitter/job-submitter-input/job-submitter-input.tsx b/src/components/submit/job-submitter/job-submitter-input/job-submitter-input.tsx index 28e874d..614a47a 100644 --- a/src/components/submit/job-submitter/job-submitter-input/job-submitter-input.tsx +++ b/src/components/submit/job-submitter/job-submitter-input/job-submitter-input.tsx @@ -1,6 +1,6 @@ -import React from "react"; -import { TextField, Button, CircularProgress } from "@mui/material"; +import { Box, Button, CircularProgress, TextField } from "@mui/material"; import { useJobSubmitterProvider } from "../provider"; +import styles from "./job-submitter-input.module.css"; export type JobSubmitterInputProps = { urlError: string | null; @@ -17,7 +17,7 @@ export const JobSubmitterInput = ({ useJobSubmitterProvider(); return ( -
+ setSubmittedURL(e.target.value)} error={!isValidURL} helperText={!isValidURL ? urlError : ""} - className="rounded-md" + className={styles.input} /> -
+ ); }; diff --git a/src/components/submit/job-submitter/job-submitter.tsx b/src/components/submit/job-submitter/job-submitter.tsx index f52ffce..0d3d845 100644 --- a/src/components/submit/job-submitter/job-submitter.tsx +++ b/src/components/submit/job-submitter/job-submitter.tsx @@ -4,6 +4,7 @@ import { AdvancedJobOptions } from "@/components/common/advanced-job-options"; import { useSubmitJob } from "@/hooks/use-submit-job"; import { parseJobOptions } from "@/lib"; import { useUser } from "@/store/hooks"; +import { Box, Paper } from "@mui/material"; import { useRouter } from "next/router"; import { useEffect } from "react"; import { JobSubmitterHeader } from "./job-submitter-header"; @@ -29,23 +30,34 @@ export const JobSubmitter = () => { await submitJob(submittedURL, rows, user, jobOptions, siteMap, false, null); }; - console.log(jobOptions); - useEffect(() => { - console.log(jobOptions); - }, [jobOptions]); - return ( -
- - - -
+ + + + + + + + + ); }; diff --git a/src/components/submit/job-submitter/site-map/site-map-input/site-map-input.tsx b/src/components/submit/job-submitter/site-map/site-map-input/site-map-input.tsx index 43195bd..94f0487 100644 --- a/src/components/submit/job-submitter/site-map/site-map-input/site-map-input.tsx +++ b/src/components/submit/job-submitter/site-map/site-map-input/site-map-input.tsx @@ -1,17 +1,17 @@ -import { useState } from "react"; -import { useJobSubmitterProvider } from "../../provider"; +import { ActionOption } from "@/types/job"; import { + Box, + Button, + Checkbox, + FormControl, + FormControlLabel, + InputLabel, MenuItem, Select, TextField, - FormControl, - Button, - Checkbox, - FormControlLabel, } from "@mui/material"; -import { ActionOption } from "@/types/job"; -import classes from "./site-map-input.module.css"; -import { clsx } from "clsx"; +import { useState } from "react"; +import { useJobSubmitterProvider } from "../../provider"; export type SiteMapInputProps = { disabled?: boolean; @@ -28,7 +28,6 @@ export const SiteMapInput = ({ clickOnce, input, }: SiteMapInputProps) => { - console.log(clickOnce); const [optionState, setOptionState] = useState( option || "click" ); @@ -43,8 +42,6 @@ export const SiteMapInput = ({ const handleAdd = () => { if (!siteMap) return; - console.log(optionState, xpathState, clickOnceState, inputState); - setSiteMap((prevSiteMap) => ({ ...prevSiteMap, actions: [ @@ -60,6 +57,7 @@ export const SiteMapInput = ({ })); setXpathState(""); + setInputState(""); }; const handleRemove = () => { @@ -72,14 +70,22 @@ export const SiteMapInput = ({ }; return ( -
-
- + + + + Action Type