diff --git a/api/backend/job/models/job_options.py b/api/backend/job/models/job_options.py
index ab35f44..4158f35 100644
--- a/api/backend/job/models/job_options.py
+++ b/api/backend/job/models/job_options.py
@@ -13,3 +13,4 @@ class JobOptions(BaseModel):
     proxies: list[str] = []
     site_map: Optional[SiteMap] = None
     collect_media: bool = False
+    custom_cookies: list[dict[str, Any]] = []
diff --git a/api/backend/job/scraping/add_custom.py b/api/backend/job/scraping/add_custom.py
new file mode 100644
index 0000000..b150e83
--- /dev/null
+++ b/api/backend/job/scraping/add_custom.py
@@ -0,0 +1,73 @@
+from typing import Any, Optional
+from urllib.parse import urlparse
+
+from playwright.async_api import Page, BrowserContext
+
+import logging
+
+LOG = logging.getLogger(__name__)
+
+
+async def add_custom_cookies(
+    custom_cookies: list[dict[str, Any]],
+    url: str,
+    context: BrowserContext,
+) -> None:
+    """Register user-supplied cookies on ``context`` for the host of ``url``.
+
+    Only ``name`` and ``value`` are read from each entry; the domain is the
+    host of ``url`` and the path is always ``/``. Entries missing a key fall
+    back to ``default_name`` / ``default_value``.
+    """
+    if not custom_cookies:
+        return
+
+    # ``netloc`` can include userinfo and a port ("user:pw@host:8080"), which
+    # is not a valid cookie domain; ``hostname`` is the bare host.
+    domain = urlparse(url).hostname
+
+    if not domain:
+        LOG.warning(f"Skipping custom cookies, no host in URL: {url}")
+        return
+
+    cookies = [
+        {
+            "name": cookie.get("name", "default_name"),
+            "value": cookie.get("value", "default_value"),
+            "domain": domain,
+            "path": "/",
+        }
+        for cookie in custom_cookies
+    ]
+
+    # Log names only: cookie values are often session secrets.
+    LOG.info(f"Adding cookies for {domain}: {[c['name'] for c in cookies]}")
+
+    # One batched call instead of one awaited call per cookie.
+    await context.add_cookies(cookies)  # type: ignore
+
+
+async def add_custom_headers(
+    custom_headers: dict[str, Any],
+    page: Page,
+) -> None:
+    """Pass ``custom_headers`` to ``page.set_extra_http_headers``."""
+    await page.set_extra_http_headers(custom_headers)
+
+
+async def add_custom_items(
+    url: str,
+    page: Page,
+    cookies: Optional[list[dict[str, Any]]] = None,
+    headers: Optional[dict[str, Any]] = None,
+) -> None:
+    """Apply optional cookies and headers to ``page``.
+
+    Cookies are set on ``page.context`` for the host of ``url``; empty or
+    ``None`` arguments are skipped.
+    """
+    if cookies:
+        await add_custom_cookies(cookies, url, page.context)
+
+    if headers:
+        await add_custom_headers(headers, page)
diff --git a/api/backend/scraping.py b/api/backend/scraping.py
index 0e57595..f4b3a62 100644
--- a/api/backend/scraping.py
+++ b/api/backend/scraping.py
@@ -12,6 +12,8 @@ from api.backend.models import Element, CapturedElement
 from api.backend.job.scraping.scraping_utils import scrape_content
 from api.backend.job.site_mapping.site_mapping import handle_site_mapping
 
+from api.backend.job.scraping.add_custom import add_custom_items
+
 LOG = logging.getLogger(__name__)
 
 
@@ -44,11 +46,13 @@ async def make_site_request(
     proxies: Optional[list[str]] = None,
     site_map: Optional[dict[str, Any]] = None,
     collect_media: bool = False,
+    custom_cookies: Optional[list[dict[str, Any]]] = None,
 ):
     if url in visited_urls:
         return
 
     proxy = None
+
     if proxies:
         proxy = random.choice(proxies)
         LOG.info(f"Using proxy: {proxy}")
@@ -56,8 +60,8 @@ async def make_site_request(
     async with AsyncCamoufox(headless=True, proxy=proxy) as browser:
         page: Page = await browser.new_page()
 
-        if headers:
-            await page.set_extra_http_headers(headers)
+        # Add cookies and headers
+        await add_custom_items(url, page, custom_cookies, headers)
 
         LOG.info(f"Visiting URL: {url}")
 
@@ -113,6 +117,7 @@ async def make_site_request(
                     proxies=proxies,
                     site_map=site_map,
                     collect_media=collect_media,
+                    custom_cookies=custom_cookies,
                 )
 
 
@@ -152,6 +157,7 @@ async def scrape(
     proxies: Optional[list[str]] = None,
     site_map: Optional[dict[str, Any]] = None,
     collect_media: bool = False,
+    custom_cookies: Optional[list[dict[str, Any]]] = None,
 ):
     visited_urls: set[str] = set()
     pages: set[tuple[str, str]] = set()
@@ -166,6 +172,7 @@ async def scrape(
         proxies=proxies,
         site_map=site_map,
         collect_media=collect_media,
+        custom_cookies=custom_cookies,
     )
 
     elements: list[dict[str, dict[str, list[CapturedElement]]]] = []
diff --git a/api/backend/tests/scraping/test_scraping.py b/api/backend/tests/scraping/test_scraping.py
index 41dfa10..5b0704b 100644
--- a/api/backend/tests/scraping/test_scraping.py
+++ b/api/backend/tests/scraping/test_scraping.py
@@ -1,25 +1,53 @@
 import pytest
 import logging
-from playwright.async_api import async_playwright, Error
+from typing import Dict
+from playwright.async_api import async_playwright, Cookie, Route
+from api.backend.job.scraping.add_custom import add_custom_items
logging.basicConfig(level=logging.DEBUG) LOG = logging.getLogger(__name__) @pytest.mark.asyncio -async def test_proxy(): - proxy = "127.0.0.1:8080" +async def test_add_custom_items(): + test_cookies = [{"name": "big", "value": "cookie"}] + test_headers = {"User-Agent": "test-agent", "Accept": "application/json"} async with async_playwright() as p: - browser = await p.firefox.launch( - headless=True, proxy={"server": f"http://{proxy}"} - ) + browser = await p.chromium.launch(headless=True) context = await browser.new_context() page = await context.new_page() - with pytest.raises(Error) as excinfo: - await page.goto("http://example.com") + # Set up request interception + captured_headers: Dict[str, str] = {} - assert "NS_ERROR_PROXY_CONNECTION_REFUSED" in str(excinfo.value) + async def handle_route(route: Route) -> None: + nonlocal captured_headers + captured_headers = route.request.headers + await route.continue_() + + await page.route("**/*", handle_route) + + await add_custom_items( + url="http://example.com", + page=page, + cookies=test_cookies, + headers=test_headers, + ) + + # Navigate to example.com + await page.goto("http://example.com") + + # Verify cookies were added + cookies: list[Cookie] = await page.context.cookies() + test_cookie = next((c for c in cookies if c.get("name") == "big"), None) + + assert test_cookie is not None + assert test_cookie.get("value") == "cookie" + assert test_cookie.get("path") == "/" # Default path should be set + assert test_cookie.get("sameSite") == "Lax" # Default sameSite should be set + + # Verify headers were added + assert captured_headers.get("user-agent") == "test-agent" await browser.close() diff --git a/api/backend/worker/job_worker.py b/api/backend/worker/job_worker.py index 15c43a5..5863049 100644 --- a/api/backend/worker/job_worker.py +++ b/api/backend/worker/job_worker.py @@ -1,4 +1,5 @@ import os +import json from api.backend.job import get_queued_job, update_job from api.backend.scraping import scrape @@ -34,14 
+35,25 @@ async def process_job(): LOG.info(f"Beginning processing job: {job}.") try: _ = await update_job([job["id"]], field="status", value="Scraping") + + proxies = job["job_options"]["proxies"] + + if proxies and isinstance(proxies[0], str) and proxies[0].startswith("{"): + try: + proxies = [json.loads(p) for p in proxies] + except json.JSONDecodeError: + LOG.error(f"Failed to parse proxy JSON: {proxies}") + proxies = [] + scraped = await scrape( job["url"], [Element(**j) for j in job["elements"]], job["job_options"]["custom_headers"], job["job_options"]["multi_page_scrape"], - job["job_options"]["proxies"], + proxies, job["job_options"]["site_map"], job["job_options"]["collect_media"], + job["job_options"]["custom_cookies"], ) LOG.info( f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}" diff --git a/src/components/common/advanced-job-options/advanced-job-options.tsx b/src/components/common/advanced-job-options/advanced-job-options.tsx new file mode 100644 index 0000000..8fd67a9 --- /dev/null +++ b/src/components/common/advanced-job-options/advanced-job-options.tsx @@ -0,0 +1,45 @@ +import { Box, Link, Typography } from "@mui/material"; +import { SetStateAction, Dispatch, useState } from "react"; +import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog"; +import { RawJobOptions } from "@/types"; + +export type AdvancedJobOptionsProps = { + jobOptions: RawJobOptions; + setJobOptions: Dispatch>; +}; + +export const AdvancedJobOptions = ({ + jobOptions, + setJobOptions, +}: AdvancedJobOptionsProps) => { + const [open, setOpen] = useState(false); + return ( + + setOpen(true)} + sx={{ + textDecoration: "none", + color: "primary.main", + "&:hover": { + color: "primary.dark", + textDecoration: "underline", + }, + paddingLeft: 1, + display: "inline-flex", + alignItems: "center", + gap: 0.5, + }} + > + Advanced Job Options + + setOpen(false)} + jobOptions={jobOptions} + setJobOptions={setJobOptions} + /> + + ); 
+}; diff --git a/src/components/common/advanced-job-options/dialog/advanced-job-options-dialog.tsx b/src/components/common/advanced-job-options/dialog/advanced-job-options-dialog.tsx new file mode 100644 index 0000000..588bcc0 --- /dev/null +++ b/src/components/common/advanced-job-options/dialog/advanced-job-options-dialog.tsx @@ -0,0 +1,269 @@ +import { + Accordion, + AccordionDetails, + AccordionSummary, + Box, + Checkbox, + Dialog, + DialogContent, + DialogTitle, + Divider, + FormControl, + FormControlLabel, + FormGroup, + IconButton, + TextField, + Tooltip, + Typography, + useTheme, +} from "@mui/material"; +import { + ExpandMore as ExpandMoreIcon, + InfoOutlined, + Code as CodeIcon, + Settings, +} from "@mui/icons-material"; +import { Dispatch, SetStateAction } from "react"; +import { RawJobOptions } from "@/types"; +import { ExpandedTableInput } from "../../expanded-table-input"; + +export type AdvancedJobOptionsDialogProps = { + open: boolean; + onClose: () => void; + jobOptions: RawJobOptions; + setJobOptions: Dispatch>; +}; + +export const AdvancedJobOptionsDialog = ({ + open, + onClose, + jobOptions, + setJobOptions, +}: AdvancedJobOptionsDialogProps) => { + const theme = useTheme(); + const handleMultiPageScrapeChange = () => { + setJobOptions((prevJobOptions) => ({ + ...prevJobOptions, + multi_page_scrape: !prevJobOptions.multi_page_scrape, + })); + }; + + const handleProxiesChange = (e: React.ChangeEvent) => { + setJobOptions((prevJobOptions) => ({ + ...prevJobOptions, + proxies: e.target.value, + })); + }; + + const handleCollectMediaChange = () => { + setJobOptions((prevJobOptions) => ({ + ...prevJobOptions, + collect_media: !prevJobOptions.collect_media, + })); + }; + + return ( + + + + Advanced Job Options + + + + + + + + + Collection Options + + + + + + } + label={ + + Multi Page Scrape + + + + + + + } + /> + + } + label={ + + Collect Media + + + + + + + } + /> + + + + + + Custom Options + + + + {/* Proxies Section */} + + } + sx={{ + 
backgroundColor: theme.palette.background.paper, + borderBottom: `1px solid ${theme.palette.divider}`, + "&.Mui-expanded": { + borderBottom: `1px solid ${theme.palette.divider}`, + }, + }} + > + +
+ + Proxies + + + + + +
+
+
+ + + ), + }} + /> + +
+ + {/* Custom Headers Section */} + { + setJobOptions((prevJobOptions) => ({ + ...prevJobOptions, + custom_headers: value, + })); + }} + /> + + {/* Custom Cookies Section */} + { + setJobOptions((prevJobOptions) => ({ + ...prevJobOptions, + custom_cookies: value, + })); + }} + /> +
+
+
+
+ ); +}; diff --git a/src/components/common/advanced-job-options/dialog/index.ts b/src/components/common/advanced-job-options/dialog/index.ts new file mode 100644 index 0000000..28343fe --- /dev/null +++ b/src/components/common/advanced-job-options/dialog/index.ts @@ -0,0 +1 @@ +export * from "./advanced-job-options-dialog"; diff --git a/src/components/common/advanced-job-options/index.ts b/src/components/common/advanced-job-options/index.ts new file mode 100644 index 0000000..d793eee --- /dev/null +++ b/src/components/common/advanced-job-options/index.ts @@ -0,0 +1 @@ +export * from "./advanced-job-options"; diff --git a/src/components/common/expanded-table-input/expanded-table-input.tsx b/src/components/common/expanded-table-input/expanded-table-input.tsx new file mode 100644 index 0000000..c72c164 --- /dev/null +++ b/src/components/common/expanded-table-input/expanded-table-input.tsx @@ -0,0 +1,204 @@ +import { + Accordion, + AccordionSummary, + TableCell, + TableRow, + Paper, + TableBody, + useTheme, + TextField, + Box, + Typography, + AccordionDetails, + TableHead, + TableContainer, + Table, +} from "@mui/material"; +import { useEffect, useState } from "react"; +import ExpandMoreIcon from "@mui/icons-material/ExpandMore"; +import { parseJsonToEntries } from "@/lib/helpers/parse-json-to-entries"; + +export type ExpandedTableInputProps = { + label: string; + onChange: (value: any) => void; + placeholder: string; + urlParam: string; +}; + +export const ExpandedTableInput = ({ + label, + onChange, + placeholder, + urlParam, +}: ExpandedTableInputProps) => { + const theme = useTheme(); + const [value, setValue] = useState(""); + const [parsedHeaders, setParsedHeaders] = useState<[string, string][] | null>( + null + ); + + const [jsonError, setJsonError] = useState(null); + + const urlParams = new URLSearchParams(window.location.search); + + const validateAndParse = (val: string) => { + if (val.trim() === "") { + setParsedHeaders(null); + setJsonError(null); + 
return null; + } + + try { + const parsed = JSON.parse(val); + const entries = parseJsonToEntries(val); + + if (entries === null) { + setParsedHeaders(null); + setJsonError("Invalid JSON object"); + return null; + } else { + setParsedHeaders(entries); + setJsonError(null); + return parsed; + } + } catch (e) { + setParsedHeaders(null); + setJsonError("Invalid JSON format"); + return null; + } + }; + + const handleChange = (e: React.ChangeEvent) => { + const val = e.target.value; + setValue(val); + const parsed = validateAndParse(val); + onChange(parsed); + }; + + useEffect(() => { + const jobOptions = urlParams.get("job_options"); + + if (!jobOptions) { + setParsedHeaders(null); + setJsonError(null); + return; + } + + const jobOptionsObject = JSON.parse(jobOptions || "{}"); + let val = jobOptionsObject[urlParam]; + + if (val.length === 0 || Object.keys(val).length === 0) { + setParsedHeaders(null); + setJsonError(null); + return; + } + + if (typeof val === "string") { + try { + val = JSON.parse(val); + } catch {} + } + + const finalVal = + typeof val === "string" ? val : val != null ? JSON.stringify(val) : ""; + + setValue(finalVal); + const parsed = validateAndParse(finalVal); + onChange(parsed); + }, [urlParam]); + + return ( + + } + sx={{ + backgroundColor: theme.palette.background.paper, + borderBottom: `1px solid ${theme.palette.divider}`, + "&.Mui-expanded": { + borderBottom: `1px solid ${theme.palette.divider}`, + }, + }} + > + + + {label} + + + + + + + {parsedHeaders && parsedHeaders.length > 0 && ( + + + + + + Header + Value + + + + {parsedHeaders.map(([key, val]) => ( + + {key} + {val} + + ))} + +
+
+
+ )} +
+
+ ); +}; diff --git a/src/components/common/expanded-table-input/index.ts b/src/components/common/expanded-table-input/index.ts new file mode 100644 index 0000000..4a837c5 --- /dev/null +++ b/src/components/common/expanded-table-input/index.ts @@ -0,0 +1 @@ +export * from "./expanded-table-input"; diff --git a/src/components/submit/job-submitter/job-submitter.tsx b/src/components/submit/job-submitter/job-submitter.tsx index d1a051a..62ab126 100644 --- a/src/components/submit/job-submitter/job-submitter.tsx +++ b/src/components/submit/job-submitter/job-submitter.tsx @@ -10,12 +10,14 @@ import { JobSubmitterInput } from "./job-submitter-input"; import { JobSubmitterOptions } from "./job-submitter-options"; import { ApiService } from "@/services"; import { useJobSubmitterProvider } from "./provider"; +import { AdvancedJobOptions } from "@/components/common/advanced-job-options"; const initialJobOptions: RawJobOptions = { multi_page_scrape: false, custom_headers: null, proxies: null, collect_media: false, + custom_cookies: null, }; export const JobSubmitter = () => { @@ -38,12 +40,8 @@ export const JobSubmitter = () => { const [loading, setLoading] = useState(false); const [jobOptions, setJobOptions] = useState(initialJobOptions); - const [customJSONSelected, setCustomJSONSelected] = useState(false); - const [proxiesSelected, setProxiesSelected] = useState(false); - const handleSelectProxies = () => { - setProxiesSelected(!proxiesSelected); - }; + console.log(jobOptions); const handleSubmit = async () => { if (!validateURL(submittedURL)) { @@ -57,12 +55,13 @@ export const JobSubmitter = () => { setLoading(true); let customHeaders; + let customCookies; try { - customHeaders = jobOptions.custom_headers - ? 
JSON.parse(jobOptions.custom_headers) - : null; - } catch (error) { + customHeaders = jobOptions.custom_headers || null; + customCookies = jobOptions.custom_cookies || null; + } catch (error: any) { + console.error(error); setSnackbarMessage("Invalid JSON in custom headers."); setSnackbarOpen(true); setSnackbarSeverity("error"); @@ -76,6 +75,7 @@ export const JobSubmitter = () => { user, jobOptions, customHeaders, + customCookies, siteMap ) .then(async (response) => { @@ -102,16 +102,9 @@ export const JobSubmitter = () => { .finally(() => setLoading(false)); }; - // Parse the job options from the query string useEffect(() => { if (job_options) { - parseJobOptions( - job_options as string, - setCustomJSONSelected, - setProxiesSelected, - setJobOptions, - setSiteMap - ); + parseJobOptions(job_options as string, setJobOptions, setSiteMap); } }, [job_options]); @@ -123,13 +116,9 @@ export const JobSubmitter = () => { handleSubmit={handleSubmit} loading={loading} /> - ); diff --git a/src/lib/helpers/parse-job-options.ts b/src/lib/helpers/parse-job-options.ts index 106b780..5b17a1d 100644 --- a/src/lib/helpers/parse-job-options.ts +++ b/src/lib/helpers/parse-job-options.ts @@ -4,10 +4,8 @@ import { RawJobOptions, SiteMap } from "@/types"; export const parseJobOptions = ( job_options: string, - setCustomJSONSelected: Dispatch>, - setProxiesSelected: Dispatch>, setJobOptions: Dispatch>, - setSiteMap: Dispatch> + setSiteMap: Dispatch> ) => { if (job_options) { const jsonOptions = JSON.parse(job_options as string); @@ -16,20 +14,23 @@ export const parseJobOptions = ( custom_headers: null, proxies: null, collect_media: false, + custom_cookies: null, }; if ( jsonOptions.custom_headers && Object.keys(jsonOptions.custom_headers).length ) { - setCustomJSONSelected(true); - newJobOptions.custom_headers = JSON.stringify(jsonOptions.custom_headers); + newJobOptions.custom_headers = jsonOptions.custom_headers; + } + + if (jsonOptions.custom_cookies && 
jsonOptions.custom_cookies.length > 0) { + newJobOptions.custom_cookies = jsonOptions.custom_cookies; } newJobOptions.multi_page_scrape = jsonOptions.multi_page_scrape; if (jsonOptions.proxies.length > 0) { - setProxiesSelected(true); newJobOptions.proxies = jsonOptions.proxies.join(","); } diff --git a/src/lib/helpers/parse-json-to-entries.ts b/src/lib/helpers/parse-json-to-entries.ts new file mode 100644 index 0000000..367350e --- /dev/null +++ b/src/lib/helpers/parse-json-to-entries.ts @@ -0,0 +1,37 @@ +export const parseJsonToEntries = (json: string): [string, string][] | null => { + try { + const parsed = JSON.parse(json); + + if (Array.isArray(parsed)) { + if ( + parsed.length > 0 && + Array.isArray(parsed[0]) && + parsed[0].length === 2 && + typeof parsed[0][0] === "string" + ) { + // Already array of [key, val] tuples + // Just ensure values are strings + return parsed.map(([k, v]) => [k, String(v)]); + } + + // Array of objects + const allEntries: [string, string][] = []; + for (const item of parsed) { + if (typeof item === "object" && item !== null) { + allEntries.push( + // @ts-ignore + ...Object.entries(item).map(([k, v]) => [k, String(v)]) + ); + } else { + return null; + } + } + return allEntries.length > 0 ? 
allEntries : null; + } else if (typeof parsed === "object" && parsed !== null) { + return Object.entries(parsed).map(([k, v]) => [k, String(v)]); + } + return null; + } catch { + return null; + } +}; diff --git a/src/services/api-service/functions/submit-job.ts b/src/services/api-service/functions/submit-job.ts index 699f515..9b0a967 100644 --- a/src/services/api-service/functions/submit-job.ts +++ b/src/services/api-service/functions/submit-job.ts @@ -6,6 +6,7 @@ export const submitJob = async ( user: any, jobOptions: any, customHeaders: any, + customCookies: any, siteMap: SiteMap | null ) => { return await fetch(`/api/submit-scrape-job`, { @@ -23,6 +24,7 @@ export const submitJob = async ( custom_headers: customHeaders || {}, proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [], site_map: siteMap, + custom_cookies: customCookies || [], }, }, }), diff --git a/src/styles/themes.ts b/src/styles/themes.ts index f314084..9b961b0 100644 --- a/src/styles/themes.ts +++ b/src/styles/themes.ts @@ -70,6 +70,16 @@ const commonThemeOptions = { }, }, }, + MuiCheckbox: { + styleOverrides: { + colorPrimary: { + color: "#1976d2", + "&.Mui-checked": { + color: "#034efc", + }, + }, + }, + }, MuiPaper: { styleOverrides: { root: { @@ -85,6 +95,7 @@ const lightTheme = createTheme({ mode: "light", primary: { main: "#1976d2", + contrastText: "#000000", }, secondary: { main: "#dc004e", @@ -139,6 +150,7 @@ const darkTheme = createTheme({ mode: "dark", primary: { main: "#90caf9", + contrastText: "#fff", }, secondary: { main: "#f48fb1", diff --git a/src/types/job.ts b/src/types/job.ts index f5430df..9433500 100644 --- a/src/types/job.ts +++ b/src/types/job.ts @@ -24,6 +24,7 @@ export type RawJobOptions = { custom_headers: string | null; proxies: string | null; collect_media: boolean; + custom_cookies: string | null; }; export type ActionOption = "click" | "input";