Feat: Site Mapping (#46)
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled

* wip: add site mapping

* chore: cleanup
This commit is contained in:
Jayden Pyles
2024-11-16 20:55:23 -06:00
committed by GitHub
parent 3a0762f1e3
commit 7d80ff5c7f
35 changed files with 853 additions and 349 deletions

View File

@@ -0,0 +1,19 @@
from .job import (
query,
insert,
update_job,
delete_jobs,
get_jobs_per_day,
get_queued_job,
average_elements_per_link,
)
# Public API of this package: exactly the names re-exported from `.job` above.
__all__ = [
    "query",
    "insert",
    "update_job",
    "delete_jobs",
    "get_jobs_per_day",
    "get_queued_job",
    "average_elements_per_link",
]

View File

@@ -6,8 +6,8 @@ from typing import Any, Optional
from pymongo import DESCENDING
# LOCAL
from api.backend.models import FetchOptions
from api.backend.database import get_job_collection
from api.backend.job.models.job_options import FetchOptions
LOG = logging.getLogger(__name__)

View File

View File

@@ -0,0 +1,14 @@
from pydantic import BaseModel
from typing import Any, Optional
from api.backend.job.models.site_map import SiteMap
class FetchOptions(BaseModel):
    """Optional flags supplied with a fetch request."""

    # NOTE(review): presumably toggles chat-related behaviour; the consumer is
    # not visible here -- confirm against the caller.
    chat: Optional[bool] = None
class JobOptions(BaseModel):
    """Per-job scraping options stored under a job's ``job_options`` key."""

    # Passed through to the scraper as its `multi_page_scrape` flag.
    multi_page_scrape: bool = False
    # Extra request headers for the scrape; empty when none were supplied.
    custom_headers: dict[str, Any] = {}
    # Proxy entries for the scrape; empty when none were supplied.
    proxies: list[str] = []
    # Optional sequence of browser actions to replay after the first page load.
    site_map: Optional[SiteMap] = None

View File

@@ -0,0 +1,14 @@
from pydantic import BaseModel
from typing import Literal
class Action(BaseModel):
    """A single browser interaction within a site map."""

    # Which handler runs: "click" clicks the element, "input" types into it.
    type: Literal["click", "input"]
    # XPath expression locating the target element.
    xpath: str
    # Label for the action. NOTE(review): no consumer visible here -- confirm use.
    name: str
    # Text sent to the element when `type` is "input".
    input: str = ""
    # When true, the action is dropped after one pass instead of being repeated.
    do_once: bool = True
class SiteMap(BaseModel):
    """An ordered list of browser actions to replay against a page."""

    # Actions are processed in list order.
    actions: list[Action]

View File

@@ -0,0 +1,30 @@
import time
from typing import cast
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def scrape_content(driver: webdriver.Chrome, pages: set[tuple[str, str]]) -> str:
    """Scroll the current page to the bottom and record its final HTML.

    Waits up to 10 seconds for ``<body>`` to be present, then scrolls to the
    bottom repeatedly until the document height stops growing, so lazily
    loaded content is rendered before the source is captured.

    Args:
        driver: Browser session already pointed at the page to scrape.
        pages: Accumulator of ``(page_source, url)`` tuples; the captured page
            is added to it in place.

    Returns:
        The page source that was added to ``pages``.
    """
    _ = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )

    # scrollHeight is a number, so the cast target is int (cast is a no-op at
    # runtime; this only corrects the static type).
    last_height = cast(int, driver.execute_script("return document.body.scrollHeight"))

    # NOTE(review): this loop has no upper bound -- a page that keeps growing
    # on every scroll never leaves it.
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Wait for the page to load

        new_height = cast(
            int, driver.execute_script("return document.body.scrollHeight")
        )
        if new_height == last_height:
            break
        last_height = new_height

    # Read the source once so the stored snapshot and the returned value are
    # guaranteed to be the same document, even if the DOM changes in between.
    page_source = driver.page_source
    pages.add((page_source, driver.current_url))
    return page_source

View File

View File

@@ -0,0 +1,94 @@
from api.backend.job.models.site_map import Action, SiteMap
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from typing import Any
import logging
import time
from copy import deepcopy
from api.backend.job.scraping.scraping_utils import scrape_content
from selenium.webdriver.support.ui import WebDriverWait
from seleniumwire.inspect import TimeoutException
from seleniumwire.webdriver import Chrome
from selenium.webdriver.support import expected_conditions as EC
LOG = logging.getLogger(__name__)
def clear_done_actions(site_map: dict[str, Any]):
    """Return a deep copy of ``site_map`` without its one-shot actions.

    Only actions whose ``do_once`` flag is falsy are kept. An action that
    omits ``do_once`` is treated as one-shot, matching the ``Action`` model's
    default of ``True``, instead of raising ``KeyError``.

    Args:
        site_map: Raw site-map dict with an ``"actions"`` list of action dicts.

    Returns:
        A new dict; the input and its action dicts are left untouched.
    """
    cleared_site_map = deepcopy(site_map)
    cleared_site_map["actions"] = [
        action
        for action in cleared_site_map["actions"]
        if not action.get("do_once", True)
    ]
    return cleared_site_map
def handle_input(action: Action, driver: webdriver.Chrome):
    """Type ``action.input`` into the element located by ``action.xpath``.

    Waits up to 10 seconds for the element to become clickable before sending
    the keys. Every failure is logged and reported through the return value
    rather than raised.

    Returns:
        ``True`` when the keys were sent, ``False`` on any failure.
    """
    try:
        locator = (By.XPATH, action.xpath)
        target = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(locator))
        # NOTE(review): the typed text is written to the log verbatim.
        LOG.info(f"Sending keys: {action.input} to element: {target}")
        target.send_keys(action.input)
    except NoSuchElementException:
        LOG.info(f"Element not found: {action.xpath}")
        return False
    except TimeoutException:
        LOG.info(f"Timeout waiting for element: {action.xpath}")
        return False
    except Exception as e:
        LOG.info(f"Error handling input: {e}")
        return False
    else:
        return True
def handle_click(action: Action, driver: webdriver.Chrome):
    """Click the element located by ``action.xpath``.

    The element is looked up immediately (no wait), so a missing element ends
    a repeated action promptly. Any WebDriver failure while locating or
    clicking (element covered, stale, not interactable, ...) is logged and
    reported as ``False`` instead of propagating and aborting the scrape,
    mirroring how ``handle_input`` reports its failures.

    Returns:
        ``True`` when the click was issued, ``False`` otherwise.
    """
    # Function-scope import keeps this block self-contained; selenium is
    # already a dependency of this module.
    from selenium.common.exceptions import WebDriverException

    try:
        element = driver.find_element(By.XPATH, action.xpath)
        LOG.info(f"Clicking element: {element}")
        element.click()
    except NoSuchElementException:
        LOG.info(f"Element not found: {action.xpath}")
        return False
    except WebDriverException as e:
        # Base class of selenium's errors: covers intercepted clicks, stale
        # references and non-interactable elements.
        LOG.info(f"Error handling click: {e}")
        return False

    return True
# Dispatch table from an action's `type` value to the function that performs
# it. Each handler takes (action, driver) and returns True on success.
ACTION_MAP = {
    "click": handle_click,
    "input": handle_input,
}
async def handle_site_mapping(
    site_map_dict: dict[str, Any],
    driver: Chrome,
    pages: set[tuple[str, str]],
):
    """Replay a site map's actions, scraping the page after each one.

    Each pass runs every action in order; after a successful action the page
    is given two seconds to react and is then scraped into ``pages``. When a
    pass completes, one-shot actions (``do_once``) are dropped and the
    remaining ones are replayed, until none are left or a handler fails
    (e.g. a "next page" button that no longer exists).

    Implemented as a loop rather than recursion so that long runs of repeated
    actions cannot exhaust the interpreter's recursion limit.

    Args:
        site_map_dict: Raw site map (``{"actions": [...]}``); not modified.
        driver: Browser session positioned on the page to act on.
        pages: Accumulator of ``(page_source, url)`` tuples, updated in place.
    """
    # Function-scope import keeps this block self-contained.
    import asyncio

    remaining = site_map_dict
    while True:
        site_map = SiteMap(**remaining)
        LOG.info(f"Handling site map: {site_map}")

        for action in site_map.actions:
            action_handler = ACTION_MAP[action.type]
            if not action_handler(action, driver):
                # A failed action ends the whole site map.
                return

            # Yield to the event loop while the page settles instead of
            # blocking it with time.sleep.
            await asyncio.sleep(2)
            _ = scrape_content(driver, pages)

        remaining = clear_done_actions(remaining)
        if not remaining["actions"]:
            return

View File

@@ -2,12 +2,14 @@
from typing import Any, Optional, Union
from datetime import datetime
# LOCAL
from api.backend.job.models.job_options import JobOptions
# PDM
import pydantic
class FetchOptions(pydantic.BaseModel):
chat: Optional[bool] = None
class Element(pydantic.BaseModel):
@@ -22,12 +24,6 @@ class CapturedElement(pydantic.BaseModel):
name: str
class JobOptions(pydantic.BaseModel):
multi_page_scrape: bool = False
custom_headers: Optional[dict[str, Any]] = {}
proxies: Optional[list[str]] = []
class RetrieveScrapeJobs(pydantic.BaseModel):
user: str

View File

@@ -12,22 +12,17 @@ from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse, StreamingResponse
# LOCAL
from api.backend.job import (
query,
insert,
update_job,
delete_jobs,
)
from api.backend.job import query, insert, update_job, delete_jobs
from api.backend.models import (
UpdateJobs,
DownloadJob,
FetchOptions,
DeleteScrapeJobs,
Job,
)
from api.backend.schemas import User
from api.backend.auth.auth_utils import get_current_user
from api.backend.utils import clean_text
from api.backend.job.models.job_options import FetchOptions
LOG = logging.getLogger(__name__)

View File

@@ -1,19 +1,20 @@
import logging
from typing import Any, Optional
import time
import random
from bs4 import BeautifulSoup
from lxml import etree
from seleniumwire import webdriver
from lxml.etree import _Element # type: ignore [reportPrivateImport]
from lxml.etree import _Element # pyright: ignore [reportPrivateUsage]
from fake_useragent import UserAgent
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options as ChromeOptions
from urllib.parse import urlparse, urljoin
from api.backend.models import Element, CapturedElement
from api.backend.job.site_mapping.site_mapping import (
handle_site_mapping,
)
from api.backend.job.scraping.scraping_utils import scrape_content
from api.backend.job.models.site_map import SiteMap
LOG = logging.getLogger(__name__)
@@ -95,6 +96,7 @@ async def make_site_request(
pages: set[tuple[str, str]] = set(),
original_url: str = "",
proxies: Optional[list[str]] = [],
site_map: Optional[dict[str, Any]] = None,
) -> None:
"""Make basic `GET` request to site using Selenium."""
# Check if URL has already been visited
@@ -114,27 +116,16 @@ async def make_site_request(
final_url = driver.current_url
visited_urls.add(url)
visited_urls.add(final_url)
_ = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "body"))
)
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
page_source = scrape_content(driver, pages)
time.sleep(3) # Wait for the page to load
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
final_height = driver.execute_script("return document.body.scrollHeight")
page_source = driver.page_source
LOG.debug(f"Page source for url: {url}\n{page_source}")
pages.add((page_source, final_url))
if site_map:
LOG.info("Site map: %s", site_map)
_ = await handle_site_mapping(
site_map,
driver,
pages,
)
finally:
driver.quit()
@@ -192,6 +183,7 @@ async def scrape(
headers: Optional[dict[str, Any]],
multi_page_scrape: bool = False,
proxies: Optional[list[str]] = [],
site_map: Optional[SiteMap] = None,
):
visited_urls: set[str] = set()
pages: set[tuple[str, str]] = set()
@@ -204,6 +196,7 @@ async def scrape(
pages=pages,
original_url=url,
proxies=proxies,
site_map=site_map,
)
elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()

View File

@@ -24,6 +24,7 @@ async def process_job():
job["job_options"]["custom_headers"],
job["job_options"]["multi_page_scrape"],
job["job_options"]["proxies"],
job["job_options"]["site_map"],
)
LOG.info(
f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"

View File

@@ -10,5 +10,8 @@ services:
- "$PWD/package-lock.json:/app/package-lock.json"
- "$PWD/tsconfig.json:/app/tsconfig.json"
scraperr_api:
environment:
- LOG_LEVEL=INFO
volumes:
- "$PWD/api:/project/api"
- "$PWD/scraping:/project/scraping"

View File

@@ -15,6 +15,7 @@ import {
Button,
Tooltip,
IconButton,
TableContainer,
} from "@mui/material";
import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
import StarIcon from "@mui/icons-material/Star";
@@ -52,145 +53,147 @@ export const JobQueue = ({
const router = useRouter();
return (
<Table sx={{ tableLayout: "fixed", width: "100%" }}>
<TableHead>
<TableRow>
<TableCell>Select</TableCell>
<TableCell>Id</TableCell>
<TableCell>Url</TableCell>
<TableCell>Elements</TableCell>
<TableCell>Result</TableCell>
<TableCell>Time Created</TableCell>
<TableCell>Status</TableCell>
<TableCell>Actions</TableCell>
</TableRow>
</TableHead>
<TableBody>
{filteredJobs.map((row, index) => (
<TableRow key={index}>
<TableCell padding="checkbox">
<Checkbox
checked={selectedJobs.has(row.id)}
onChange={() => onSelectJob(row.id)}
/>
<Tooltip title="Chat with AI">
<span>
<IconButton
onClick={() => {
router.push({
pathname: "/chat",
query: {
job: row.id,
},
});
}}
>
<AutoAwesome />
</IconButton>
</span>
</Tooltip>
<Tooltip title="Favorite Job">
<span>
<IconButton
color={row.favorite ? "warning" : "default"}
onClick={() => {
onFavorite([row.id], "favorite", !row.favorite);
row.favorite = !row.favorite;
}}
>
<StarIcon />
</IconButton>
</span>
</Tooltip>
</TableCell>
<TableCell sx={{ maxWidth: 100, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.id}</Box>
</TableCell>
<TableCell sx={{ maxWidth: 200, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.url}</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
{JSON.stringify(row.elements)}
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto", padding: 0 }}>
<Accordion sx={{ margin: 0, padding: 0.5 }}>
<AccordionSummary
expandIcon={<ExpandMoreIcon />}
aria-controls="panel1a-content"
id="panel1a-header"
sx={{
minHeight: 0,
"&.Mui-expanded": { minHeight: 0 },
}}
>
<Box
sx={{
maxHeight: 150,
overflow: "auto",
width: "100%",
}}
>
<Typography sx={{ fontSize: "0.875rem" }}>
Show Result
</Typography>
</Box>
</AccordionSummary>
<AccordionDetails sx={{ padding: 1 }}>
<Box sx={{ maxHeight: 200, overflow: "auto" }}>
<Typography
sx={{
fontSize: "0.875rem",
whiteSpace: "pre-wrap",
<TableContainer component={Box} sx={{ maxHeight: "90dvh" }}>
<Table sx={{ tableLayout: "fixed", width: "100%" }}>
<TableHead>
<TableRow>
<TableCell>Select</TableCell>
<TableCell>Id</TableCell>
<TableCell>Url</TableCell>
<TableCell>Elements</TableCell>
<TableCell>Result</TableCell>
<TableCell>Time Created</TableCell>
<TableCell>Status</TableCell>
<TableCell>Actions</TableCell>
</TableRow>
</TableHead>
<TableBody sx={{ overflow: "auto" }}>
{filteredJobs.map((row, index) => (
<TableRow key={index}>
<TableCell padding="checkbox">
<Checkbox
checked={selectedJobs.has(row.id)}
onChange={() => onSelectJob(row.id)}
/>
<Tooltip title="Chat with AI">
<span>
<IconButton
onClick={() => {
router.push({
pathname: "/chat",
query: {
job: row.id,
},
});
}}
>
{JSON.stringify(row.result, null, 2)}
</Typography>
</Box>
</AccordionDetails>
</Accordion>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
{new Date(row.time_created).toLocaleString()}
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 50, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
<Box
className="rounded-md p-2 text-center"
sx={{ bgcolor: colors[row.status] }}
>
{row.status}
<AutoAwesome />
</IconButton>
</span>
</Tooltip>
<Tooltip title="Favorite Job">
<span>
<IconButton
color={row.favorite ? "warning" : "default"}
onClick={() => {
onFavorite([row.id], "favorite", !row.favorite);
row.favorite = !row.favorite;
}}
>
<StarIcon />
</IconButton>
</span>
</Tooltip>
</TableCell>
<TableCell sx={{ maxWidth: 100, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.id}</Box>
</TableCell>
<TableCell sx={{ maxWidth: 200, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.url}</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
{JSON.stringify(row.elements)}
</Box>
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ display: "flex", gap: 1 }}>
<Button
onClick={() => {
onDownload([row.id]);
}}
size="small"
sx={{ minWidth: 0, padding: "4px 8px" }}
>
Download
</Button>
<Button
onClick={() =>
onNavigate(row.elements, row.url, row.job_options)
}
size="small"
sx={{ minWidth: 0, padding: "4px 8px" }}
>
Rerun
</Button>
</Box>
</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto", padding: 0 }}>
<Accordion sx={{ margin: 0, padding: 0.5 }}>
<AccordionSummary
expandIcon={<ExpandMoreIcon />}
aria-controls="panel1a-content"
id="panel1a-header"
sx={{
minHeight: 0,
"&.Mui-expanded": { minHeight: 0 },
}}
>
<Box
sx={{
maxHeight: 150,
overflow: "auto",
width: "100%",
}}
>
<Typography sx={{ fontSize: "0.875rem" }}>
Show Result
</Typography>
</Box>
</AccordionSummary>
<AccordionDetails sx={{ padding: 1 }}>
<Box sx={{ maxHeight: 200, overflow: "auto" }}>
<Typography
sx={{
fontSize: "0.875rem",
whiteSpace: "pre-wrap",
}}
>
{JSON.stringify(row.result, null, 2)}
</Typography>
</Box>
</AccordionDetails>
</Accordion>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
{new Date(row.time_created).toLocaleString()}
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 50, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
<Box
className="rounded-md p-2 text-center"
sx={{ bgcolor: colors[row.status] }}
>
{row.status}
</Box>
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ display: "flex", gap: 1 }}>
<Button
onClick={() => {
onDownload([row.id]);
}}
size="small"
sx={{ minWidth: 0, padding: "4px 8px" }}
>
Download
</Button>
<Button
onClick={() =>
onNavigate(row.elements, row.url, row.job_options)
}
size="small"
sx={{ minWidth: 0, padding: "4px 8px" }}
>
Rerun
</Button>
</Box>
</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</TableContainer>
);
};

View File

@@ -0,0 +1,107 @@
"use client";
import React, { useState, useEffect, useRef } from "react";
import { Button, Container, Box, Snackbar, Alert } from "@mui/material";
import { useRouter } from "next/router";
import { Element, Result } from "@/types";
import { ElementTable, JobSubmitter } from "@/components/submit/job-submitter";
import { useJobSubmitterProvider } from "@/components/submit/job-submitter/provider";
/**
 * Landing page body: the job submitter, the element table (once a URL has
 * been entered) and a snackbar for submit feedback. All state comes from the
 * job-submitter context provider.
 */
export const Home = () => {
  const {
    submittedURL,
    setSubmittedURL,
    rows,
    setRows,
    results,
    snackbarOpen,
    setSnackbarOpen,
    snackbarMessage,
    snackbarSeverity,
  } = useJobSubmitterProvider();

  const router = useRouter();
  const { elements, url } = router.query;

  // NOTE(review): this ref is never attached to an element in this component,
  // so the scroll effect below currently never scrolls.
  const resultsRef = useRef<HTMLTableElement | null>(null);

  // Pre-fill the form from the query string (used when re-running a job).
  useEffect(() => {
    if (elements) {
      try {
        setRows(JSON.parse(elements as string));
      } catch (error) {
        // A hand-edited or truncated URL must not crash the page.
        console.error("Ignoring malformed `elements` query parameter", error);
      }
    }
    if (url) {
      setSubmittedURL(url as string);
    }
  }, [elements, url]);

  useEffect(() => {
    if (results && resultsRef.current) {
      resultsRef.current.scrollIntoView({ behavior: "smooth" });
    }
  }, [results]);

  const handleCloseSnackbar = () => {
    setSnackbarOpen(false);
  };

  const goToJobs = () => {
    router.push("/jobs");
  };

  // Rendered as plain JSX rather than as components declared inside this
  // function: nested component types get a new identity on every render,
  // which makes React remount the Snackbar each time this component updates.
  const isInfo = snackbarSeverity === "info";
  const snackbar = (
    <Snackbar
      open={snackbarOpen}
      autoHideDuration={6000}
      onClose={handleCloseSnackbar}
    >
      <Alert
        onClose={handleCloseSnackbar}
        severity={isInfo ? "info" : "error"}
        action={
          isInfo ? (
            <Button color="inherit" size="small" onClick={goToJobs}>
              Go To Job
            </Button>
          ) : undefined
        }
      >
        {snackbarMessage}
      </Alert>
    </Snackbar>
  );

  return (
    <Box
      bgcolor="background.default"
      display="flex"
      flexDirection="column"
      justifyContent="center"
      alignItems="center"
      height="100%"
      py={4}
    >
      <Container maxWidth="lg" className="overflow-y-auto max-h-full">
        <JobSubmitter />
        {submittedURL.length ? (
          <ElementTable
            rows={rows}
            setRows={setRows}
            submittedURL={submittedURL}
          />
        ) : null}
      </Container>
      {snackbar}
    </Box>
  );
};

View File

@@ -0,0 +1 @@
export * from "./home";

View File

@@ -1,2 +1 @@
export * from "./ElementTable";
export * from "./job-submitter";

View File

@@ -15,9 +15,11 @@ import {
IconButton,
Tooltip,
useTheme,
Divider,
} from "@mui/material";
import AddIcon from "@mui/icons-material/Add";
import { Element } from "../../types";
import { Element } from "@/types";
import { SiteMap } from "../site-map";
interface Props {
rows: Element[];
@@ -169,6 +171,13 @@ export const ElementTable = ({ rows, setRows, submittedURL }: Props) => {
</div>
</TableContainer>
</Box>
<Divider
sx={{
borderColor: theme.palette.mode === "dark" ? "#ffffff" : "0000000",
marginBottom: 2,
}}
/>
<SiteMap />
</Box>
);
};

View File

@@ -0,0 +1 @@
export { ElementTable } from "./element-table";

View File

@@ -1 +1,2 @@
export { JobSubmitter } from "./job-submitter";
export { ElementTable } from "./element-table";

View File

@@ -1,26 +1,20 @@
import React, { Dispatch } from "react";
import React from "react";
import { TextField, Button, CircularProgress } from "@mui/material";
import { Element } from "@/types";
import { useJobSubmitterProvider } from "../provider";
export type JobSubmitterInputProps = {
submittedURL: string;
setSubmittedURL: Dispatch<React.SetStateAction<string>>;
isValidURL: boolean;
urlError: string | null;
handleSubmit: () => void;
loading: boolean;
rows: Element[];
};
export const JobSubmitterInput = ({
submittedURL,
setSubmittedURL,
isValidURL,
urlError,
handleSubmit,
loading,
rows,
urlError,
}: JobSubmitterInputProps) => {
const { submittedURL, setSubmittedURL, isValidURL, rows } =
useJobSubmitterProvider();
return (
<div className="flex flex-row space-x-4 items-center mb-2">
<TextField

View File

@@ -1,6 +1,7 @@
import { RawJobOptions } from "@/types/job";
import { Box, FormControlLabel, Checkbox, TextField } from "@mui/material";
import { Dispatch, SetStateAction } from "react";
import { useJobSubmitterProvider } from "../provider";
export type JobSubmitterOptionsProps = {
jobOptions: RawJobOptions;
@@ -14,9 +15,9 @@ export type JobSubmitterOptionsProps = {
export const JobSubmitterOptions = ({
jobOptions,
setJobOptions,
handleSelectProxies,
customJSONSelected,
setCustomJSONSelected,
handleSelectProxies,
proxiesSelected,
}: JobSubmitterOptionsProps) => {
const handleMultiPageScrapeChange = () => {

View File

@@ -1,7 +1,6 @@
"use client";
import React, { useEffect, useState, Dispatch } from "react";
import { Element } from "@/types";
import React, { useEffect, useState } from "react";
import { useAuth } from "@/contexts/AuthContext";
import { useRouter } from "next/router";
import { RawJobOptions } from "@/types/job";
@@ -10,21 +9,7 @@ import { JobSubmitterHeader } from "./job-submitter-header";
import { JobSubmitterInput } from "./job-submitter-input";
import { JobSubmitterOptions } from "./job-submitter-options";
import { ApiService } from "@/services";
interface StateProps {
submittedURL: string;
setSubmittedURL: Dispatch<React.SetStateAction<string>>;
rows: Element[];
isValidURL: boolean;
setIsValidUrl: Dispatch<React.SetStateAction<boolean>>;
setSnackbarMessage: Dispatch<React.SetStateAction<string>>;
setSnackbarOpen: Dispatch<React.SetStateAction<boolean>>;
setSnackbarSeverity: Dispatch<React.SetStateAction<string>>;
}
interface Props {
stateProps: StateProps;
}
import { useJobSubmitterProvider } from "./provider";
const initialJobOptions: RawJobOptions = {
multi_page_scrape: false,
@@ -32,7 +17,7 @@ const initialJobOptions: RawJobOptions = {
proxies: null,
};
export const JobSubmitter = ({ stateProps }: Props) => {
export const JobSubmitter = () => {
const { user } = useAuth();
const router = useRouter();
const { job_options } = router.query;
@@ -40,11 +25,13 @@ export const JobSubmitter = ({ stateProps }: Props) => {
const {
submittedURL,
rows,
siteMap,
setIsValidUrl,
setSnackbarMessage,
setSnackbarOpen,
setSnackbarSeverity,
} = stateProps;
setSiteMap,
} = useJobSubmitterProvider();
const [urlError, setUrlError] = useState<string | null>(null);
const [loading, setLoading] = useState<boolean>(false);
@@ -87,7 +74,8 @@ export const JobSubmitter = ({ stateProps }: Props) => {
rows,
user,
jobOptions,
customHeaders
customHeaders,
siteMap
)
.then(async (response) => {
if (!response.ok) {
@@ -120,31 +108,28 @@ export const JobSubmitter = ({ stateProps }: Props) => {
job_options as string,
setCustomJSONSelected,
setProxiesSelected,
setJobOptions
setJobOptions,
setSiteMap
);
}
}, [job_options]);
return (
<>
<div>
<JobSubmitterHeader />
<JobSubmitterInput
{...stateProps}
urlError={urlError}
handleSubmit={handleSubmit}
loading={loading}
/>
<JobSubmitterOptions
{...stateProps}
jobOptions={jobOptions}
setJobOptions={setJobOptions}
customJSONSelected={customJSONSelected}
setCustomJSONSelected={setCustomJSONSelected}
handleSelectProxies={handleSelectProxies}
proxiesSelected={proxiesSelected}
/>
</div>
</>
<div>
<JobSubmitterHeader />
<JobSubmitterInput
urlError={urlError}
handleSubmit={handleSubmit}
loading={loading}
/>
<JobSubmitterOptions
jobOptions={jobOptions}
setJobOptions={setJobOptions}
customJSONSelected={customJSONSelected}
setCustomJSONSelected={setCustomJSONSelected}
handleSelectProxies={handleSelectProxies}
proxiesSelected={proxiesSelected}
/>
</div>
);
};

View File

@@ -0,0 +1,84 @@
import React, {
createContext,
PropsWithChildren,
useContext,
useState,
Dispatch,
useMemo,
} from "react";
import { Element, Result, SiteMap } from "@/types";
/**
 * Shape of the value shared through the job-submitter context: each piece of
 * state is paired with its React state setter.
 */
type JobSubmitterProviderType = {
  // URL entered in the submit form.
  submittedURL: string;
  setSubmittedURL: Dispatch<React.SetStateAction<string>>;
  // Elements to capture, as edited in the element table.
  rows: Element[];
  setRows: Dispatch<React.SetStateAction<Element[]>>;
  results: Result;
  setResults: Dispatch<React.SetStateAction<Result>>;
  // Snackbar visibility, text and severity ("info" or "error" in the UI).
  snackbarOpen: boolean;
  setSnackbarOpen: Dispatch<React.SetStateAction<boolean>>;
  snackbarMessage: string;
  setSnackbarMessage: Dispatch<React.SetStateAction<string>>;
  snackbarSeverity: string;
  setSnackbarSeverity: Dispatch<React.SetStateAction<string>>;
  isValidURL: boolean;
  setIsValidUrl: Dispatch<React.SetStateAction<boolean>>;
  // Site map being built; null when none has been created.
  siteMap: SiteMap | null;
  setSiteMap: Dispatch<React.SetStateAction<SiteMap | null>>;
};
// Context object backing the provider/hook pair below. The default is an
// empty object cast to the context type, so consumers rendered outside
// <Provider> receive undefined fields rather than an error.
const JobSubmitterProvider = createContext<JobSubmitterProviderType>(
  {} as JobSubmitterProviderType
);
/**
 * Owns all job-submitter state and exposes it to descendants through the
 * job-submitter context.
 */
export const Provider = ({ children }: PropsWithChildren) => {
  const [submittedURL, setSubmittedURL] = useState<string>("");
  const [rows, setRows] = useState<Element[]>([]);
  const [results, setResults] = useState<Result>({});
  const [snackbarOpen, setSnackbarOpen] = useState<boolean>(false);
  const [snackbarMessage, setSnackbarMessage] = useState<string>("");
  const [snackbarSeverity, setSnackbarSeverity] = useState<string>("error");
  const [isValidURL, setIsValidUrl] = useState<boolean>(true);
  const [siteMap, setSiteMap] = useState<SiteMap | null>(null);

  // Memoised so the context value only changes identity when a state value
  // changes; only the state values (not the setters) are listed as
  // dependencies.
  const value: JobSubmitterProviderType = useMemo(
    () => ({
      submittedURL,
      setSubmittedURL,
      rows,
      setRows,
      results,
      setResults,
      snackbarOpen,
      setSnackbarOpen,
      snackbarMessage,
      setSnackbarMessage,
      snackbarSeverity,
      setSnackbarSeverity,
      isValidURL,
      setIsValidUrl,
      siteMap,
      setSiteMap,
    }),
    [
      submittedURL,
      rows,
      results,
      snackbarOpen,
      snackbarMessage,
      snackbarSeverity,
      isValidURL,
      siteMap,
    ]
  );

  return (
    <JobSubmitterProvider.Provider value={value}>
      {children}
    </JobSubmitterProvider.Provider>
  );
};
/** Hook returning the current job-submitter context value. */
export const useJobSubmitterProvider = () => {
  return useContext(JobSubmitterProvider);
};

View File

@@ -0,0 +1 @@
export * from "./site-map";

View File

@@ -0,0 +1 @@
export * from "./site-map-input";

View File

@@ -0,0 +1,21 @@
/* Base styling shared by the Add and Delete buttons of the site-map input. */
.button {
  height: 3rem;
  width: 2rem;
  color: #ffffff;
  font-weight: 600;
  border-radius: 0.375rem;
  transition: transform 0.2s ease-in-out;
  transform: scale(1);

  /* Nested selector. NOTE(review): relies on CSS nesting being supported by
     the build pipeline or target browsers -- confirm. */
  &:hover {
    transform: scale(1.05);
  }
}

/* Delete button colours, taken from the --delete-red custom properties.
   NOTE(review): !important is presumably needed to override the component
   library's own button styles -- confirm. The input component also
   references an `add` class, which has no rule in this file. */
.remove {
  background-color: var(--delete-red) !important;
}

.remove:hover {
  background-color: var(--delete-red-hover) !important;
}

View File

@@ -0,0 +1,135 @@
import { useState } from "react";
import { useJobSubmitterProvider } from "../../provider";
import {
MenuItem,
Select,
TextField,
FormControl,
Button,
Checkbox,
FormControlLabel,
} from "@mui/material";
import { ActionOption } from "@/types/job";
import classes from "./site-map-input.module.css";
import { clsx } from "clsx";
/**
 * Props for SiteMapInput. All are optional: with none given the component
 * is the editable "add action" row; with `disabled` it displays an existing
 * action read-only with a Delete button.
 */
export type SiteMapInputProps = {
  // Locks every field and swaps the Add button for Delete.
  disabled?: boolean;
  // Initial XPath selector.
  xpath?: string;
  // Initial action type; "click" is used when omitted.
  option?: ActionOption;
  // Initial value of the "Do Once" checkbox (stored as `do_once`).
  clickOnce?: boolean;
  // Initial text for "input" actions.
  input?: string;
};
/**
 * One row of the site-map editor.
 *
 * Without `disabled` it is the form used to add a new action to the site map
 * held in the job-submitter context. With `disabled` it shows an existing
 * action (passed in through the props) read-only, with a Delete button that
 * removes that action.
 */
export const SiteMapInput = ({
  disabled,
  xpath,
  option,
  clickOnce,
  input,
}: SiteMapInputProps) => {
  const [optionState, setOptionState] = useState<ActionOption>(
    option || "click"
  );
  const [xpathState, setXpathState] = useState<string>(xpath || "");
  const [clickOnceState, setClickOnceState] = useState<boolean>(
    clickOnce || false
  );
  const [inputState, setInputState] = useState<string>(input || "");

  const { siteMap, setSiteMap } = useJobSubmitterProvider();

  // Adds the action described by the form to the front of the list; the
  // list component is responsible for the display order.
  const handleAdd = () => {
    if (!siteMap) return;

    setSiteMap((prevSiteMap) => ({
      ...prevSiteMap,
      actions: [
        {
          type: optionState,
          xpath: xpathState,
          name: "",
          do_once: clickOnceState,
          input: inputState,
        },
        ...(prevSiteMap?.actions || []),
      ],
    }));

    setXpathState("");
  };

  // Removes the action this row displays. The row is matched by its field
  // values, so the result does not depend on how the list is ordered and a
  // row's Delete button never removes a different action.
  const handleRemove = () => {
    if (!siteMap) return;

    setSiteMap((prevSiteMap) => {
      const actions = prevSiteMap?.actions || [];
      const target = actions.findIndex(
        (action) =>
          action.xpath === (xpath || "") &&
          action.type === (option || "click") &&
          (action.input || "") === (input || "") &&
          Boolean(action.do_once) === Boolean(clickOnce)
      );

      return {
        ...prevSiteMap,
        actions:
          target === -1
            ? actions
            : actions.filter((_, index) => index !== target),
      };
    });
  };

  return (
    <div className="flex flex-col gap-2 w-full">
      <div className="flex gap-2 items-center">
        <FormControl className="w-1/4">
          <Select
            disabled={disabled}
            displayEmpty
            value={optionState}
            onChange={(e) => setOptionState(e.target.value as ActionOption)}
          >
            <MenuItem value="click">Click</MenuItem>
            <MenuItem value="input">Input</MenuItem>
          </Select>
        </FormControl>
        {optionState === "input" && (
          <TextField
            label="Input Text"
            fullWidth
            value={inputState}
            onChange={(e) => setInputState(e.target.value)}
            disabled={disabled}
          />
        )}
        <TextField
          label="XPath Selector"
          fullWidth
          value={xpathState}
          onChange={(e) => setXpathState(e.target.value)}
          disabled={disabled}
        />
        {disabled ? (
          <Button
            onClick={handleRemove}
            className={clsx(classes.button, classes.remove)}
          >
            Delete
          </Button>
        ) : (
          <Button
            onClick={handleAdd}
            disabled={!xpathState}
            className={clsx(classes.button, classes.add)}
          >
            Add
          </Button>
        )}
      </div>
      {!disabled && (
        <FormControlLabel
          label="Do Once"
          control={
            <Checkbox
              checked={clickOnceState}
              disabled={disabled}
              onChange={() => setClickOnceState(!clickOnceState)}
            />
          }
        />
      )}
    </div>
  );
};

View File

@@ -0,0 +1,70 @@
import { useEffect, useState } from "react";
import { useJobSubmitterProvider } from "../provider";
import { Button, Divider, Typography, useTheme } from "@mui/material";
import { SiteMapInput } from "./site-map-input";
/**
 * Site-map editor: a button to create or clear the site map, the "add
 * action" form, and the list of actions added so far. The site map itself
 * lives in the job-submitter context.
 */
export const SiteMap = () => {
  const { siteMap, setSiteMap } = useJobSubmitterProvider();
  const [showSiteMap, setShowSiteMap] = useState<boolean>(false);
  const theme = useTheme();

  const handleCreateSiteMap = () => {
    setSiteMap({ actions: [] });
    setShowSiteMap(true);
  };

  const handleClearSiteMap = () => {
    setSiteMap(null);
    setShowSiteMap(false);
  };

  // Open the editor when a site map arrives from elsewhere (e.g. when a job
  // is re-run with stored options).
  useEffect(() => {
    if (siteMap) {
      setShowSiteMap(true);
    }
  }, [siteMap]);

  return (
    <div className="flex flex-col gap-4">
      {siteMap ? (
        <Button onClick={handleClearSiteMap}>Clear Site Map</Button>
      ) : (
        <Button onClick={handleCreateSiteMap}>Create Site Map</Button>
      )}
      {showSiteMap && (
        <div className="flex flex-col gap-4">
          <SiteMapInput />
          {siteMap?.actions && siteMap?.actions.length > 0 && (
            <>
              <Divider
                sx={{
                  // "#000000": the previous literal lacked the "#" and had
                  // seven digits, so it was not a valid colour.
                  borderColor:
                    theme.palette.mode === "dark" ? "#ffffff" : "#000000",
                }}
              />
              <Typography className="w-full text-center" variant="h5">
                Site Map Actions
              </Typography>
            </>
          )}
          <ul className="flex flex-col gap-4">
            {/*
              NOTE(review): Array.prototype.reverse() reverses the state array
              in place, so the stored (and later submitted) order flips on
              every render of this component. It is left as-is here because
              the add handler prepends new actions and the two must be changed
              together: append on add, and drop this reverse.
            */}
            {siteMap?.actions.reverse().map((action, index) => (
              <li
                // Position + type + xpath: the xpath alone is not unique when
                // two actions target the same element.
                key={`${index}-${action.type}-${action.xpath}`}
                className="flex w-full items-center"
              >
                <Typography variant="h6" className="w-[10%] mr-2">
                  Action {index + 1}:
                </Typography>
                <SiteMapInput
                  disabled
                  xpath={action.xpath}
                  option={action.type}
                  clickOnce={action.do_once}
                  input={action.input}
                />
              </li>
            ))}
          </ul>
        </div>
      )}
    </div>
  );
};

View File

@@ -1,15 +1,17 @@
import { Dispatch, SetStateAction } from "react";
import { RawJobOptions } from "@/types";
import { RawJobOptions, SiteMap } from "@/types";
export const parseJobOptions = (
job_options: string,
setCustomJSONSelected: Dispatch<SetStateAction<boolean>>,
setProxiesSelected: Dispatch<SetStateAction<boolean>>,
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>,
setSiteMap: Dispatch<SetStateAction<any>>
) => {
if (job_options) {
const jsonOptions = JSON.parse(job_options as string);
console.log(jsonOptions);
const newJobOptions: RawJobOptions = {
multi_page_scrape: false,
custom_headers: null,
@@ -31,6 +33,10 @@ export const parseJobOptions = (
newJobOptions.proxies = jsonOptions.proxies.join(",");
}
if (jsonOptions.site_map) {
setSiteMap(jsonOptions.site_map);
}
setJobOptions(newJobOptions);
}
};

View File

@@ -1,117 +1,10 @@
"use client";
import React, { useState, useEffect, useRef } from "react";
import { Button, Container, Box, Snackbar, Alert } from "@mui/material";
import { useRouter } from "next/router";
import { Element, Result } from "@/types";
import { ElementTable } from "@/components/submit";
import { JobSubmitter } from "@/components/submit/job-submitter";
const Home = () => {
const router = useRouter();
const { elements, url } = router.query;
const [submittedURL, setSubmittedURL] = useState<string>("");
const [rows, setRows] = useState<Element[]>([]);
const [results, setResults] = useState<Result>({});
const [snackbarOpen, setSnackbarOpen] = useState<boolean>(false);
const [snackbarMessage, setSnackbarMessage] = useState<string>("");
const [snackbarSeverity, setSnackbarSeverity] = useState<string>("error");
const [isValidURL, setIsValidUrl] = useState<boolean>(true);
const resultsRef = useRef<HTMLTableElement | null>(null);
useEffect(() => {
if (elements) {
setRows(JSON.parse(elements as string));
}
if (url) {
setSubmittedURL(url as string);
}
}, [elements, url]);
useEffect(() => {
if (results && resultsRef.current) {
resultsRef.current.scrollIntoView({ behavior: "smooth" });
}
}, [results]);
const handleCloseSnackbar = () => {
setSnackbarOpen(false);
};
const ErrorSnackbar = () => {
return (
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={handleCloseSnackbar}
>
<Alert onClose={handleCloseSnackbar} severity="error">
{snackbarMessage}
</Alert>
</Snackbar>
);
};
const NotifySnackbar = () => {
const goTo = () => {
router.push("/jobs");
};
const action = (
<Button color="inherit" size="small" onClick={goTo}>
Go To Job
</Button>
);
return (
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={handleCloseSnackbar}
>
<Alert onClose={handleCloseSnackbar} severity="info" action={action}>
{snackbarMessage}
</Alert>
</Snackbar>
);
};
import { Provider as JobSubmitterProvider } from "@/components/submit/job-submitter/provider";
import { Home } from "@/components/pages/home/home";
export default function Main() {
return (
<Box
bgcolor="background.default"
display="flex"
flexDirection="column"
justifyContent="center"
alignItems="center"
height="100%"
py={4}
>
<Container maxWidth="lg">
<JobSubmitter
stateProps={{
submittedURL,
setSubmittedURL,
rows,
isValidURL,
setIsValidUrl,
setSnackbarMessage,
setSnackbarOpen,
setSnackbarSeverity,
}}
/>
{submittedURL.length ? (
<ElementTable
rows={rows}
setRows={setRows}
submittedURL={submittedURL}
/>
) : null}
</Container>
{snackbarSeverity === "info" ? <NotifySnackbar /> : <ErrorSnackbar />}
</Box>
<JobSubmitterProvider>
<Home />
</JobSubmitterProvider>
);
};
export default Home;
}

View File

@@ -1,9 +1,12 @@
import { SiteMap } from "@/types/job";
export const submitJob = async (
submittedURL: string,
rows: any[],
user: any,
jobOptions: any,
customHeaders: any
customHeaders: any,
siteMap: SiteMap | null
) => {
return await fetch(`/api/submit-scrape-job`, {
method: "POST",
@@ -18,6 +21,7 @@ export const submitJob = async (
...jobOptions,
custom_headers: customHeaders || {},
proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
site_map: siteMap,
},
},
}),

View File

@@ -2,6 +2,11 @@
@tailwind components;
@tailwind utilities;
:root {
--delete-red: #ef4444;
--delete-red-hover: #ff6969;
}
#__next {
height: 100%;
}

View File

@@ -34,6 +34,12 @@ const commonThemeOptions = {
h4: {
fontWeight: 500,
},
h5: {
fontWeight: 500,
},
h6: {
fontWeight: 500,
},
body1: {
fontFamily: '"Schibsted Grotesk", sans-serif',
},
@@ -175,6 +181,9 @@ const darkTheme = createTheme({
h5: {
color: "#ffffff",
},
h6: {
color: "#ffffff",
},
body1: {
...commonThemeOptions.typography.body1,
color: "#ffffff",

View File

@@ -16,6 +16,7 @@ export type JobOptions = {
multi_page_scrape: boolean;
custom_headers: null | string;
proxies: string[];
site_map?: SiteMap;
};
export type RawJobOptions = {
@@ -23,3 +24,17 @@ export type RawJobOptions = {
custom_headers: string | null;
proxies: string | null;
};
// Kinds of site-map action: click an element, or type text into it.
export type ActionOption = "click" | "input";
// A single site-map step, mirroring the backend `Action` model.
export type Action = {
  type: ActionOption;
  // XPath selector of the target element.
  xpath: string;
  name: string;
  // Backend default is true when omitted.
  do_once?: boolean;
  // Text to type for "input" actions; backend default is "".
  input?: string;
};
// Ordered list of actions sent to the backend as `job_options.site_map`.
export type SiteMap = {
  actions: Action[];
};