Feat: Site Mapping (#46)
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled

* wip: add site mapping

* chore: cleanup
This commit is contained in:
Jayden Pyles
2024-11-16 20:55:23 -06:00
committed by GitHub
parent 3a0762f1e3
commit 7d80ff5c7f
35 changed files with 853 additions and 349 deletions

View File

@@ -0,0 +1,19 @@
from .job import (
query,
insert,
update_job,
delete_jobs,
get_jobs_per_day,
get_queued_job,
average_elements_per_link,
)
# Public API of this package: exactly the names imported from `.job` above.
__all__ = [
    "query",
    "insert",
    "update_job",
    "delete_jobs",
    "get_jobs_per_day",
    "get_queued_job",
    "average_elements_per_link",
]

View File

@@ -6,8 +6,8 @@ from typing import Any, Optional
from pymongo import DESCENDING from pymongo import DESCENDING
# LOCAL # LOCAL
from api.backend.models import FetchOptions
from api.backend.database import get_job_collection from api.backend.database import get_job_collection
from api.backend.job.models.job_options import FetchOptions
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)

View File

View File

@@ -0,0 +1,14 @@
from pydantic import BaseModel
from typing import Any, Optional
from api.backend.job.models.site_map import SiteMap
class FetchOptions(BaseModel):
    """Pydantic model carrying a single optional `chat` flag."""

    # Left as None when the caller does not supply a value.
    chat: Optional[bool] = None
class JobOptions(BaseModel):
    """Per-job scraping options; every field has a default."""

    # Off by default.
    multi_page_scrape: bool = False
    # Header name -> value mapping; empty by default.
    custom_headers: dict[str, Any] = {}
    # List of proxy strings; empty by default.
    proxies: list[str] = []
    # Optional scripted actions (see SiteMap); None means no site map.
    site_map: Optional[SiteMap] = None

View File

@@ -0,0 +1,14 @@
from pydantic import BaseModel
from typing import Literal
class Action(BaseModel):
    """A single scripted browser interaction within a site map."""

    # Kind of interaction; only "click" and "input" are accepted.
    type: Literal["click", "input"]
    # XPath expression locating the target element.
    xpath: str
    # Free-form label for the action.
    name: str
    # Text associated with an "input" action; empty by default.
    input: str = ""
    # Defaults to True when omitted.
    do_once: bool = True
class SiteMap(BaseModel):
    """An ordered list of `Action` entries."""

    # Required; there is no default.
    actions: list[Action]

View File

@@ -0,0 +1,30 @@
import time
from typing import cast
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def scrape_content(
    driver: webdriver.Chrome,
    pages: set[tuple[str, str]],
    *,
    timeout: float = 10,
    scroll_pause: float = 3,
) -> str:
    """Scroll the current page to the bottom and record its final HTML.

    Waits for the `<body>` element, then repeatedly scrolls to the bottom of
    the document until `document.body.scrollHeight` stops changing between
    two consecutive scrolls.

    Args:
        driver: Browser whose current page is scraped.
        pages: Collection that receives a `(page_source, current_url)` tuple
            for the fully scrolled page. Mutated in place.
        timeout: Seconds to wait for `<body>` to be present.
        scroll_pause: Seconds to sleep after each scroll so that lazily
            loaded content has time to render.

    Returns:
        The page source that was added to `pages`.
    """
    _ = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )

    # scrollHeight is a number in the browser; the cast only informs the
    # type checker and has no runtime effect.
    last_height = cast(int, driver.execute_script("return document.body.scrollHeight"))

    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)  # Wait for the page to load

        new_height = cast(
            int, driver.execute_script("return document.body.scrollHeight")
        )
        if new_height == last_height:
            break
        last_height = new_height

    # Read the source once so the returned value is exactly what was stored;
    # two separate reads could observe different DOM states.
    page_source = driver.page_source
    pages.add((page_source, driver.current_url))
    return page_source

View File

View File

@@ -0,0 +1,94 @@
from api.backend.job.models.site_map import Action, SiteMap
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from typing import Any
import logging
import time
from copy import deepcopy
from api.backend.job.scraping.scraping_utils import scrape_content
from selenium.webdriver.support.ui import WebDriverWait
from seleniumwire.inspect import TimeoutException
from seleniumwire.webdriver import Chrome
from selenium.webdriver.support import expected_conditions as EC
LOG = logging.getLogger(__name__)
def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of `site_map` without its one-shot actions.

    Actions whose `do_once` flag is truthy are dropped; actions with a falsy
    `do_once` are kept for the next pass. An action that omits `do_once` is
    treated as one-shot, matching the `Action` model's default of True.

    Args:
        site_map: Raw site-map dict with an "actions" list of action dicts.

    Returns:
        A deep copy of `site_map` whose "actions" list only holds repeatable
        actions. The input dict is left untouched.
    """
    cleared_site_map = deepcopy(site_map)
    cleared_site_map["actions"] = [
        action
        for action in cleared_site_map["actions"]
        # .get() avoids a KeyError for dicts that rely on the model default.
        if not action.get("do_once", True)
    ]
    return cleared_site_map
def handle_input(action: Action, driver: webdriver.Chrome) -> bool:
    """Type `action.input` into the element located by `action.xpath`.

    Waits up to 10 seconds for the element to become clickable before
    sending the keys. Every failure is logged and reported through the
    return value instead of being raised.

    Returns:
        True when the keys were sent, False on any error.
    """
    try:
        wait = WebDriverWait(driver, 10)
        target = wait.until(EC.element_to_be_clickable((By.XPATH, action.xpath)))
        LOG.info(f"Sending keys: {action.input} to element: {target}")
        target.send_keys(action.input)
    except NoSuchElementException:
        LOG.info(f"Element not found: {action.xpath}")
        return False
    except TimeoutException:
        LOG.info(f"Timeout waiting for element: {action.xpath}")
        return False
    except Exception as err:
        LOG.info(f"Error handling input: {err}")
        return False
    else:
        return True
def handle_click(action: Action, driver: webdriver.Chrome) -> bool:
    """Click the element located by `action.xpath`.

    The element is looked up without an explicit wait. Every failure is
    logged and reported through the return value instead of being raised,
    matching `handle_input`.

    Returns:
        True when the click was issued, False on any error.
    """
    try:
        element = driver.find_element(By.XPATH, action.xpath)
        LOG.info(f"Clicking element: {element}")
        element.click()
    except NoSuchElementException:
        LOG.info(f"Element not found: {action.xpath}")
        return False
    except Exception as e:
        # A click can fail for reasons other than a missing element (for
        # example an intercepted or stale element); treat those as a failed
        # action rather than letting them abort the whole job.
        LOG.info(f"Error handling click: {e}")
        return False

    return True
# Dispatch table from an action's `type` string to the handler that performs
# it. Handlers take (action, driver) and return True on success, False on
# failure.
ACTION_MAP = {
    "click": handle_click,
    "input": handle_input,
}
async def handle_site_mapping(
    site_map_dict: dict[str, Any],
    driver: Chrome,
    pages: set[tuple[str, str]],
):
    """Run a site map's actions repeatedly, scraping the page after each pass.

    Each pass executes every remaining action in order, pausing two seconds
    after each one, then scrapes the resulting page into `pages`. One-shot
    actions are removed after the pass. Processing stops as soon as any
    action handler fails or when no repeatable actions remain.

    Implemented as a loop rather than recursion so that a long run of passes
    (for example paging through many result pages) cannot exhaust the
    interpreter's recursion limit.

    Args:
        site_map_dict: Raw site-map dict; validated through `SiteMap` on
            every pass.
        driver: Browser on which the actions are performed.
        pages: Collection that receives the scraped pages. Mutated in place.
    """
    remaining = site_map_dict

    while True:
        site_map = SiteMap(**remaining)
        LOG.info(f"Handling site map: {site_map}")

        for action in site_map.actions:
            action_handler = ACTION_MAP[action.type]
            if not action_handler(action, driver):
                # A failed action ends the whole run; nothing is scraped for
                # this partially executed pass.
                return

            # NOTE(review): blocking sleep inside a coroutine; the Selenium
            # calls around it are blocking as well.
            time.sleep(2)

        _ = scrape_content(driver, pages)

        remaining = clear_done_actions(remaining)
        if not remaining["actions"]:
            return

View File

@@ -2,12 +2,14 @@
from typing import Any, Optional, Union from typing import Any, Optional, Union
from datetime import datetime from datetime import datetime
# LOCAL
from api.backend.job.models.job_options import JobOptions
# PDM # PDM
import pydantic import pydantic
class FetchOptions(pydantic.BaseModel):
chat: Optional[bool] = None
class Element(pydantic.BaseModel): class Element(pydantic.BaseModel):
@@ -22,12 +24,6 @@ class CapturedElement(pydantic.BaseModel):
name: str name: str
class JobOptions(pydantic.BaseModel):
multi_page_scrape: bool = False
custom_headers: Optional[dict[str, Any]] = {}
proxies: Optional[list[str]] = []
class RetrieveScrapeJobs(pydantic.BaseModel): class RetrieveScrapeJobs(pydantic.BaseModel):
user: str user: str

View File

@@ -12,22 +12,17 @@ from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse, StreamingResponse from fastapi.responses import JSONResponse, StreamingResponse
# LOCAL # LOCAL
from api.backend.job import ( from api.backend.job import query, insert, update_job, delete_jobs
query,
insert,
update_job,
delete_jobs,
)
from api.backend.models import ( from api.backend.models import (
UpdateJobs, UpdateJobs,
DownloadJob, DownloadJob,
FetchOptions,
DeleteScrapeJobs, DeleteScrapeJobs,
Job, Job,
) )
from api.backend.schemas import User from api.backend.schemas import User
from api.backend.auth.auth_utils import get_current_user from api.backend.auth.auth_utils import get_current_user
from api.backend.utils import clean_text from api.backend.utils import clean_text
from api.backend.job.models.job_options import FetchOptions
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)

View File

@@ -1,19 +1,20 @@
import logging import logging
from typing import Any, Optional from typing import Any, Optional
import time
import random import random
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from lxml import etree from lxml import etree
from seleniumwire import webdriver from seleniumwire import webdriver
from lxml.etree import _Element # type: ignore [reportPrivateImport] from lxml.etree import _Element # pyright: ignore [reportPrivateUsage]
from fake_useragent import UserAgent from fake_useragent import UserAgent
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options as ChromeOptions from selenium.webdriver.chrome.options import Options as ChromeOptions
from urllib.parse import urlparse, urljoin from urllib.parse import urlparse, urljoin
from api.backend.models import Element, CapturedElement from api.backend.models import Element, CapturedElement
from api.backend.job.site_mapping.site_mapping import (
handle_site_mapping,
)
from api.backend.job.scraping.scraping_utils import scrape_content
from api.backend.job.models.site_map import SiteMap
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
@@ -95,6 +96,7 @@ async def make_site_request(
pages: set[tuple[str, str]] = set(), pages: set[tuple[str, str]] = set(),
original_url: str = "", original_url: str = "",
proxies: Optional[list[str]] = [], proxies: Optional[list[str]] = [],
site_map: Optional[dict[str, Any]] = None,
) -> None: ) -> None:
"""Make basic `GET` request to site using Selenium.""" """Make basic `GET` request to site using Selenium."""
# Check if URL has already been visited # Check if URL has already been visited
@@ -114,27 +116,16 @@ async def make_site_request(
final_url = driver.current_url final_url = driver.current_url
visited_urls.add(url) visited_urls.add(url)
visited_urls.add(final_url) visited_urls.add(final_url)
_ = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "body"))
)
last_height = driver.execute_script("return document.body.scrollHeight") page_source = scrape_content(driver, pages)
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(3) # Wait for the page to load if site_map:
new_height = driver.execute_script("return document.body.scrollHeight") LOG.info("Site map: %s", site_map)
_ = await handle_site_mapping(
if new_height == last_height: site_map,
break driver,
pages,
last_height = new_height )
final_height = driver.execute_script("return document.body.scrollHeight")
page_source = driver.page_source
LOG.debug(f"Page source for url: {url}\n{page_source}")
pages.add((page_source, final_url))
finally: finally:
driver.quit() driver.quit()
@@ -192,6 +183,7 @@ async def scrape(
headers: Optional[dict[str, Any]], headers: Optional[dict[str, Any]],
multi_page_scrape: bool = False, multi_page_scrape: bool = False,
proxies: Optional[list[str]] = [], proxies: Optional[list[str]] = [],
site_map: Optional[SiteMap] = None,
): ):
visited_urls: set[str] = set() visited_urls: set[str] = set()
pages: set[tuple[str, str]] = set() pages: set[tuple[str, str]] = set()
@@ -204,6 +196,7 @@ async def scrape(
pages=pages, pages=pages,
original_url=url, original_url=url,
proxies=proxies, proxies=proxies,
site_map=site_map,
) )
elements: list[dict[str, dict[str, list[CapturedElement]]]] = list() elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()

View File

@@ -24,6 +24,7 @@ async def process_job():
job["job_options"]["custom_headers"], job["job_options"]["custom_headers"],
job["job_options"]["multi_page_scrape"], job["job_options"]["multi_page_scrape"],
job["job_options"]["proxies"], job["job_options"]["proxies"],
job["job_options"]["site_map"],
) )
LOG.info( LOG.info(
f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}" f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"

View File

@@ -10,5 +10,8 @@ services:
- "$PWD/package-lock.json:/app/package-lock.json" - "$PWD/package-lock.json:/app/package-lock.json"
- "$PWD/tsconfig.json:/app/tsconfig.json" - "$PWD/tsconfig.json:/app/tsconfig.json"
scraperr_api: scraperr_api:
environment:
- LOG_LEVEL=INFO
volumes: volumes:
- "$PWD/api:/project/api" - "$PWD/api:/project/api"
- "$PWD/scraping:/project/scraping"

View File

@@ -15,6 +15,7 @@ import {
Button, Button,
Tooltip, Tooltip,
IconButton, IconButton,
TableContainer,
} from "@mui/material"; } from "@mui/material";
import ExpandMoreIcon from "@mui/icons-material/ExpandMore"; import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
import StarIcon from "@mui/icons-material/Star"; import StarIcon from "@mui/icons-material/Star";
@@ -52,145 +53,147 @@ export const JobQueue = ({
const router = useRouter(); const router = useRouter();
return ( return (
<Table sx={{ tableLayout: "fixed", width: "100%" }}> <TableContainer component={Box} sx={{ maxHeight: "90dvh" }}>
<TableHead> <Table sx={{ tableLayout: "fixed", width: "100%" }}>
<TableRow> <TableHead>
<TableCell>Select</TableCell> <TableRow>
<TableCell>Id</TableCell> <TableCell>Select</TableCell>
<TableCell>Url</TableCell> <TableCell>Id</TableCell>
<TableCell>Elements</TableCell> <TableCell>Url</TableCell>
<TableCell>Result</TableCell> <TableCell>Elements</TableCell>
<TableCell>Time Created</TableCell> <TableCell>Result</TableCell>
<TableCell>Status</TableCell> <TableCell>Time Created</TableCell>
<TableCell>Actions</TableCell> <TableCell>Status</TableCell>
</TableRow> <TableCell>Actions</TableCell>
</TableHead> </TableRow>
<TableBody> </TableHead>
{filteredJobs.map((row, index) => ( <TableBody sx={{ overflow: "auto" }}>
<TableRow key={index}> {filteredJobs.map((row, index) => (
<TableCell padding="checkbox"> <TableRow key={index}>
<Checkbox <TableCell padding="checkbox">
checked={selectedJobs.has(row.id)} <Checkbox
onChange={() => onSelectJob(row.id)} checked={selectedJobs.has(row.id)}
/> onChange={() => onSelectJob(row.id)}
<Tooltip title="Chat with AI"> />
<span> <Tooltip title="Chat with AI">
<IconButton <span>
onClick={() => { <IconButton
router.push({ onClick={() => {
pathname: "/chat", router.push({
query: { pathname: "/chat",
job: row.id, query: {
}, job: row.id,
}); },
}} });
>
<AutoAwesome />
</IconButton>
</span>
</Tooltip>
<Tooltip title="Favorite Job">
<span>
<IconButton
color={row.favorite ? "warning" : "default"}
onClick={() => {
onFavorite([row.id], "favorite", !row.favorite);
row.favorite = !row.favorite;
}}
>
<StarIcon />
</IconButton>
</span>
</Tooltip>
</TableCell>
<TableCell sx={{ maxWidth: 100, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.id}</Box>
</TableCell>
<TableCell sx={{ maxWidth: 200, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.url}</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
{JSON.stringify(row.elements)}
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto", padding: 0 }}>
<Accordion sx={{ margin: 0, padding: 0.5 }}>
<AccordionSummary
expandIcon={<ExpandMoreIcon />}
aria-controls="panel1a-content"
id="panel1a-header"
sx={{
minHeight: 0,
"&.Mui-expanded": { minHeight: 0 },
}}
>
<Box
sx={{
maxHeight: 150,
overflow: "auto",
width: "100%",
}}
>
<Typography sx={{ fontSize: "0.875rem" }}>
Show Result
</Typography>
</Box>
</AccordionSummary>
<AccordionDetails sx={{ padding: 1 }}>
<Box sx={{ maxHeight: 200, overflow: "auto" }}>
<Typography
sx={{
fontSize: "0.875rem",
whiteSpace: "pre-wrap",
}} }}
> >
{JSON.stringify(row.result, null, 2)} <AutoAwesome />
</Typography> </IconButton>
</Box> </span>
</AccordionDetails> </Tooltip>
</Accordion> <Tooltip title="Favorite Job">
</TableCell> <span>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}> <IconButton
<Box sx={{ maxHeight: 100, overflow: "auto" }}> color={row.favorite ? "warning" : "default"}
{new Date(row.time_created).toLocaleString()} onClick={() => {
</Box> onFavorite([row.id], "favorite", !row.favorite);
</TableCell> row.favorite = !row.favorite;
<TableCell sx={{ maxWidth: 50, overflow: "auto" }}> }}
<Box sx={{ maxHeight: 100, overflow: "auto" }}> >
<Box <StarIcon />
className="rounded-md p-2 text-center" </IconButton>
sx={{ bgcolor: colors[row.status] }} </span>
> </Tooltip>
{row.status} </TableCell>
<TableCell sx={{ maxWidth: 100, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.id}</Box>
</TableCell>
<TableCell sx={{ maxWidth: 200, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.url}</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
{JSON.stringify(row.elements)}
</Box> </Box>
</Box> </TableCell>
</TableCell> <TableCell sx={{ maxWidth: 150, overflow: "auto", padding: 0 }}>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}> <Accordion sx={{ margin: 0, padding: 0.5 }}>
<Box sx={{ display: "flex", gap: 1 }}> <AccordionSummary
<Button expandIcon={<ExpandMoreIcon />}
onClick={() => { aria-controls="panel1a-content"
onDownload([row.id]); id="panel1a-header"
}} sx={{
size="small" minHeight: 0,
sx={{ minWidth: 0, padding: "4px 8px" }} "&.Mui-expanded": { minHeight: 0 },
> }}
Download >
</Button> <Box
<Button sx={{
onClick={() => maxHeight: 150,
onNavigate(row.elements, row.url, row.job_options) overflow: "auto",
} width: "100%",
size="small" }}
sx={{ minWidth: 0, padding: "4px 8px" }} >
> <Typography sx={{ fontSize: "0.875rem" }}>
Rerun Show Result
</Button> </Typography>
</Box> </Box>
</TableCell> </AccordionSummary>
</TableRow> <AccordionDetails sx={{ padding: 1 }}>
))} <Box sx={{ maxHeight: 200, overflow: "auto" }}>
</TableBody> <Typography
</Table> sx={{
fontSize: "0.875rem",
whiteSpace: "pre-wrap",
}}
>
{JSON.stringify(row.result, null, 2)}
</Typography>
</Box>
</AccordionDetails>
</Accordion>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
{new Date(row.time_created).toLocaleString()}
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 50, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
<Box
className="rounded-md p-2 text-center"
sx={{ bgcolor: colors[row.status] }}
>
{row.status}
</Box>
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ display: "flex", gap: 1 }}>
<Button
onClick={() => {
onDownload([row.id]);
}}
size="small"
sx={{ minWidth: 0, padding: "4px 8px" }}
>
Download
</Button>
<Button
onClick={() =>
onNavigate(row.elements, row.url, row.job_options)
}
size="small"
sx={{ minWidth: 0, padding: "4px 8px" }}
>
Rerun
</Button>
</Box>
</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</TableContainer>
); );
}; };

View File

@@ -0,0 +1,107 @@
"use client";
import React, { useState, useEffect, useRef } from "react";
import { Button, Container, Box, Snackbar, Alert } from "@mui/material";
import { useRouter } from "next/router";
import { Element, Result } from "@/types";
import { ElementTable, JobSubmitter } from "@/components/submit/job-submitter";
import { useJobSubmitterProvider } from "@/components/submit/job-submitter/provider";
/**
 * Landing page: renders the job submitter, the element table once a URL has
 * been submitted, and a snackbar whose variant depends on `snackbarSeverity`.
 *
 * When the route carries `elements` / `url` query parameters, they are copied
 * into the job-submitter context.
 */
export const Home = () => {
  const {
    submittedURL,
    setSubmittedURL,
    rows,
    setRows,
    results,
    snackbarOpen,
    setSnackbarOpen,
    snackbarMessage,
    snackbarSeverity,
  } = useJobSubmitterProvider();
  const router = useRouter();
  const { elements, url } = router.query;

  const resultsRef = useRef<HTMLTableElement | null>(null);

  useEffect(() => {
    if (elements) {
      // `elements` comes straight from the URL, so it may not be valid JSON;
      // a malformed value must not crash the page.
      try {
        setRows(JSON.parse(elements as string));
      } catch (error) {
        console.error("Ignoring malformed `elements` query parameter", error);
      }
    }
    if (url) {
      setSubmittedURL(url as string);
    }
  }, [elements, url]);

  useEffect(() => {
    if (results && resultsRef.current) {
      resultsRef.current.scrollIntoView({ behavior: "smooth" });
    }
  }, [results]);

  const handleCloseSnackbar = () => {
    setSnackbarOpen(false);
  };

  const goToJobs = () => {
    router.push("/jobs");
  };

  // The snackbars are plain elements rather than components declared inside
  // this function: a component type recreated on every render would be
  // unmounted and remounted each time.
  const errorSnackbar = (
    <Snackbar
      open={snackbarOpen}
      autoHideDuration={6000}
      onClose={handleCloseSnackbar}
    >
      <Alert onClose={handleCloseSnackbar} severity="error">
        {snackbarMessage}
      </Alert>
    </Snackbar>
  );

  const notifySnackbar = (
    <Snackbar
      open={snackbarOpen}
      autoHideDuration={6000}
      onClose={handleCloseSnackbar}
    >
      <Alert
        onClose={handleCloseSnackbar}
        severity="info"
        action={
          <Button color="inherit" size="small" onClick={goToJobs}>
            Go To Job
          </Button>
        }
      >
        {snackbarMessage}
      </Alert>
    </Snackbar>
  );

  return (
    <Box
      bgcolor="background.default"
      display="flex"
      flexDirection="column"
      justifyContent="center"
      alignItems="center"
      height="100%"
      py={4}
    >
      <Container maxWidth="lg" className="overflow-y-auto max-h-full">
        <JobSubmitter />
        {submittedURL.length ? (
          <ElementTable
            rows={rows}
            setRows={setRows}
            submittedURL={submittedURL}
          />
        ) : null}
      </Container>
      {snackbarSeverity === "info" ? notifySnackbar : errorSnackbar}
    </Box>
  );
};

View File

@@ -0,0 +1 @@
export * from "./home";

View File

@@ -1,2 +1 @@
export * from "./ElementTable";
export * from "./job-submitter"; export * from "./job-submitter";

View File

@@ -15,9 +15,11 @@ import {
IconButton, IconButton,
Tooltip, Tooltip,
useTheme, useTheme,
Divider,
} from "@mui/material"; } from "@mui/material";
import AddIcon from "@mui/icons-material/Add"; import AddIcon from "@mui/icons-material/Add";
import { Element } from "../../types"; import { Element } from "@/types";
import { SiteMap } from "../site-map";
interface Props { interface Props {
rows: Element[]; rows: Element[];
@@ -169,6 +171,13 @@ export const ElementTable = ({ rows, setRows, submittedURL }: Props) => {
</div> </div>
</TableContainer> </TableContainer>
</Box> </Box>
<Divider
sx={{
borderColor: theme.palette.mode === "dark" ? "#ffffff" : "0000000",
marginBottom: 2,
}}
/>
<SiteMap />
</Box> </Box>
); );
}; };

View File

@@ -0,0 +1 @@
export { ElementTable } from "./element-table";

View File

@@ -1 +1,2 @@
export { JobSubmitter } from "./job-submitter"; export { JobSubmitter } from "./job-submitter";
export { ElementTable } from "./element-table";

View File

@@ -1,26 +1,20 @@
import React, { Dispatch } from "react"; import React from "react";
import { TextField, Button, CircularProgress } from "@mui/material"; import { TextField, Button, CircularProgress } from "@mui/material";
import { Element } from "@/types"; import { useJobSubmitterProvider } from "../provider";
export type JobSubmitterInputProps = { export type JobSubmitterInputProps = {
submittedURL: string;
setSubmittedURL: Dispatch<React.SetStateAction<string>>;
isValidURL: boolean;
urlError: string | null; urlError: string | null;
handleSubmit: () => void; handleSubmit: () => void;
loading: boolean; loading: boolean;
rows: Element[];
}; };
export const JobSubmitterInput = ({ export const JobSubmitterInput = ({
submittedURL,
setSubmittedURL,
isValidURL,
urlError,
handleSubmit, handleSubmit,
loading, loading,
rows, urlError,
}: JobSubmitterInputProps) => { }: JobSubmitterInputProps) => {
const { submittedURL, setSubmittedURL, isValidURL, rows } =
useJobSubmitterProvider();
return ( return (
<div className="flex flex-row space-x-4 items-center mb-2"> <div className="flex flex-row space-x-4 items-center mb-2">
<TextField <TextField

View File

@@ -1,6 +1,7 @@
import { RawJobOptions } from "@/types/job"; import { RawJobOptions } from "@/types/job";
import { Box, FormControlLabel, Checkbox, TextField } from "@mui/material"; import { Box, FormControlLabel, Checkbox, TextField } from "@mui/material";
import { Dispatch, SetStateAction } from "react"; import { Dispatch, SetStateAction } from "react";
import { useJobSubmitterProvider } from "../provider";
export type JobSubmitterOptionsProps = { export type JobSubmitterOptionsProps = {
jobOptions: RawJobOptions; jobOptions: RawJobOptions;
@@ -14,9 +15,9 @@ export type JobSubmitterOptionsProps = {
export const JobSubmitterOptions = ({ export const JobSubmitterOptions = ({
jobOptions, jobOptions,
setJobOptions, setJobOptions,
handleSelectProxies,
customJSONSelected, customJSONSelected,
setCustomJSONSelected, setCustomJSONSelected,
handleSelectProxies,
proxiesSelected, proxiesSelected,
}: JobSubmitterOptionsProps) => { }: JobSubmitterOptionsProps) => {
const handleMultiPageScrapeChange = () => { const handleMultiPageScrapeChange = () => {

View File

@@ -1,7 +1,6 @@
"use client"; "use client";
import React, { useEffect, useState, Dispatch } from "react"; import React, { useEffect, useState } from "react";
import { Element } from "@/types";
import { useAuth } from "@/contexts/AuthContext"; import { useAuth } from "@/contexts/AuthContext";
import { useRouter } from "next/router"; import { useRouter } from "next/router";
import { RawJobOptions } from "@/types/job"; import { RawJobOptions } from "@/types/job";
@@ -10,21 +9,7 @@ import { JobSubmitterHeader } from "./job-submitter-header";
import { JobSubmitterInput } from "./job-submitter-input"; import { JobSubmitterInput } from "./job-submitter-input";
import { JobSubmitterOptions } from "./job-submitter-options"; import { JobSubmitterOptions } from "./job-submitter-options";
import { ApiService } from "@/services"; import { ApiService } from "@/services";
import { useJobSubmitterProvider } from "./provider";
interface StateProps {
submittedURL: string;
setSubmittedURL: Dispatch<React.SetStateAction<string>>;
rows: Element[];
isValidURL: boolean;
setIsValidUrl: Dispatch<React.SetStateAction<boolean>>;
setSnackbarMessage: Dispatch<React.SetStateAction<string>>;
setSnackbarOpen: Dispatch<React.SetStateAction<boolean>>;
setSnackbarSeverity: Dispatch<React.SetStateAction<string>>;
}
interface Props {
stateProps: StateProps;
}
const initialJobOptions: RawJobOptions = { const initialJobOptions: RawJobOptions = {
multi_page_scrape: false, multi_page_scrape: false,
@@ -32,7 +17,7 @@ const initialJobOptions: RawJobOptions = {
proxies: null, proxies: null,
}; };
export const JobSubmitter = ({ stateProps }: Props) => { export const JobSubmitter = () => {
const { user } = useAuth(); const { user } = useAuth();
const router = useRouter(); const router = useRouter();
const { job_options } = router.query; const { job_options } = router.query;
@@ -40,11 +25,13 @@ export const JobSubmitter = ({ stateProps }: Props) => {
const { const {
submittedURL, submittedURL,
rows, rows,
siteMap,
setIsValidUrl, setIsValidUrl,
setSnackbarMessage, setSnackbarMessage,
setSnackbarOpen, setSnackbarOpen,
setSnackbarSeverity, setSnackbarSeverity,
} = stateProps; setSiteMap,
} = useJobSubmitterProvider();
const [urlError, setUrlError] = useState<string | null>(null); const [urlError, setUrlError] = useState<string | null>(null);
const [loading, setLoading] = useState<boolean>(false); const [loading, setLoading] = useState<boolean>(false);
@@ -87,7 +74,8 @@ export const JobSubmitter = ({ stateProps }: Props) => {
rows, rows,
user, user,
jobOptions, jobOptions,
customHeaders customHeaders,
siteMap
) )
.then(async (response) => { .then(async (response) => {
if (!response.ok) { if (!response.ok) {
@@ -120,31 +108,28 @@ export const JobSubmitter = ({ stateProps }: Props) => {
job_options as string, job_options as string,
setCustomJSONSelected, setCustomJSONSelected,
setProxiesSelected, setProxiesSelected,
setJobOptions setJobOptions,
setSiteMap
); );
} }
}, [job_options]); }, [job_options]);
return ( return (
<> <div>
<div> <JobSubmitterHeader />
<JobSubmitterHeader /> <JobSubmitterInput
<JobSubmitterInput urlError={urlError}
{...stateProps} handleSubmit={handleSubmit}
urlError={urlError} loading={loading}
handleSubmit={handleSubmit} />
loading={loading} <JobSubmitterOptions
/> jobOptions={jobOptions}
<JobSubmitterOptions setJobOptions={setJobOptions}
{...stateProps} customJSONSelected={customJSONSelected}
jobOptions={jobOptions} setCustomJSONSelected={setCustomJSONSelected}
setJobOptions={setJobOptions} handleSelectProxies={handleSelectProxies}
customJSONSelected={customJSONSelected} proxiesSelected={proxiesSelected}
setCustomJSONSelected={setCustomJSONSelected} />
handleSelectProxies={handleSelectProxies} </div>
proxiesSelected={proxiesSelected}
/>
</div>
</>
); );
}; };

View File

@@ -0,0 +1,84 @@
import React, {
createContext,
PropsWithChildren,
useContext,
useState,
Dispatch,
useMemo,
} from "react";
import { Element, Result, SiteMap } from "@/types";
// Shape of the value exposed by the job-submitter context: each piece of
// state is paired with its React state setter.
type JobSubmitterProviderType = {
  submittedURL: string;
  setSubmittedURL: Dispatch<React.SetStateAction<string>>;
  rows: Element[];
  setRows: Dispatch<React.SetStateAction<Element[]>>;
  results: Result;
  setResults: Dispatch<React.SetStateAction<Result>>;
  snackbarOpen: boolean;
  setSnackbarOpen: Dispatch<React.SetStateAction<boolean>>;
  snackbarMessage: string;
  setSnackbarMessage: Dispatch<React.SetStateAction<string>>;
  snackbarSeverity: string;
  setSnackbarSeverity: Dispatch<React.SetStateAction<string>>;
  isValidURL: boolean;
  setIsValidUrl: Dispatch<React.SetStateAction<boolean>>;
  // null means no site map has been created.
  siteMap: SiteMap | null;
  setSiteMap: Dispatch<React.SetStateAction<SiteMap | null>>;
};
// Context object backing the provider and hook below. The default value is
// an empty object cast to the context type, so a consumer rendered outside
// `Provider` receives `undefined` for every field.
const JobSubmitterProvider = createContext<JobSubmitterProviderType>(
  {} as JobSubmitterProviderType
);
// Owns all job-submitter state and exposes it to `children` through the
// JobSubmitterProvider context.
export const Provider = ({ children }: PropsWithChildren) => {
  const [submittedURL, setSubmittedURL] = useState<string>("");
  const [rows, setRows] = useState<Element[]>([]);
  const [results, setResults] = useState<Result>({});
  const [snackbarOpen, setSnackbarOpen] = useState<boolean>(false);
  const [snackbarMessage, setSnackbarMessage] = useState<string>("");
  const [snackbarSeverity, setSnackbarSeverity] = useState<string>("error");
  const [isValidURL, setIsValidUrl] = useState<boolean>(true);
  const [siteMap, setSiteMap] = useState<SiteMap | null>(null);

  // Memoized so the context value object is only rebuilt when one of the
  // listed state values changes.
  const value: JobSubmitterProviderType = useMemo(
    () => ({
      submittedURL,
      setSubmittedURL,
      rows,
      setRows,
      results,
      setResults,
      snackbarOpen,
      setSnackbarOpen,
      snackbarMessage,
      setSnackbarMessage,
      snackbarSeverity,
      setSnackbarSeverity,
      isValidURL,
      setIsValidUrl,
      siteMap,
      setSiteMap,
    }),
    // Only the state values are listed; the setters are not dependencies.
    [
      submittedURL,
      rows,
      results,
      snackbarOpen,
      snackbarMessage,
      snackbarSeverity,
      isValidURL,
      siteMap,
    ]
  );

  return (
    <JobSubmitterProvider.Provider value={value}>
      {children}
    </JobSubmitterProvider.Provider>
  );
};
/** Hook giving access to the current job-submitter context value. */
export const useJobSubmitterProvider = () => useContext(JobSubmitterProvider);

View File

@@ -0,0 +1 @@
export * from "./site-map";

View File

@@ -0,0 +1 @@
export * from "./site-map-input";

View File

@@ -0,0 +1,21 @@
/* Fixed-size action button with white bold text that grows slightly on hover. */
.button {
  height: 3rem;
  width: 2rem;
  color: #ffffff;
  font-weight: 600;
  border-radius: 0.375rem;
  transition: transform 0.2s ease-in-out;
  transform: scale(1);

  /* Nested selector: NOTE(review) relies on CSS nesting support or a
     preprocessing step; confirm against the build setup. */
  &:hover {
    transform: scale(1.05);
  }
}

/* Delete variant; the colour variables are not defined in this file. */
.remove {
  background-color: var(--delete-red) !important;
}

.remove:hover {
  background-color: var(--delete-red-hover) !important;
}

View File

@@ -0,0 +1,135 @@
import { useState } from "react";
import { useJobSubmitterProvider } from "../../provider";
import {
MenuItem,
Select,
TextField,
FormControl,
Button,
Checkbox,
FormControlLabel,
} from "@mui/material";
import { ActionOption } from "@/types/job";
import classes from "./site-map-input.module.css";
import { clsx } from "clsx";
// Props for SiteMapInput. All are optional; the value props only seed the
// component's initial local state.
export type SiteMapInputProps = {
  // Disables the fields and swaps the "Add" button for "Delete".
  disabled?: boolean;
  // Initial XPath selector text.
  xpath?: string;
  // Initial action type; the component falls back to "click".
  option?: ActionOption;
  // Initial state of the "Do Once" checkbox.
  clickOnce?: boolean;
  // Initial text for an "input" action.
  input?: string;
};
/**
 * One row of the site-map editor.
 *
 * When enabled, the row is an entry form: "Add" prepends a new action built
 * from the form fields to the site map in context and clears the XPath field.
 * When `disabled`, the row displays an existing action read-only and offers
 * "Delete", which removes that action from the site map.
 */
export const SiteMapInput = ({
  disabled,
  xpath,
  option,
  clickOnce,
  input,
}: SiteMapInputProps) => {
  const [optionState, setOptionState] = useState<ActionOption>(
    option || "click"
  );
  const [xpathState, setXpathState] = useState<string>(xpath || "");
  const [clickOnceState, setClickOnceState] = useState<boolean>(
    clickOnce || false
  );
  const [inputState, setInputState] = useState<string>(input || "");

  const { siteMap, setSiteMap } = useJobSubmitterProvider();

  const handleAdd = () => {
    if (!siteMap) return;

    setSiteMap((prevSiteMap) => ({
      ...prevSiteMap,
      actions: [
        {
          type: optionState,
          xpath: xpathState,
          name: "",
          do_once: clickOnceState,
          input: inputState,
        },
        ...(prevSiteMap?.actions || []),
      ],
    }));

    setXpathState("");
  };

  const handleRemove = () => {
    if (!siteMap) return;

    // Identify the action this row represents; props are preferred because
    // they are what the parent rendered the row from.
    const targetXpath = xpath ?? xpathState;
    const targetType = option ?? optionState;

    setSiteMap((prevSiteMap) => {
      const actions = prevSiteMap?.actions || [];
      const matchIndex = actions.findIndex(
        (action) => action.xpath === targetXpath && action.type === targetType
      );
      // Fall back to dropping the last action when no match is found.
      const removeAt = matchIndex === -1 ? actions.length - 1 : matchIndex;

      return {
        ...prevSiteMap,
        actions: actions.filter((_, position) => position !== removeAt),
      };
    });
  };

  return (
    <div className="flex flex-col gap-2 w-full">
      <div className="flex gap-2 items-center">
        <FormControl className="w-1/4">
          <Select
            disabled={disabled}
            displayEmpty
            value={optionState}
            onChange={(e) => setOptionState(e.target.value as ActionOption)}
          >
            <MenuItem value="click">Click</MenuItem>
            <MenuItem value="input">Input</MenuItem>
          </Select>
        </FormControl>
        {/* The text field is only relevant for "input" actions. */}
        {optionState === "input" && (
          <TextField
            label="Input Text"
            fullWidth
            value={inputState}
            onChange={(e) => setInputState(e.target.value)}
            disabled={disabled}
          />
        )}
        <TextField
          label="XPath Selector"
          fullWidth
          value={xpathState}
          onChange={(e) => setXpathState(e.target.value)}
          disabled={disabled}
        />
        {disabled ? (
          <Button
            onClick={handleRemove}
            className={clsx(classes.button, classes.remove)}
          >
            Delete
          </Button>
        ) : (
          <Button
            onClick={handleAdd}
            disabled={!xpathState}
            className={clsx(classes.button, classes.add)}
          >
            Add
          </Button>
        )}
      </div>
      {!disabled && (
        <FormControlLabel
          label="Do Once"
          control={
            <Checkbox
              checked={clickOnceState}
              disabled={disabled}
              onChange={() => setClickOnceState(!clickOnceState)}
            />
          }
        />
      )}
    </div>
  );
};

View File

@@ -0,0 +1,70 @@
import { useEffect, useState } from "react";
import { useJobSubmitterProvider } from "../provider";
import { Button, Divider, Typography, useTheme } from "@mui/material";
import { SiteMapInput } from "./site-map-input";
/**
 * Site-map editor shown in the job submitter.
 *
 * Renders a button that creates or clears the site map held by the job
 * submitter provider. While the editor is visible it shows one editable
 * SiteMapInput for adding actions, followed by a read-only row for every
 * action already stored.
 */
export const SiteMap = () => {
  const { siteMap, setSiteMap } = useJobSubmitterProvider();
  const [showSiteMap, setShowSiteMap] = useState<boolean>(false);
  const theme = useTheme();

  const handleCreateSiteMap = () => {
    setSiteMap({ actions: [] });
    setShowSiteMap(true);
  };

  const handleClearSiteMap = () => {
    setSiteMap(null);
    setShowSiteMap(false);
  };

  // Open the editor whenever a site map appears from outside this component
  // (the provider's setter is shared with other code).
  useEffect(() => {
    if (siteMap) {
      setShowSiteMap(true);
    }
  }, [siteMap]);

  // New actions are stored in front of older ones, so the list is reversed
  // for display. Array.prototype.reverse() works in place; reversing a copy
  // keeps the array held in state untouched and the order stable across
  // re-renders.
  const displayedActions = [...(siteMap?.actions ?? [])].reverse();
  const hasActions = displayedActions.length > 0;

  return (
    <div className="flex flex-col gap-4">
      {siteMap ? (
        <Button onClick={handleClearSiteMap}>Clear Site Map</Button>
      ) : (
        <Button onClick={handleCreateSiteMap}>Create Site Map</Button>
      )}
      {showSiteMap && (
        <div className="flex flex-col gap-4">
          <SiteMapInput />
          {hasActions && (
            <>
              <Divider
                sx={{
                  borderColor:
                    theme.palette.mode === "dark" ? "#ffffff" : "#000000",
                }}
              />
              <Typography className="w-full text-center" variant="h5">
                Site Map Actions
              </Typography>
            </>
          )}
          <ul className="flex flex-col gap-4">
            {displayedActions.map((action, index) => (
              <li
                // Two actions may share an XPath, so the key combines the
                // position with the action's fields. A row is remounted
                // whenever the action at its position changes, which keeps
                // the row's internal state in sync with what it displays.
                key={[
                  index,
                  action.type,
                  action.xpath,
                  action.input ?? "",
                  Boolean(action.do_once),
                ].join("|")}
                className="flex w-full items-center"
              >
                <Typography variant="h6" className="w-[10%] mr-2">
                  Action {index + 1}:
                </Typography>
                <SiteMapInput
                  disabled={Boolean(siteMap)}
                  xpath={action.xpath}
                  option={action.type}
                  clickOnce={action.do_once}
                  input={action.input}
                />
              </li>
            ))}
          </ul>
        </div>
      )}
    </div>
  );
};

View File

@@ -1,15 +1,17 @@
import { Dispatch, SetStateAction } from "react"; import { Dispatch, SetStateAction } from "react";
import { RawJobOptions } from "@/types"; import { RawJobOptions, SiteMap } from "@/types";
export const parseJobOptions = ( export const parseJobOptions = (
job_options: string, job_options: string,
setCustomJSONSelected: Dispatch<SetStateAction<boolean>>, setCustomJSONSelected: Dispatch<SetStateAction<boolean>>,
setProxiesSelected: Dispatch<SetStateAction<boolean>>, setProxiesSelected: Dispatch<SetStateAction<boolean>>,
setJobOptions: Dispatch<SetStateAction<RawJobOptions>> setJobOptions: Dispatch<SetStateAction<RawJobOptions>>,
setSiteMap: Dispatch<SetStateAction<any>>
) => { ) => {
if (job_options) { if (job_options) {
const jsonOptions = JSON.parse(job_options as string); const jsonOptions = JSON.parse(job_options as string);
console.log(jsonOptions);
const newJobOptions: RawJobOptions = { const newJobOptions: RawJobOptions = {
multi_page_scrape: false, multi_page_scrape: false,
custom_headers: null, custom_headers: null,
@@ -31,6 +33,10 @@ export const parseJobOptions = (
newJobOptions.proxies = jsonOptions.proxies.join(","); newJobOptions.proxies = jsonOptions.proxies.join(",");
} }
if (jsonOptions.site_map) {
setSiteMap(jsonOptions.site_map);
}
setJobOptions(newJobOptions); setJobOptions(newJobOptions);
} }
}; };

View File

@@ -1,117 +1,10 @@
"use client"; import { Provider as JobSubmitterProvider } from "@/components/submit/job-submitter/provider";
import { Home } from "@/components/pages/home/home";
import React, { useState, useEffect, useRef } from "react";
import { Button, Container, Box, Snackbar, Alert } from "@mui/material";
import { useRouter } from "next/router";
import { Element, Result } from "@/types";
import { ElementTable } from "@/components/submit";
import { JobSubmitter } from "@/components/submit/job-submitter";
const Home = () => {
const router = useRouter();
const { elements, url } = router.query;
const [submittedURL, setSubmittedURL] = useState<string>("");
const [rows, setRows] = useState<Element[]>([]);
const [results, setResults] = useState<Result>({});
const [snackbarOpen, setSnackbarOpen] = useState<boolean>(false);
const [snackbarMessage, setSnackbarMessage] = useState<string>("");
const [snackbarSeverity, setSnackbarSeverity] = useState<string>("error");
const [isValidURL, setIsValidUrl] = useState<boolean>(true);
const resultsRef = useRef<HTMLTableElement | null>(null);
useEffect(() => {
if (elements) {
setRows(JSON.parse(elements as string));
}
if (url) {
setSubmittedURL(url as string);
}
}, [elements, url]);
useEffect(() => {
if (results && resultsRef.current) {
resultsRef.current.scrollIntoView({ behavior: "smooth" });
}
}, [results]);
const handleCloseSnackbar = () => {
setSnackbarOpen(false);
};
const ErrorSnackbar = () => {
return (
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={handleCloseSnackbar}
>
<Alert onClose={handleCloseSnackbar} severity="error">
{snackbarMessage}
</Alert>
</Snackbar>
);
};
const NotifySnackbar = () => {
const goTo = () => {
router.push("/jobs");
};
const action = (
<Button color="inherit" size="small" onClick={goTo}>
Go To Job
</Button>
);
return (
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={handleCloseSnackbar}
>
<Alert onClose={handleCloseSnackbar} severity="info" action={action}>
{snackbarMessage}
</Alert>
</Snackbar>
);
};
export default function Main() {
return ( return (
<Box <JobSubmitterProvider>
bgcolor="background.default" <Home />
display="flex" </JobSubmitterProvider>
flexDirection="column"
justifyContent="center"
alignItems="center"
height="100%"
py={4}
>
<Container maxWidth="lg">
<JobSubmitter
stateProps={{
submittedURL,
setSubmittedURL,
rows,
isValidURL,
setIsValidUrl,
setSnackbarMessage,
setSnackbarOpen,
setSnackbarSeverity,
}}
/>
{submittedURL.length ? (
<ElementTable
rows={rows}
setRows={setRows}
submittedURL={submittedURL}
/>
) : null}
</Container>
{snackbarSeverity === "info" ? <NotifySnackbar /> : <ErrorSnackbar />}
</Box>
); );
}; }
export default Home;

View File

@@ -1,9 +1,12 @@
import { SiteMap } from "@/types/job";
export const submitJob = async ( export const submitJob = async (
submittedURL: string, submittedURL: string,
rows: any[], rows: any[],
user: any, user: any,
jobOptions: any, jobOptions: any,
customHeaders: any customHeaders: any,
siteMap: SiteMap | null
) => { ) => {
return await fetch(`/api/submit-scrape-job`, { return await fetch(`/api/submit-scrape-job`, {
method: "POST", method: "POST",
@@ -18,6 +21,7 @@ export const submitJob = async (
...jobOptions, ...jobOptions,
custom_headers: customHeaders || {}, custom_headers: customHeaders || {},
proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [], proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
site_map: siteMap,
}, },
}, },
}), }),

View File

@@ -2,6 +2,11 @@
@tailwind components; @tailwind components;
@tailwind utilities; @tailwind utilities;
:root {
--delete-red: #ef4444;
--delete-red-hover: #ff6969;
}
#__next { #__next {
height: 100%; height: 100%;
} }

View File

@@ -34,6 +34,12 @@ const commonThemeOptions = {
h4: { h4: {
fontWeight: 500, fontWeight: 500,
}, },
h5: {
fontWeight: 500,
},
h6: {
fontWeight: 500,
},
body1: { body1: {
fontFamily: '"Schibsted Grotesk", sans-serif', fontFamily: '"Schibsted Grotesk", sans-serif',
}, },
@@ -175,6 +181,9 @@ const darkTheme = createTheme({
h5: { h5: {
color: "#ffffff", color: "#ffffff",
}, },
h6: {
color: "#ffffff",
},
body1: { body1: {
...commonThemeOptions.typography.body1, ...commonThemeOptions.typography.body1,
color: "#ffffff", color: "#ffffff",

View File

@@ -16,6 +16,7 @@ export type JobOptions = {
multi_page_scrape: boolean; multi_page_scrape: boolean;
custom_headers: null | string; custom_headers: null | string;
proxies: string[]; proxies: string[];
site_map?: SiteMap;
}; };
export type RawJobOptions = { export type RawJobOptions = {
@@ -23,3 +24,17 @@ export type RawJobOptions = {
custom_headers: string | null; custom_headers: string | null;
proxies: string | null; proxies: string | null;
}; };
/** Kinds of site-map action the UI offers: clicking an element or typing into it. */
export type ActionOption = "click" | "input";
/** One step of a site map, as built by the site-map form. */
export type Action = {
// Whether the step clicks the element or types into it.
type: ActionOption;
// XPath selector of the element the step targets.
xpath: string;
// Label for the step; the site-map form currently submits it as "".
name: string;
// Value of the form's "Do Once" checkbox.
do_once?: boolean;
// Text for an "input" step; the form only shows this field for that type.
input?: string;
};
/** Collection of actions sent with a job as the `site_map` job option. */
export type SiteMap = {
// Actions in the order the client stores them (the form adds new ones at the front).
actions: Action[];
};