mirror of https://github.com/jaypyles/Scraperr.git
synced 2025-12-13 11:15:59 +00:00

feat: add proxies (#39)

.github/workflows/docker-image.yml (vendored) | 2
@@ -1,4 +1,6 @@
 name: ci
+requires:
+  - unit-tests
 on:
   push:
     branches: ["master"]

.github/workflows/unit-tests.yml (vendored, new file) | 25
@@ -0,0 +1,25 @@
+name: Unit Tests
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  unit-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install pdm
+        run: pip install pdm
+
+      - name: Install project dependencies
+        run: pdm install
+
+      - name: Run tests
+        run: PYTHONPATH=. pdm run pytest api/backend/tests

api/backend/models.py
@@ -23,8 +23,9 @@ class CapturedElement(pydantic.BaseModel):
 
 
 class JobOptions(pydantic.BaseModel):
-    multi_page_scrape: bool
-    custom_headers: Optional[dict[str, Any]]
+    multi_page_scrape: bool = False
+    custom_headers: Optional[dict[str, Any]] = {}
+    proxies: Optional[list[str]] = []
 
 
 class RetrieveScrapeJobs(pydantic.BaseModel):
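
Note on the model change above: every JobOptions field now has a default, so existing payloads that omit proxies (or omit everything) still validate. A minimal standalone sketch of the patched model, with illustrative values:

from typing import Any, Optional

import pydantic


class JobOptions(pydantic.BaseModel):
    multi_page_scrape: bool = False
    custom_headers: Optional[dict[str, Any]] = {}
    proxies: Optional[list[str]] = []


# pydantic copies mutable defaults per instance, so {} and [] are safe here.
print(JobOptions())  # all fields fall back to their defaults
print(JobOptions(proxies=["127.0.0.1:8080"]).proxies)  # ['127.0.0.1:8080']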

api/backend/routers/job_router.py
@@ -5,7 +5,6 @@ from io import StringIO
 import csv
 import logging
 import random
-from typing import Optional
 
 # PDM
 from fastapi import Depends, APIRouter
@@ -27,7 +26,7 @@ from api.backend.models import (
     Job,
 )
 from api.backend.schemas import User
-from api.backend.auth.auth_utils import get_current_user, EMPTY_USER
+from api.backend.auth.auth_utils import get_current_user
 from api.backend.utils import clean_text
 
 LOG = logging.getLogger(__name__)

api/backend/scraping.py
@@ -1,6 +1,7 @@
 import logging
 from typing import Any, Optional
 import time
+import random
 
 from bs4 import BeautifulSoup
 from lxml import etree
@@ -12,7 +13,6 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.chrome.options import Options as ChromeOptions
 from urllib.parse import urlparse, urljoin
 
 from api.backend.models import Element, CapturedElement
 
 LOG = logging.getLogger(__name__)
@@ -60,7 +60,7 @@ def interceptor(headers: dict[str, Any]):
     return _interceptor
 
 
-def create_driver():
+def create_driver(proxies: Optional[list[str]] = []):
     ua = UserAgent()
     chrome_options = ChromeOptions()
     chrome_options.add_argument("--headless")
@@ -68,7 +68,23 @@ def create_driver():
     chrome_options.add_argument("--disable-dev-shm-usage")
     chrome_options.add_argument(f"user-agent={ua.random}")
 
-    return webdriver.Chrome(options=chrome_options)
+    sw_options = {}
+    if proxies:
+        selected_proxy = proxies[random.randint(0, len(proxies) - 1)]
+        LOG.info(f"Using proxy: {selected_proxy}")
+
+        sw_options = {
+            "proxy": {
+                "https": f"https://{selected_proxy}",
+                "http": f"http://{selected_proxy}",
+            }
+        }
+
+    driver = webdriver.Chrome(
+        options=chrome_options,
+        seleniumwire_options=sw_options,
+    )
+    return driver
 
 
 async def make_site_request(
@@ -78,13 +94,14 @@ async def make_site_request(
     visited_urls: set[str] = set(),
     pages: set[tuple[str, str]] = set(),
     original_url: str = "",
+    proxies: Optional[list[str]] = [],
 ) -> None:
     """Make basic `GET` request to site using Selenium."""
     # Check if URL has already been visited
     if url in visited_urls:
         return
 
-    driver = create_driver()
+    driver = create_driver(proxies)
     driver.implicitly_wait(10)
 
     if headers:
@@ -93,6 +110,7 @@ async def make_site_request(
     try:
         LOG.info(f"Visiting URL: {url}")
         driver.get(url)
+
         final_url = driver.current_url
         visited_urls.add(url)
         visited_urls.add(final_url)
@@ -173,6 +191,7 @@ async def scrape(
     xpaths: list[Element],
     headers: Optional[dict[str, Any]],
     multi_page_scrape: bool = False,
+    proxies: Optional[list[str]] = [],
 ):
     visited_urls: set[str] = set()
     pages: set[tuple[str, str]] = set()
@@ -184,6 +203,7 @@ async def scrape(
         visited_urls=visited_urls,
         pages=pages,
         original_url=url,
+        proxies=proxies,
     )
 
     elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()
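
The heart of the feature is the selenium-wire options dict assembled in create_driver above: one proxy is picked at random per driver and applied to both the http and https schemes. A standalone sketch of just that selection step (the helper name is mine, not part of the commit; random.choice has the same effect as the randint indexing above):

import random


def build_seleniumwire_options(proxies: list[str]) -> dict:
    # Hypothetical helper extracting the logic inlined in create_driver().
    if not proxies:
        return {}
    # Equivalent to proxies[random.randint(0, len(proxies) - 1)] in the commit.
    selected_proxy = random.choice(proxies)
    return {
        "proxy": {
            "https": f"https://{selected_proxy}",
            "http": f"http://{selected_proxy}",
        }
    }


print(build_seleniumwire_options(["127.0.0.1:8080", "10.0.0.2:3128"]))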

api/backend/tests/factories/job_factory.py
@@ -5,12 +5,14 @@ from faker import Faker
 fake = Faker()
 
 
-def create_job():
+def create_job(
+    job_options: JobOptions = JobOptions(multi_page_scrape=False, custom_headers={})
+):
     return Job(
         id=uuid.uuid4().hex,
         url="https://example.com",
         elements=[Element(name="test", xpath="xpath")],
-        job_options=JobOptions(multi_page_scrape=False, custom_headers={}),
+        job_options=job_options,
     )

@@ -9,12 +9,18 @@ client = TestClient(app)
 
 mocked_job = create_completed_job().model_dump()
 mock_results = [mocked_job]
+mocked_random_int = 123456
 
 
 @pytest.mark.asyncio
-@patch("api.backend.app.query")
-async def test_download(mock_query: AsyncMock):
+@patch("api.backend.routers.job_router.query")
+@patch("api.backend.routers.job_router.random.randint")
+async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
+    # Ensure the mock returns immediately
     mock_query.return_value = mock_results
+    mock_randint.return_value = mocked_random_int
 
+    # Create a DownloadJob instance
     download_job = DownloadJob(ids=[mocked_job["id"]])
 
     # Make a POST request to the /download endpoint
@@ -26,5 +32,9 @@ async def test_download(mock_query: AsyncMock):
 
     # Check the content of the CSV
     csv_content = response.content.decode("utf-8")
-    expected_csv = f"id,url,element_name,xpath,text,user,time_created\r\n{mocked_job['id']},https://example.com,element_name,//div,example,{mocked_job['user']},{mocked_job['time_created']}\r\n"
+    expected_csv = (
+        f'"id","url","element_name","xpath","text","user","time_created"\r\n'
+        f'"{mocked_job["id"]}-{mocked_random_int}","https://example.com","element_name","//div","example",'
+        f'"{mocked_job["user"]}","{mocked_job["time_created"]}"\r\n'
+    )
     assert csv_content == expected_csv
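
The rewritten expected_csv quotes every field and appends the mocked random int to the row id. The all-quoted format is what Python's csv writer emits with quoting=csv.QUOTE_ALL; the export code itself is not in this diff, so treat the writer configuration below as an assumption:

import csv
from io import StringIO

# Assumption: the /download endpoint now writes with QUOTE_ALL; only the
# quoted expected_csv in the test above hints at this.
buf = StringIO()
writer = csv.writer(buf, quoting=csv.QUOTE_ALL)
writer.writerow(["id", "url", "element_name", "xpath", "text", "user", "time_created"])

print(repr(buf.getvalue()))
# '"id","url","element_name","xpath","text","user","time_created"\r\n'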

api/backend/tests/scraping/__init__.py (new file) | 0

api/backend/tests/scraping/test_scraping.py (new file) | 33
@@ -0,0 +1,33 @@
+import pytest
+from unittest.mock import AsyncMock, patch, MagicMock
+from api.backend.tests.factories.job_factory import create_job
+from api.backend.models import JobOptions
+from api.backend.scraping import create_driver
+
+
+mocked_job = create_job(
+    job_options=JobOptions(
+        multi_page_scrape=False, custom_headers={}, proxies=["127.0.0.1:8080"]
+    )
+).model_dump()
+
+
+@pytest.mark.asyncio
+@patch("seleniumwire.webdriver.Chrome.get")
+async def test_proxy(mock_get: AsyncMock):
+    # Mock the response of the requests.get call
+    mock_response = MagicMock()
+    mock_get.return_value = mock_response
+
+    driver = create_driver(proxies=["127.0.0.1:8080"])
+    assert driver is not None
+
+    # Simulate a request
+    driver.get("http://example.com")
+    response = driver.last_request
+
+    # Check if the proxy header is set correctly
+    if response:
+        assert response.headers["Proxy"] == "127.0.0.1:8080"
+
+    driver.quit()

@@ -23,6 +23,7 @@ async def process_job():
         [Element(**j) for j in job["elements"]],
         job["job_options"]["custom_headers"],
         job["job_options"]["multi_page_scrape"],
+        job["job_options"]["proxies"],
     )
     LOG.info(
         f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"

@@ -19,4 +19,4 @@ services:
     ports:
       - "8000:8000"
     volumes:
-      - "$PWD/api:/project/app/api"
+      - "$PWD/api:/project/api"

job-submitter-options.tsx
@@ -1,13 +1,14 @@
+import { RawJobOptions } from "@/types/job";
 import { Box, FormControlLabel, Checkbox, TextField } from "@mui/material";
 import { Dispatch, SetStateAction } from "react";
 
-import { JobOptions } from "@/types/job";
-
 export type JobSubmitterOptionsProps = {
-  jobOptions: JobOptions;
-  setJobOptions: Dispatch<SetStateAction<JobOptions>>;
+  jobOptions: RawJobOptions;
+  setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
   customJSONSelected: boolean;
   setCustomJSONSelected: Dispatch<SetStateAction<boolean>>;
+  handleSelectProxies: () => void;
+  proxiesSelected: boolean;
 };
 
 export const JobSubmitterOptions = ({
@@ -15,24 +16,69 @@ export const JobSubmitterOptions = ({
   setJobOptions,
   customJSONSelected,
   setCustomJSONSelected,
+  handleSelectProxies,
+  proxiesSelected,
 }: JobSubmitterOptionsProps) => {
+  const handleMultiPageScrapeChange = () => {
+    setJobOptions((prevJobOptions) => ({
+      ...prevJobOptions,
+      multi_page_scrape: !prevJobOptions.multi_page_scrape,
+    }));
+  };
+
+  const handleProxiesChange = (e: React.ChangeEvent<HTMLInputElement>) => {
+    setJobOptions((prevJobOptions) => ({
+      ...prevJobOptions,
+      proxies: e.target.value,
+    }));
+  };
+
+  const handleCustomHeadersChange = (
+    e: React.ChangeEvent<HTMLInputElement>
+  ) => {
+    setJobOptions((prevJobOptions) => ({
+      ...prevJobOptions,
+      custom_headers: e.target.value,
+    }));
+  };
+
   return (
     <Box bgcolor="background.paper" className="flex flex-col mb-2 rounded-md">
       <div id="options" className="p-2 flex flex-row space-x-2">
         <FormControlLabel
           label="Multi-Page Scrape"
+          className="mr-0"
           control={
             <Checkbox
               checked={jobOptions.multi_page_scrape}
-              onChange={() =>
-                setJobOptions((prevJobOptions) => ({
-                  ...prevJobOptions,
-                  multi_page_scrape: !prevJobOptions.multi_page_scrape,
-                }))
-              }
+              onChange={handleMultiPageScrapeChange}
             />
           }
         ></FormControlLabel>
+        <FormControlLabel
+          label="Proxies"
+          control={
+            <Checkbox
+              checked={proxiesSelected}
+              onChange={handleSelectProxies}
+            />
+          }
+        ></FormControlLabel>
+        {proxiesSelected ? (
+          <div id="proxies">
+            <TextField
+              InputLabelProps={{ shrink: false }}
+              fullWidth
+              multiline={false}
+              variant="outlined"
+              value={jobOptions.proxies || ""}
+              onChange={handleProxiesChange}
+              inputProps={{
+                style: { whiteSpace: "nowrap", overflowX: "auto" },
+              }}
+            />
+          </div>
+        ) : null}
         <FormControlLabel
           label="Custom Headers (JSON)"
           control={
@@ -58,14 +104,8 @@ export const JobSubmitterOptions = ({
               minRows={4}
               variant="outlined"
               value={jobOptions.custom_headers || ""}
-              onChange={(e) =>
-                setJobOptions((prevJobOptions) => ({
-                  ...prevJobOptions,
-                  custom_headers: e.target.value,
-                }))
-              }
+              onChange={handleCustomHeadersChange}
               style={{ maxHeight: "20vh", overflow: "auto" }}
-              className="mt-2"
             />
           </div>
         ) : null}

@@ -4,11 +4,12 @@ import React, { useEffect, useState, Dispatch } from "react";
 import { Element } from "@/types";
 import { useAuth } from "@/contexts/AuthContext";
 import { useRouter } from "next/router";
-import { Constants } from "@/lib";
+import { RawJobOptions } from "@/types/job";
+import { parseJobOptions, validateURL } from "@/lib";
 import { JobSubmitterHeader } from "./job-submitter-header";
 import { JobSubmitterInput } from "./job-submitter-input";
 import { JobSubmitterOptions } from "./job-submitter-options";
+import { ApiService } from "@/services";
 
 interface StateProps {
   submittedURL: string;
@@ -25,22 +26,20 @@ interface Props {
   stateProps: StateProps;
 }
 
-interface JobOptions {
-  multi_page_scrape: boolean;
-  custom_headers: null | string;
-}
+const initialJobOptions: RawJobOptions = {
+  multi_page_scrape: false,
+  custom_headers: null,
+  proxies: null,
+};
 
 export const JobSubmitter = ({ stateProps }: Props) => {
   const { user } = useAuth();
   const router = useRouter();
 
   const { job_options } = router.query;
 
   const {
     submittedURL,
-    setSubmittedURL,
     rows,
-    isValidURL,
     setIsValidUrl,
     setSnackbarMessage,
     setSnackbarOpen,
@@ -49,22 +48,16 @@ export const JobSubmitter = ({ stateProps }: Props) => {
 
   const [urlError, setUrlError] = useState<string | null>(null);
   const [loading, setLoading] = useState<boolean>(false);
-  const [jobOptions, setJobOptions] = useState<JobOptions>({
-    multi_page_scrape: false,
-    custom_headers: null,
-  });
+  const [jobOptions, setJobOptions] =
+    useState<RawJobOptions>(initialJobOptions);
   const [customJSONSelected, setCustomJSONSelected] = useState<boolean>(false);
+  const [proxiesSelected, setProxiesSelected] = useState<boolean>(false);
 
-  function validateURL(url: string): boolean {
-    try {
-      new URL(url);
-      return true;
-    } catch (_) {
-      return false;
-    }
-  }
+  const handleSelectProxies = () => {
+    setProxiesSelected(!proxiesSelected);
+  };
 
-  const handleSubmit = () => {
+  const handleSubmit = async () => {
     if (!validateURL(submittedURL)) {
       setIsValidUrl(false);
       setUrlError("Please enter a valid URL.");
@@ -76,6 +69,7 @@ export const JobSubmitter = ({ stateProps }: Props) => {
     setLoading(true);
+
     let customHeaders;
 
     try {
       customHeaders = jobOptions.custom_headers
         ? JSON.parse(jobOptions.custom_headers)
@@ -88,21 +82,14 @@ export const JobSubmitter = ({ stateProps }: Props) => {
       return;
     }
 
-    fetch(`${Constants.DOMAIN}/api/submit-scrape-job`, {
-      method: "POST",
-      headers: { "content-type": "application/json" },
-      body: JSON.stringify({
-        url: submittedURL,
-        elements: rows,
-        user: user?.email,
-        time_created: new Date().toISOString(),
-        job_options: {
-          ...jobOptions,
-          custom_headers: customHeaders,
-        },
-      }),
-    })
-      .then((response) => {
+    await ApiService.submitJob(
+      submittedURL,
+      rows,
+      user,
+      jobOptions,
+      customHeaders
+    )
+      .then(async (response) => {
         if (!response.ok) {
           return response.json().then((error) => {
             throw new Error(error.error);
@@ -126,26 +113,15 @@ export const JobSubmitter = ({ stateProps }: Props) => {
       .finally(() => setLoading(false));
   };
 
+  // Parse the job options from the query string
   useEffect(() => {
     if (job_options) {
-      const jsonOptions = JSON.parse(job_options as string);
-      const newJobOptions: JobOptions = {
-        multi_page_scrape: false,
-        custom_headers: null,
-      };
-
-      if (
-        jsonOptions.custom_headers &&
-        Object.keys(jsonOptions.custom_headers).length
-      ) {
-        setCustomJSONSelected(true);
-        newJobOptions.custom_headers = JSON.stringify(
-          jsonOptions.custom_headers
-        );
-      }
-
-      newJobOptions.multi_page_scrape = jsonOptions.multi_page_scrape;
-      setJobOptions(newJobOptions);
+      parseJobOptions(
+        job_options as string,
+        setCustomJSONSelected,
+        setProxiesSelected,
+        setJobOptions
+      );
     }
   }, [job_options]);
@@ -165,6 +141,8 @@ export const JobSubmitter = ({ stateProps }: Props) => {
         setJobOptions={setJobOptions}
         customJSONSelected={customJSONSelected}
         setCustomJSONSelected={setCustomJSONSelected}
+        handleSelectProxies={handleSelectProxies}
+        proxiesSelected={proxiesSelected}
       />
     </div>
   </>

src/lib/helpers/index.ts (new file) | 2
@@ -0,0 +1,2 @@
+export * from "./parse-job-options";
+export * from "./validate-url";

src/lib/helpers/parse-job-options.ts (new file) | 36
@@ -0,0 +1,36 @@
+import { Dispatch, SetStateAction } from "react";
+
+import { RawJobOptions } from "@/types";
+
+export const parseJobOptions = (
+  job_options: string,
+  setCustomJSONSelected: Dispatch<SetStateAction<boolean>>,
+  setProxiesSelected: Dispatch<SetStateAction<boolean>>,
+  setJobOptions: Dispatch<SetStateAction<RawJobOptions>>
+) => {
+  if (job_options) {
+    const jsonOptions = JSON.parse(job_options as string);
+    const newJobOptions: RawJobOptions = {
+      multi_page_scrape: false,
+      custom_headers: null,
+      proxies: null,
+    };
+
+    if (
+      jsonOptions.custom_headers &&
+      Object.keys(jsonOptions.custom_headers).length
+    ) {
+      setCustomJSONSelected(true);
+      newJobOptions.custom_headers = JSON.stringify(jsonOptions.custom_headers);
+    }
+
+    newJobOptions.multi_page_scrape = jsonOptions.multi_page_scrape;
+
+    if (jsonOptions.proxies) {
+      setProxiesSelected(true);
+      newJobOptions.proxies = jsonOptions.proxies.join(",");
+    }
+
+    setJobOptions(newJobOptions);
+  }
+};

src/lib/helpers/validate-url.ts (new file) | 8
@@ -0,0 +1,8 @@
+export function validateURL(url: string): boolean {
+  try {
+    new URL(url);
+    return true;
+  } catch (_) {
+    return false;
+  }
+}

src/lib/index.ts
@@ -1,2 +1,3 @@
 export * from "./constants";
 export * from "./utils";
+export * from "./helpers";

src/services/api-service/api-service.ts (new file) | 5
@@ -0,0 +1,5 @@
+import * as functions from "./functions";
+
+export const ApiService = {
+  ...functions,
+};

src/services/api-service/functions/index.ts (new file) | 1
@@ -0,0 +1 @@
+export * from "./submit-job";

src/services/api-service/functions/submit-job.ts (new file) | 25
@@ -0,0 +1,25 @@
+import { Constants } from "@/lib";
+
+export const submitJob = async (
+  submittedURL: string,
+  rows: any[],
+  user: any,
+  jobOptions: any,
+  customHeaders: any
+) => {
+  return await fetch(`${Constants.DOMAIN}/api/submit-scrape-job`, {
+    method: "POST",
+    headers: { "content-type": "application/json" },
+    body: JSON.stringify({
+      url: submittedURL,
+      elements: rows,
+      user: user?.email,
+      time_created: new Date().toISOString(),
+      job_options: {
+        ...jobOptions,
+        custom_headers: customHeaders,
+        proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
+      },
+    }),
+  });
+};

src/services/api-service/index.ts (new file) | 1
@@ -0,0 +1 @@
+export * from "./api-service";

src/services/index.ts (new file) | 1
@@ -0,0 +1 @@
+export * from "./api-service";

src/types/job.ts
@@ -15,4 +15,11 @@ export interface Job {
 export type JobOptions = {
   multi_page_scrape: boolean;
   custom_headers: null | string;
+  proxies: string[];
 };
+
+export type RawJobOptions = {
+  multi_page_scrape: boolean;
+  custom_headers: string | null;
+  proxies: string | null;
+};