feat: add proxies (#39)

Jayden Pyles
2024-11-09 21:27:53 -06:00
committed by GitHub
parent 266b91ed0e
commit 1cdffd9006
23 changed files with 284 additions and 86 deletions
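
This change threads an optional proxy list end to end: a new `proxies` field on the backend `JobOptions` model, proxy-aware driver creation via selenium-wire, a comma-separated proxies input in the frontend job submitter, and a unit-test workflow wired into CI. A submitted job's payload now looks roughly like this (a sketch; field values are illustrative, the shape follows `submit-job.ts` below, and the port comes from the docker-compose mapping in this diff):

```python
import requests

payload = {
    "url": "https://example.com",
    "elements": [{"name": "title", "xpath": "//h1"}],
    "user": "user@example.com",
    "time_created": "2024-11-09T21:27:53-06:00",
    "job_options": {
        "multi_page_scrape": False,
        "custom_headers": {},
        "proxies": ["127.0.0.1:8080", "127.0.0.1:8081"],  # new in this PR
    },
}
requests.post("http://localhost:8000/api/submit-scrape-job", json=payload)
```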

View File

@@ -1,4 +1,6 @@
 name: ci
+requires:
+  - unit-tests
 on:
   push:
     branches: ["master"]

.github/workflows/unit-tests.yml (new file)
View File

@@ -0,0 +1,25 @@
+name: Unit Tests
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  unit-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install pdm
+        run: pip install pdm
+
+      - name: Install project dependencies
+        run: pdm install
+
+      - name: Run tests
+        run: PYTHONPATH=. pdm run pytest api/backend/tests

View File

@@ -23,8 +23,9 @@ class CapturedElement(pydantic.BaseModel):
 class JobOptions(pydantic.BaseModel):
-    multi_page_scrape: bool
-    custom_headers: Optional[dict[str, Any]]
+    multi_page_scrape: bool = False
+    custom_headers: Optional[dict[str, Any]] = {}
+    proxies: Optional[list[str]] = []
 
 class RetrieveScrapeJobs(pydantic.BaseModel):

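Both pre-existing fields also gain defaults here, so payloads from before this change still validate. A quick sanity check (my sketch, not part of the diff):

```python
from api.backend.models import JobOptions

# Every field now has a default; `proxies` falls back to an empty list.
opts = JobOptions()
assert opts.multi_page_scrape is False
assert opts.proxies == []
```
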
View File

@@ -5,7 +5,6 @@ from io import StringIO
 import csv
 import logging
 import random
-from typing import Optional
 
 # PDM
 from fastapi import Depends, APIRouter
@@ -27,7 +26,7 @@ from api.backend.models import (
     Job,
 )
 from api.backend.schemas import User
-from api.backend.auth.auth_utils import get_current_user, EMPTY_USER
+from api.backend.auth.auth_utils import get_current_user
 from api.backend.utils import clean_text
 
 LOG = logging.getLogger(__name__)

View File

@@ -1,6 +1,7 @@
 import logging
 from typing import Any, Optional
 import time
+import random
 
 from bs4 import BeautifulSoup
 from lxml import etree
@@ -12,7 +13,6 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.chrome.options import Options as ChromeOptions
 from urllib.parse import urlparse, urljoin
-
 from api.backend.models import Element, CapturedElement
 
 LOG = logging.getLogger(__name__)
@@ -60,7 +60,7 @@ def interceptor(headers: dict[str, Any]):
     return _interceptor
 
-def create_driver():
+def create_driver(proxies: Optional[list[str]] = []):
     ua = UserAgent()
     chrome_options = ChromeOptions()
     chrome_options.add_argument("--headless")
@@ -68,7 +68,23 @@ def create_driver():
     chrome_options.add_argument("--disable-dev-shm-usage")
     chrome_options.add_argument(f"user-agent={ua.random}")
 
-    return webdriver.Chrome(options=chrome_options)
+    sw_options = {}
+    if proxies:
+        selected_proxy = proxies[random.randint(0, len(proxies) - 1)]
+        LOG.info(f"Using proxy: {selected_proxy}")
+
+        sw_options = {
+            "proxy": {
+                "https": f"https://{selected_proxy}",
+                "http": f"http://{selected_proxy}",
+            }
+        }
+
+    driver = webdriver.Chrome(
+        options=chrome_options,
+        seleniumwire_options=sw_options,
+    )
+
+    return driver
 
 async def make_site_request(
@@ -78,13 +94,14 @@ async def make_site_request(
     visited_urls: set[str] = set(),
     pages: set[tuple[str, str]] = set(),
     original_url: str = "",
+    proxies: Optional[list[str]] = [],
 ) -> None:
     """Make basic `GET` request to site using Selenium."""
     # Check if URL has already been visited
     if url in visited_urls:
         return
 
-    driver = create_driver()
+    driver = create_driver(proxies)
     driver.implicitly_wait(10)
 
     if headers:
@@ -93,6 +110,7 @@ async def make_site_request(
     try:
         LOG.info(f"Visiting URL: {url}")
         driver.get(url)
+
         final_url = driver.current_url
         visited_urls.add(url)
         visited_urls.add(final_url)
@@ -173,6 +191,7 @@ async def scrape(
     xpaths: list[Element],
     headers: Optional[dict[str, Any]],
     multi_page_scrape: bool = False,
+    proxies: Optional[list[str]] = [],
 ):
     visited_urls: set[str] = set()
     pages: set[tuple[str, str]] = set()
@@ -184,6 +203,7 @@ async def scrape(
         visited_urls=visited_urls,
         pages=pages,
         original_url=url,
+        proxies=proxies,
     )
 
     elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()

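The driver here comes from selenium-wire rather than plain Selenium (the new test below patches `seleniumwire.webdriver.Chrome.get`), which is what makes the `seleniumwire_options` keyword available. A minimal usage sketch, assuming a proxy is listening on 127.0.0.1:8080:

```python
from api.backend.scraping import create_driver

# One proxy is picked at random from the list and applied to both
# http and https traffic for the lifetime of the driver.
driver = create_driver(proxies=["127.0.0.1:8080", "127.0.0.1:8081"])
try:
    driver.get("https://example.com")
finally:
    driver.quit()
```

As a side note, `random.choice(proxies)` would express the selection more idiomatically than indexing with `random.randint(0, len(proxies) - 1)`.
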
View File

@@ -5,12 +5,14 @@ from faker import Faker
 fake = Faker()
 
-def create_job():
+def create_job(
+    job_options: JobOptions = JobOptions(multi_page_scrape=False, custom_headers={})
+):
     return Job(
         id=uuid.uuid4().hex,
         url="https://example.com",
         elements=[Element(name="test", xpath="xpath")],
-        job_options=JobOptions(multi_page_scrape=False, custom_headers={}),
+        job_options=job_options,
     )

View File

@@ -9,12 +9,18 @@ client = TestClient(app)
 mocked_job = create_completed_job().model_dump()
 mock_results = [mocked_job]
+mocked_random_int = 123456
 
 @pytest.mark.asyncio
-@patch("api.backend.app.query")
-async def test_download(mock_query: AsyncMock):
+@patch("api.backend.routers.job_router.query")
+@patch("api.backend.routers.job_router.random.randint")
+async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
+    # Ensure the mock returns immediately
     mock_query.return_value = mock_results
+    mock_randint.return_value = mocked_random_int
 
+    # Create a DownloadJob instance
     download_job = DownloadJob(ids=[mocked_job["id"]])
 
     # Make a POST request to the /download endpoint
@@ -26,5 +32,9 @@ async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
     # Check the content of the CSV
     csv_content = response.content.decode("utf-8")
-    expected_csv = f"id,url,element_name,xpath,text,user,time_created\r\n{mocked_job['id']},https://example.com,element_name,//div,example,{mocked_job['user']},{mocked_job['time_created']}\r\n"
+    expected_csv = (
+        f'"id","url","element_name","xpath","text","user","time_created"\r\n'
+        f'"{mocked_job["id"]}-{mocked_random_int}","https://example.com","element_name","//div","example",'
+        f'"{mocked_job["user"]}","{mocked_job["time_created"]}"\r\n'
+    )
 
     assert csv_content == expected_csv

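The expected CSV changed in two ways: every field is now quoted, and the row id gains a `-{random int}` suffix. That is consistent with a writer configured with `csv.QUOTE_ALL` and an endpoint that appends `random.randint(...)` to each id (inferred from the patched `job_router.random.randint`; the endpoint itself is not shown in this diff). For reference:

```python
import csv
import io

# csv.QUOTE_ALL wraps every field in double quotes, matching the
# '"id","url",...' header the test now expects.
buf = io.StringIO()
writer = csv.writer(buf, quoting=csv.QUOTE_ALL)
writer.writerow(["id", "url", "element_name", "xpath", "text", "user", "time_created"])
print(repr(buf.getvalue()))
```
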
View File

@@ -0,0 +1,33 @@
+import pytest
+from unittest.mock import AsyncMock, patch, MagicMock
+
+from api.backend.tests.factories.job_factory import create_job
+from api.backend.models import JobOptions
+from api.backend.scraping import create_driver
+
+mocked_job = create_job(
+    job_options=JobOptions(
+        multi_page_scrape=False, custom_headers={}, proxies=["127.0.0.1:8080"]
+    )
+).model_dump()
+
+@pytest.mark.asyncio
+@patch("seleniumwire.webdriver.Chrome.get")
+async def test_proxy(mock_get: AsyncMock):
+    # Mock the response of the requests.get call
+    mock_response = MagicMock()
+    mock_get.return_value = mock_response
+
+    driver = create_driver(proxies=["127.0.0.1:8080"])
+    assert driver is not None
+
+    # Simulate a request
+    driver.get("http://example.com")
+    response = driver.last_request
+
+    # Check if the proxy header is set correctly
+    if response:
+        assert response.headers["Proxy"] == "127.0.0.1:8080"
+
+    driver.quit()

View File

@@ -23,6 +23,7 @@ async def process_job():
         [Element(**j) for j in job["elements"]],
         job["job_options"]["custom_headers"],
         job["job_options"]["multi_page_scrape"],
+        job["job_options"]["proxies"],
     )
     LOG.info(
         f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"

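One caveat: this indexes `job["job_options"]["proxies"]` directly, so a job persisted before this change, whose stored options lack the new key, would raise a `KeyError`. A defensive lookup (a suggestion, not part of the diff) avoids that:

```python
# Fall back to no proxies for jobs created before the field existed.
proxies = job["job_options"].get("proxies", [])
```
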
View File

@@ -19,4 +19,4 @@ services:
     ports:
       - "8000:8000"
     volumes:
-      - "$PWD/api:/project/app/api"
+      - "$PWD/api:/project/api"

View File

@@ -1,13 +1,14 @@
+import { RawJobOptions } from "@/types/job";
 import { Box, FormControlLabel, Checkbox, TextField } from "@mui/material";
 import { Dispatch, SetStateAction } from "react";
-import { JobOptions } from "@/types/job";
 
 export type JobSubmitterOptionsProps = {
-  jobOptions: JobOptions;
-  setJobOptions: Dispatch<SetStateAction<JobOptions>>;
+  jobOptions: RawJobOptions;
+  setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
   customJSONSelected: boolean;
   setCustomJSONSelected: Dispatch<SetStateAction<boolean>>;
+  handleSelectProxies: () => void;
+  proxiesSelected: boolean;
 };
 
 export const JobSubmitterOptions = ({
@@ -15,24 +16,69 @@ export const JobSubmitterOptions = ({
   setJobOptions,
   customJSONSelected,
   setCustomJSONSelected,
+  handleSelectProxies,
+  proxiesSelected,
 }: JobSubmitterOptionsProps) => {
+  const handleMultiPageScrapeChange = () => {
+    setJobOptions((prevJobOptions) => ({
+      ...prevJobOptions,
+      multi_page_scrape: !prevJobOptions.multi_page_scrape,
+    }));
+  };
+
+  const handleProxiesChange = (e: React.ChangeEvent<HTMLInputElement>) => {
+    setJobOptions((prevJobOptions) => ({
+      ...prevJobOptions,
+      proxies: e.target.value,
+    }));
+  };
+
+  const handleCustomHeadersChange = (
+    e: React.ChangeEvent<HTMLInputElement>
+  ) => {
+    setJobOptions((prevJobOptions) => ({
+      ...prevJobOptions,
+      custom_headers: e.target.value,
+    }));
+  };
+
   return (
     <Box bgcolor="background.paper" className="flex flex-col mb-2 rounded-md">
       <div id="options" className="p-2 flex flex-row space-x-2">
         <FormControlLabel
           label="Multi-Page Scrape"
+          className="mr-0"
           control={
             <Checkbox
               checked={jobOptions.multi_page_scrape}
-              onChange={() =>
-                setJobOptions((prevJobOptions) => ({
-                  ...prevJobOptions,
-                  multi_page_scrape: !prevJobOptions.multi_page_scrape,
-                }))
-              }
+              onChange={handleMultiPageScrapeChange}
             />
           }
         ></FormControlLabel>
+        <FormControlLabel
+          label="Proxies"
+          control={
+            <Checkbox
+              checked={proxiesSelected}
+              onChange={handleSelectProxies}
+            />
+          }
+        ></FormControlLabel>
+        {proxiesSelected ? (
+          <div id="proxies">
+            <TextField
+              InputLabelProps={{ shrink: false }}
+              fullWidth
+              multiline={false}
+              variant="outlined"
+              value={jobOptions.proxies || ""}
+              onChange={handleProxiesChange}
+              inputProps={{
+                style: { whiteSpace: "nowrap", overflowX: "auto" },
+              }}
+            />
+          </div>
+        ) : null}
         <FormControlLabel
           label="Custom Headers (JSON)"
           control={
@@ -58,14 +104,8 @@ export const JobSubmitterOptions = ({
             minRows={4}
             variant="outlined"
             value={jobOptions.custom_headers || ""}
-            onChange={(e) =>
-              setJobOptions((prevJobOptions) => ({
-                ...prevJobOptions,
-                custom_headers: e.target.value,
-              }))
-            }
+            onChange={handleCustomHeadersChange}
             style={{ maxHeight: "20vh", overflow: "auto" }}
-            className="mt-2"
           />
         </div>
       ) : null}

View File

@@ -4,11 +4,12 @@ import React, { useEffect, useState, Dispatch } from "react";
 import { Element } from "@/types";
 import { useAuth } from "@/contexts/AuthContext";
 import { useRouter } from "next/router";
-import { Constants } from "@/lib";
+import { RawJobOptions } from "@/types/job";
+import { parseJobOptions, validateURL } from "@/lib";
 import { JobSubmitterHeader } from "./job-submitter-header";
 import { JobSubmitterInput } from "./job-submitter-input";
 import { JobSubmitterOptions } from "./job-submitter-options";
+import { ApiService } from "@/services";
 
 interface StateProps {
   submittedURL: string;
@@ -25,22 +26,20 @@ interface Props {
   stateProps: StateProps;
 }
 
-interface JobOptions {
-  multi_page_scrape: boolean;
-  custom_headers: null | string;
-}
+const initialJobOptions: RawJobOptions = {
+  multi_page_scrape: false,
+  custom_headers: null,
+  proxies: null,
+};
 
 export const JobSubmitter = ({ stateProps }: Props) => {
   const { user } = useAuth();
   const router = useRouter();
   const { job_options } = router.query;
 
   const {
     submittedURL,
-    setSubmittedURL,
     rows,
-    isValidURL,
     setIsValidUrl,
     setSnackbarMessage,
     setSnackbarOpen,
@@ -49,22 +48,16 @@ export const JobSubmitter = ({ stateProps }: Props) => {
   const [urlError, setUrlError] = useState<string | null>(null);
   const [loading, setLoading] = useState<boolean>(false);
-  const [jobOptions, setJobOptions] = useState<JobOptions>({
-    multi_page_scrape: false,
-    custom_headers: null,
-  });
+  const [jobOptions, setJobOptions] =
+    useState<RawJobOptions>(initialJobOptions);
   const [customJSONSelected, setCustomJSONSelected] = useState<boolean>(false);
+  const [proxiesSelected, setProxiesSelected] = useState<boolean>(false);
 
-  function validateURL(url: string): boolean {
-    try {
-      new URL(url);
-      return true;
-    } catch (_) {
-      return false;
-    }
-  }
+  const handleSelectProxies = () => {
+    setProxiesSelected(!proxiesSelected);
+  };
 
-  const handleSubmit = () => {
+  const handleSubmit = async () => {
     if (!validateURL(submittedURL)) {
       setIsValidUrl(false);
       setUrlError("Please enter a valid URL.");
@@ -76,6 +69,7 @@ export const JobSubmitter = ({ stateProps }: Props) => {
     setLoading(true);
 
     let customHeaders;
+
     try {
       customHeaders = jobOptions.custom_headers
        ? JSON.parse(jobOptions.custom_headers)
@@ -88,21 +82,14 @@ export const JobSubmitter = ({ stateProps }: Props) => {
       return;
     }
 
-    fetch(`${Constants.DOMAIN}/api/submit-scrape-job`, {
-      method: "POST",
-      headers: { "content-type": "application/json" },
-      body: JSON.stringify({
-        url: submittedURL,
-        elements: rows,
-        user: user?.email,
-        time_created: new Date().toISOString(),
-        job_options: {
-          ...jobOptions,
-          custom_headers: customHeaders,
-        },
-      }),
-    })
-      .then((response) => {
+    await ApiService.submitJob(
+      submittedURL,
+      rows,
+      user,
+      jobOptions,
+      customHeaders
+    )
+      .then(async (response) => {
         if (!response.ok) {
           return response.json().then((error) => {
             throw new Error(error.error);
@@ -126,27 +113,16 @@ export const JobSubmitter = ({ stateProps }: Props) => {
       .finally(() => setLoading(false));
   };
 
+  // Parse the job options from the query string
   useEffect(() => {
     if (job_options) {
-      const jsonOptions = JSON.parse(job_options as string);
-      const newJobOptions: JobOptions = {
-        multi_page_scrape: false,
-        custom_headers: null,
-      };
-      if (
-        jsonOptions.custom_headers &&
-        Object.keys(jsonOptions.custom_headers).length
-      ) {
-        setCustomJSONSelected(true);
-        newJobOptions.custom_headers = JSON.stringify(
-          jsonOptions.custom_headers
-        );
-      }
-      newJobOptions.multi_page_scrape = jsonOptions.multi_page_scrape;
-      setJobOptions(newJobOptions);
+      parseJobOptions(
+        job_options as string,
+        setCustomJSONSelected,
+        setProxiesSelected,
+        setJobOptions
+      );
     }
   }, [job_options]);
 
   return (
@@ -165,6 +141,8 @@ export const JobSubmitter = ({ stateProps }: Props) => {
           setJobOptions={setJobOptions}
          customJSONSelected={customJSONSelected}
           setCustomJSONSelected={setCustomJSONSelected}
+          handleSelectProxies={handleSelectProxies}
+          proxiesSelected={proxiesSelected}
         />
       </div>
     </>

src/lib/helpers/index.ts (new file)
View File

@@ -0,0 +1,2 @@
+export * from "./parse-job-options";
+export * from "./validate-url";

View File

@@ -0,0 +1,36 @@
+import { Dispatch, SetStateAction } from "react";
+import { RawJobOptions } from "@/types";
+
+export const parseJobOptions = (
+  job_options: string,
+  setCustomJSONSelected: Dispatch<SetStateAction<boolean>>,
+  setProxiesSelected: Dispatch<SetStateAction<boolean>>,
+  setJobOptions: Dispatch<SetStateAction<RawJobOptions>>
+) => {
+  if (job_options) {
+    const jsonOptions = JSON.parse(job_options as string);
+    const newJobOptions: RawJobOptions = {
+      multi_page_scrape: false,
+      custom_headers: null,
+      proxies: null,
+    };
+
+    if (
+      jsonOptions.custom_headers &&
+      Object.keys(jsonOptions.custom_headers).length
+    ) {
+      setCustomJSONSelected(true);
+      newJobOptions.custom_headers = JSON.stringify(jsonOptions.custom_headers);
+    }
+
+    newJobOptions.multi_page_scrape = jsonOptions.multi_page_scrape;
+
+    if (jsonOptions.proxies) {
+      setProxiesSelected(true);
+      newJobOptions.proxies = jsonOptions.proxies.join(",");
+    }
+
+    setJobOptions(newJobOptions);
+  }
+};

View File

@@ -0,0 +1,8 @@
+export function validateURL(url: string): boolean {
+  try {
+    new URL(url);
+    return true;
+  } catch (_) {
+    return false;
+  }
+}

View File

@@ -1,2 +1,3 @@
 export * from "./constants";
 export * from "./utils";
+export * from "./helpers";

View File

@@ -0,0 +1,5 @@
+import * as functions from "./functions";
+
+export const ApiService = {
+  ...functions,
+};

View File

@@ -0,0 +1 @@
+export * from "./submit-job";

View File

@@ -0,0 +1,25 @@
+import { Constants } from "@/lib";
+
+export const submitJob = async (
+  submittedURL: string,
+  rows: any[],
+  user: any,
+  jobOptions: any,
+  customHeaders: any
+) => {
+  return await fetch(`${Constants.DOMAIN}/api/submit-scrape-job`, {
+    method: "POST",
+    headers: { "content-type": "application/json" },
+    body: JSON.stringify({
+      url: submittedURL,
+      elements: rows,
+      user: user?.email,
+      time_created: new Date().toISOString(),
+      job_options: {
+        ...jobOptions,
+        custom_headers: customHeaders,
+        proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
+      },
+    }),
+  });
+};

View File

@@ -0,0 +1 @@
+export * from "./api-service";

src/services/index.ts (new file)
View File

@@ -0,0 +1 @@
+export * from "./api-service";

View File

@@ -15,4 +15,11 @@ export interface Job {
 export type JobOptions = {
   multi_page_scrape: boolean;
   custom_headers: null | string;
+  proxies: string[];
 };
+
+export type RawJobOptions = {
+  multi_page_scrape: boolean;
+  custom_headers: string | null;
+  proxies: string | null;
+};