feat: add proxies (#39)

Author: Jayden Pyles
Date: 2024-11-09 21:27:53 -06:00 (committed by GitHub)
Commit: 1cdffd9006 (parent: 266b91ed0e)

23 changed files with 284 additions and 86 deletions


@@ -1,4 +1,6 @@
 name: ci
+requires:
+  - unit-tests
 on:
   push:
     branches: ["master"]

.github/workflows/unit-tests.yml (new file)

@@ -0,0 +1,25 @@
+name: Unit Tests
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  unit-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install pdm
+        run: pip install pdm
+
+      - name: Install project dependencies
+        run: pdm install
+
+      - name: Run tests
+        run: PYTHONPATH=. pdm run pytest api/backend/tests


@@ -23,8 +23,9 @@ class CapturedElement(pydantic.BaseModel):

 class JobOptions(pydantic.BaseModel):
-    multi_page_scrape: bool
-    custom_headers: Optional[dict[str, Any]]
+    multi_page_scrape: bool = False
+    custom_headers: Optional[dict[str, Any]] = {}
+    proxies: Optional[list[str]] = []


 class RetrieveScrapeJobs(pydantic.BaseModel):
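A side effect worth noting: because all three fields now carry defaults, job payloads stored before this change (which lack a `proxies` key) still validate. A minimal sketch of that behavior, not part of the diff, assuming the project is on pydantic v2 (the tests use `model_dump`):

from typing import Any, Optional

import pydantic


class JobOptions(pydantic.BaseModel):
    multi_page_scrape: bool = False
    custom_headers: Optional[dict[str, Any]] = {}
    proxies: Optional[list[str]] = []


# A legacy payload without "proxies" still parses and picks up the default.
legacy = JobOptions.model_validate({"multi_page_scrape": True, "custom_headers": None})
print(legacy.proxies)  # []

# New payloads carry proxy endpoints as host:port strings (addresses here are illustrative).
fresh = JobOptions(proxies=["127.0.0.1:8080", "10.0.0.2:3128"])
print(fresh.proxies)  # ['127.0.0.1:8080', '10.0.0.2:3128']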


@@ -5,7 +5,6 @@ from io import StringIO
 import csv
 import logging
 import random
-from typing import Optional

 # PDM
 from fastapi import Depends, APIRouter
@@ -27,7 +26,7 @@ from api.backend.models import (
     Job,
 )
 from api.backend.schemas import User
-from api.backend.auth.auth_utils import get_current_user, EMPTY_USER
+from api.backend.auth.auth_utils import get_current_user
 from api.backend.utils import clean_text

 LOG = logging.getLogger(__name__)


@@ -1,6 +1,7 @@
 import logging
 from typing import Any, Optional
 import time
+import random

 from bs4 import BeautifulSoup
 from lxml import etree
@@ -12,7 +13,6 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.chrome.options import Options as ChromeOptions
 from urllib.parse import urlparse, urljoin

 from api.backend.models import Element, CapturedElement

 LOG = logging.getLogger(__name__)
@@ -60,7 +60,7 @@ def interceptor(headers: dict[str, Any]):
     return _interceptor


-def create_driver():
+def create_driver(proxies: Optional[list[str]] = []):
     ua = UserAgent()
     chrome_options = ChromeOptions()
     chrome_options.add_argument("--headless")
@@ -68,7 +68,23 @@ def create_driver():
     chrome_options.add_argument("--disable-dev-shm-usage")
     chrome_options.add_argument(f"user-agent={ua.random}")

-    return webdriver.Chrome(options=chrome_options)
+    sw_options = {}
+
+    if proxies:
+        selected_proxy = proxies[random.randint(0, len(proxies) - 1)]
+        LOG.info(f"Using proxy: {selected_proxy}")
+
+        sw_options = {
+            "proxy": {
+                "https": f"https://{selected_proxy}",
+                "http": f"http://{selected_proxy}",
+            }
+        }
+
+    driver = webdriver.Chrome(
+        options=chrome_options,
+        seleniumwire_options=sw_options,
+    )
+
+    return driver
@@ -78,13 +94,14 @@ async def make_site_request(
     visited_urls: set[str] = set(),
     pages: set[tuple[str, str]] = set(),
     original_url: str = "",
+    proxies: Optional[list[str]] = [],
 ) -> None:
     """Make basic `GET` request to site using Selenium."""
     # Check if URL has already been visited
     if url in visited_urls:
         return

-    driver = create_driver()
+    driver = create_driver(proxies)
     driver.implicitly_wait(10)

     if headers:
@@ -93,6 +110,7 @@ async def make_site_request(
     try:
         LOG.info(f"Visiting URL: {url}")
         driver.get(url)
+        final_url = driver.current_url

         visited_urls.add(url)
+        visited_urls.add(final_url)
@@ -173,6 +191,7 @@ async def scrape(
     xpaths: list[Element],
     headers: Optional[dict[str, Any]],
     multi_page_scrape: bool = False,
+    proxies: Optional[list[str]] = [],
 ):
     visited_urls: set[str] = set()
     pages: set[tuple[str, str]] = set()
@@ -184,6 +203,7 @@ async def scrape(
         visited_urls=visited_urls,
         pages=pages,
         original_url=url,
+        proxies=proxies,
     )

     elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()
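Taken together, the proxy list flows from the job payload through `scrape` and `make_site_request` into `create_driver`, which picks one entry at random for the lifetime of each driver (`proxies[random.randint(0, len(proxies) - 1)]` is the long form of `random.choice(proxies)`). Routing is handled by selenium-wire's `seleniumwire_options`, which also records outgoing traffic on the driver. A minimal usage sketch, assuming selenium-wire is installed and a proxy is listening on the example address:

from api.backend.scraping import create_driver

# One proxy is chosen at random from the list for this driver's lifetime.
driver = create_driver(proxies=["127.0.0.1:8080"])
driver.get("https://example.com")

# selenium-wire captures requests, which is what the new proxy test inspects.
if driver.last_request:
    print(driver.last_request.url, driver.last_request.headers)

driver.quit()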


@@ -5,12 +5,14 @@ from faker import Faker

 fake = Faker()


-def create_job():
+def create_job(
+    job_options: JobOptions = JobOptions(multi_page_scrape=False, custom_headers={})
+):
     return Job(
         id=uuid.uuid4().hex,
         url="https://example.com",
         elements=[Element(name="test", xpath="xpath")],
-        job_options=JobOptions(multi_page_scrape=False, custom_headers={}),
+        job_options=job_options,
     )


@@ -9,12 +9,18 @@ client = TestClient(app)

 mocked_job = create_completed_job().model_dump()
 mock_results = [mocked_job]
+mocked_random_int = 123456


 @pytest.mark.asyncio
-@patch("api.backend.app.query")
-async def test_download(mock_query: AsyncMock):
+@patch("api.backend.routers.job_router.query")
+@patch("api.backend.routers.job_router.random.randint")
+async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
     # Ensure the mock returns immediately
     mock_query.return_value = mock_results
+    mock_randint.return_value = mocked_random_int

     # Create a DownloadJob instance
     download_job = DownloadJob(ids=[mocked_job["id"]])

     # Make a POST request to the /download endpoint
@@ -26,5 +32,9 @@ async def test_download(mock_query: AsyncMock):
     # Check the content of the CSV
     csv_content = response.content.decode("utf-8")
-    expected_csv = f"id,url,element_name,xpath,text,user,time_created\r\n{mocked_job['id']},https://example.com,element_name,//div,example,{mocked_job['user']},{mocked_job['time_created']}\r\n"
+    expected_csv = (
+        f'"id","url","element_name","xpath","text","user","time_created"\r\n'
+        f'"{mocked_job["id"]}-{mocked_random_int}","https://example.com","element_name","//div","example",'
+        f'"{mocked_job["user"]}","{mocked_job["time_created"]}"\r\n'
+    )
     assert csv_content == expected_csv
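The rewritten expectation encodes two behavior changes in the download endpoint: every CSV field is now quoted, and the exported id gains a random-integer suffix (hence the `randint` mock). A sketch of how the stdlib csv module produces fully quoted rows; the `QUOTE_ALL` setting is an assumption, since the endpoint's writer configuration is not shown in this diff:

import csv
from io import StringIO

buffer = StringIO()
writer = csv.writer(buffer, quoting=csv.QUOTE_ALL)  # wraps every field in quotes

# Hypothetical values mirroring the test's expected row.
writer.writerow(["id", "url", "element_name", "xpath", "text", "user", "time_created"])
writer.writerow([f"abc123-{123456}", "https://example.com", "element_name",
                 "//div", "example", "user@example.com", "2024-11-09T21:27:53"])

print(buffer.getvalue())
# "id","url","element_name","xpath","text","user","time_created"
# "abc123-123456","https://example.com","element_name","//div","example",...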



@@ -0,0 +1,33 @@
+import pytest
+from unittest.mock import AsyncMock, patch, MagicMock
+
+from api.backend.tests.factories.job_factory import create_job
+from api.backend.models import JobOptions
+from api.backend.scraping import create_driver
+
+mocked_job = create_job(
+    job_options=JobOptions(
+        multi_page_scrape=False, custom_headers={}, proxies=["127.0.0.1:8080"]
+    )
+).model_dump()
+
+
+@pytest.mark.asyncio
+@patch("seleniumwire.webdriver.Chrome.get")
+async def test_proxy(mock_get: AsyncMock):
+    # Mock the response of the requests.get call
+    mock_response = MagicMock()
+    mock_get.return_value = mock_response
+
+    driver = create_driver(proxies=["127.0.0.1:8080"])
+    assert driver is not None
+
+    # Simulate a request
+    driver.get("http://example.com")
+
+    response = driver.last_request
+
+    # Check if the proxy header is set correctly
+    if response:
+        assert response.headers["Proxy"] == "127.0.0.1:8080"
+
+    driver.quit()


@@ -23,6 +23,7 @@ async def process_job():
         [Element(**j) for j in job["elements"]],
         job["job_options"]["custom_headers"],
         job["job_options"]["multi_page_scrape"],
+        job["job_options"]["proxies"],
     )
     LOG.info(
         f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"


@@ -19,4 +19,4 @@ services:
     ports:
       - "8000:8000"
     volumes:
-      - "$PWD/api:/project/app/api"
+      - "$PWD/api:/project/api"


@@ -1,13 +1,14 @@
+import { RawJobOptions } from "@/types/job";
 import { Box, FormControlLabel, Checkbox, TextField } from "@mui/material";
 import { Dispatch, SetStateAction } from "react";
-import { JobOptions } from "@/types/job";

 export type JobSubmitterOptionsProps = {
-  jobOptions: JobOptions;
-  setJobOptions: Dispatch<SetStateAction<JobOptions>>;
+  jobOptions: RawJobOptions;
+  setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
   customJSONSelected: boolean;
   setCustomJSONSelected: Dispatch<SetStateAction<boolean>>;
+  handleSelectProxies: () => void;
+  proxiesSelected: boolean;
 };
export const JobSubmitterOptions = ({
@@ -15,24 +16,69 @@ export const JobSubmitterOptions = ({
   setJobOptions,
   customJSONSelected,
   setCustomJSONSelected,
+  handleSelectProxies,
+  proxiesSelected,
 }: JobSubmitterOptionsProps) => {
+  const handleMultiPageScrapeChange = () => {
+    setJobOptions((prevJobOptions) => ({
+      ...prevJobOptions,
+      multi_page_scrape: !prevJobOptions.multi_page_scrape,
+    }));
+  };
+
+  const handleProxiesChange = (e: React.ChangeEvent<HTMLInputElement>) => {
+    setJobOptions((prevJobOptions) => ({
+      ...prevJobOptions,
+      proxies: e.target.value,
+    }));
+  };
+
+  const handleCustomHeadersChange = (
+    e: React.ChangeEvent<HTMLInputElement>
+  ) => {
+    setJobOptions((prevJobOptions) => ({
+      ...prevJobOptions,
+      custom_headers: e.target.value,
+    }));
+  };
+
   return (
     <Box bgcolor="background.paper" className="flex flex-col mb-2 rounded-md">
       <div id="options" className="p-2 flex flex-row space-x-2">
         <FormControlLabel
           label="Multi-Page Scrape"
           className="mr-0"
           control={
             <Checkbox
               checked={jobOptions.multi_page_scrape}
-              onChange={() =>
-                setJobOptions((prevJobOptions) => ({
-                  ...prevJobOptions,
-                  multi_page_scrape: !prevJobOptions.multi_page_scrape,
-                }))
-              }
+              onChange={handleMultiPageScrapeChange}
             />
           }
         ></FormControlLabel>
+        <FormControlLabel
+          label="Proxies"
+          control={
+            <Checkbox
+              checked={proxiesSelected}
+              onChange={handleSelectProxies}
+            />
+          }
+        ></FormControlLabel>
+        {proxiesSelected ? (
+          <div id="proxies">
+            <TextField
+              InputLabelProps={{ shrink: false }}
+              fullWidth
+              multiline={false}
+              variant="outlined"
+              value={jobOptions.proxies || ""}
+              onChange={handleProxiesChange}
+              inputProps={{
+                style: { whiteSpace: "nowrap", overflowX: "auto" },
+              }}
+            />
+          </div>
+        ) : null}
         <FormControlLabel
           label="Custom Headers (JSON)"
           control={
@@ -58,14 +104,8 @@ export const JobSubmitterOptions = ({
           minRows={4}
           variant="outlined"
           value={jobOptions.custom_headers || ""}
-          onChange={(e) =>
-            setJobOptions((prevJobOptions) => ({
-              ...prevJobOptions,
-              custom_headers: e.target.value,
-            }))
-          }
+          onChange={handleCustomHeadersChange}
           style={{ maxHeight: "20vh", overflow: "auto" }}
           className="mt-2"
         />
       </div>
     ) : null}


@@ -4,11 +4,12 @@ import React, { useEffect, useState, Dispatch } from "react";
 import { Element } from "@/types";
 import { useAuth } from "@/contexts/AuthContext";
 import { useRouter } from "next/router";
-import { Constants } from "@/lib";
+import { RawJobOptions } from "@/types/job";
+import { parseJobOptions, validateURL } from "@/lib";
 import { JobSubmitterHeader } from "./job-submitter-header";
 import { JobSubmitterInput } from "./job-submitter-input";
 import { JobSubmitterOptions } from "./job-submitter-options";
+import { ApiService } from "@/services";

 interface StateProps {
   submittedURL: string;
@@ -25,22 +26,20 @@ interface Props {
   stateProps: StateProps;
 }

-interface JobOptions {
-  multi_page_scrape: boolean;
-  custom_headers: null | string;
-}
+const initialJobOptions: RawJobOptions = {
+  multi_page_scrape: false,
+  custom_headers: null,
+  proxies: null,
+};

 export const JobSubmitter = ({ stateProps }: Props) => {
   const { user } = useAuth();
   const router = useRouter();
   const { job_options } = router.query;

   const {
     submittedURL,
     setSubmittedURL,
     rows,
     isValidURL,
     setIsValidUrl,
     setSnackbarMessage,
     setSnackbarOpen,
@@ -49,22 +48,16 @@ export const JobSubmitter = ({ stateProps }: Props) => {
   const [urlError, setUrlError] = useState<string | null>(null);
   const [loading, setLoading] = useState<boolean>(false);
-  const [jobOptions, setJobOptions] = useState<JobOptions>({
-    multi_page_scrape: false,
-    custom_headers: null,
-  });
+  const [jobOptions, setJobOptions] =
+    useState<RawJobOptions>(initialJobOptions);
   const [customJSONSelected, setCustomJSONSelected] = useState<boolean>(false);
+  const [proxiesSelected, setProxiesSelected] = useState<boolean>(false);

-  function validateURL(url: string): boolean {
-    try {
-      new URL(url);
-      return true;
-    } catch (_) {
-      return false;
-    }
-  }
-
-  const handleSubmit = () => {
+  const handleSelectProxies = () => {
+    setProxiesSelected(!proxiesSelected);
+  };
+
+  const handleSubmit = async () => {
     if (!validateURL(submittedURL)) {
       setIsValidUrl(false);
       setUrlError("Please enter a valid URL.");
@@ -76,6 +69,7 @@ export const JobSubmitter = ({ stateProps }: Props) => {
     setLoading(true);

     let customHeaders;
+
     try {
       customHeaders = jobOptions.custom_headers
         ? JSON.parse(jobOptions.custom_headers)
@@ -88,21 +82,14 @@ export const JobSubmitter = ({ stateProps }: Props) => {
       return;
     }

-    fetch(`${Constants.DOMAIN}/api/submit-scrape-job`, {
-      method: "POST",
-      headers: { "content-type": "application/json" },
-      body: JSON.stringify({
-        url: submittedURL,
-        elements: rows,
-        user: user?.email,
-        time_created: new Date().toISOString(),
-        job_options: {
-          ...jobOptions,
-          custom_headers: customHeaders,
-        },
-      }),
-    })
-      .then((response) => {
+    await ApiService.submitJob(
+      submittedURL,
+      rows,
+      user,
+      jobOptions,
+      customHeaders
+    )
+      .then(async (response) => {
         if (!response.ok) {
           return response.json().then((error) => {
             throw new Error(error.error);
@@ -126,26 +113,15 @@ export const JobSubmitter = ({ stateProps }: Props) => {
       .finally(() => setLoading(false));
   };

+  // Parse the job options from the query string
   useEffect(() => {
     if (job_options) {
-      const jsonOptions = JSON.parse(job_options as string);
-      const newJobOptions: JobOptions = {
-        multi_page_scrape: false,
-        custom_headers: null,
-      };
-
-      if (
-        jsonOptions.custom_headers &&
-        Object.keys(jsonOptions.custom_headers).length
-      ) {
-        setCustomJSONSelected(true);
-        newJobOptions.custom_headers = JSON.stringify(
-          jsonOptions.custom_headers
-        );
-      }
-
-      newJobOptions.multi_page_scrape = jsonOptions.multi_page_scrape;
-      setJobOptions(newJobOptions);
+      parseJobOptions(
+        job_options as string,
+        setCustomJSONSelected,
+        setProxiesSelected,
+        setJobOptions
+      );
     }
   }, [job_options]);
@@ -165,6 +141,8 @@ export const JobSubmitter = ({ stateProps }: Props) => {
         setJobOptions={setJobOptions}
         customJSONSelected={customJSONSelected}
         setCustomJSONSelected={setCustomJSONSelected}
+        handleSelectProxies={handleSelectProxies}
+        proxiesSelected={proxiesSelected}
       />
     </div>
   </>

src/lib/helpers/index.ts (new file)

@@ -0,0 +1,2 @@
+export * from "./parse-job-options";
+export * from "./validate-url";


@@ -0,0 +1,36 @@
+import { Dispatch, SetStateAction } from "react";
+
+import { RawJobOptions } from "@/types";
+
+export const parseJobOptions = (
+  job_options: string,
+  setCustomJSONSelected: Dispatch<SetStateAction<boolean>>,
+  setProxiesSelected: Dispatch<SetStateAction<boolean>>,
+  setJobOptions: Dispatch<SetStateAction<RawJobOptions>>
+) => {
+  if (job_options) {
+    const jsonOptions = JSON.parse(job_options as string);
+
+    const newJobOptions: RawJobOptions = {
+      multi_page_scrape: false,
+      custom_headers: null,
+      proxies: null,
+    };
+
+    if (
+      jsonOptions.custom_headers &&
+      Object.keys(jsonOptions.custom_headers).length
+    ) {
+      setCustomJSONSelected(true);
+      newJobOptions.custom_headers = JSON.stringify(jsonOptions.custom_headers);
+    }
+
+    newJobOptions.multi_page_scrape = jsonOptions.multi_page_scrape;
+
+    if (jsonOptions.proxies) {
+      setProxiesSelected(true);
+      newJobOptions.proxies = jsonOptions.proxies.join(",");
+    }
+
+    setJobOptions(newJobOptions);
+  }
+};


@@ -0,0 +1,8 @@
+export function validateURL(url: string): boolean {
+  try {
+    new URL(url);
+    return true;
+  } catch (_) {
+    return false;
+  }
+}


@@ -1,2 +1,3 @@
 export * from "./constants";
 export * from "./utils";
+export * from "./helpers";


@@ -0,0 +1,5 @@
+import * as functions from "./functions";
+
+export const ApiService = {
+  ...functions,
+};


@@ -0,0 +1 @@
+export * from "./submit-job";


@@ -0,0 +1,25 @@
+import { Constants } from "@/lib";
+
+export const submitJob = async (
+  submittedURL: string,
+  rows: any[],
+  user: any,
+  jobOptions: any,
+  customHeaders: any
+) => {
+  return await fetch(`${Constants.DOMAIN}/api/submit-scrape-job`, {
+    method: "POST",
+    headers: { "content-type": "application/json" },
+    body: JSON.stringify({
+      url: submittedURL,
+      elements: rows,
+      user: user?.email,
+      time_created: new Date().toISOString(),
+      job_options: {
+        ...jobOptions,
+        custom_headers: customHeaders,
+        proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
+      },
+    }),
+  });
+};


@@ -0,0 +1 @@
+export * from "./api-service";

src/services/index.ts (new file)

@@ -0,0 +1 @@
+export * from "./api-service";


@@ -15,4 +15,11 @@ export interface Job {
 export type JobOptions = {
   multi_page_scrape: boolean;
   custom_headers: null | string;
+  proxies: string[];
 };
+
+export type RawJobOptions = {
+  multi_page_scrape: boolean;
+  custom_headers: string | null;
+  proxies: string | null;
+};