mirror of
https://github.com/jaypyles/Scraperr.git
synced 2026-05-04 00:10:44 +00:00
875a3684c9
* chore: wip swap to sqlalchemy * feat: swap to sqlalchemy * feat: swap to sqlalchemy * feat: swap to sqlalchemy * feat: swap to sqlalchemy
118 lines
3.5 KiB
Python
118 lines
3.5 KiB
Python
# STL
|
|
import logging
|
|
from typing import Dict
|
|
from datetime import datetime
|
|
|
|
# PDM
|
|
import pytest
|
|
from httpx import AsyncClient
|
|
from sqlalchemy import select
|
|
from fastapi.testclient import TestClient
|
|
from playwright.async_api import Route, Cookie, async_playwright
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
# LOCAL
|
|
from api.backend.app import app
|
|
from api.backend.job.models import Proxy, Element, JobOptions
|
|
from api.backend.schemas.job import Job
|
|
from api.backend.database.models import Job as JobModel
|
|
from api.backend.job.scraping.add_custom import add_custom_items
|
|
|
|
# Verbose logging so test failures include full debug output.
logging.basicConfig(level=logging.DEBUG)

# Module-level logger for this test module.
LOG = logging.getLogger(__name__)

# Synchronous FastAPI test client bound to the application under test.
# NOTE(review): the async tests below receive `client`/`db_session` fixtures
# instead — confirm this module-level client is actually used.
client = TestClient(app)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_add_custom_items():
    """Verify ``add_custom_items`` applies cookies and headers to a page.

    Launches headless Chromium, registers a route handler that captures the
    headers of outgoing requests, calls ``add_custom_items`` with a test
    cookie and custom headers, then navigates to http://example.com and
    asserts that:
      * the cookie is present with default ``path`` "/" and ``sameSite``
        "Lax", and
      * the custom User-Agent header was sent with the request.
    """
    test_cookies = [{"name": "big", "value": "cookie"}]
    test_headers = {"User-Agent": "test-agent", "Accept": "application/json"}

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            # Set up request interception so we can inspect the headers
            # actually sent over the wire.
            captured_headers: Dict[str, str] = {}

            async def handle_route(route: Route) -> None:
                nonlocal captured_headers
                captured_headers = route.request.headers
                await route.continue_()

            await page.route("**/*", handle_route)

            await add_custom_items(
                url="http://example.com",
                page=page,
                cookies=test_cookies,
                headers=test_headers,
            )

            # Navigate to example.com so the route handler fires.
            await page.goto("http://example.com")

            # Verify cookies were added with the expected defaults.
            cookies: list[Cookie] = await page.context.cookies()
            test_cookie = next((c for c in cookies if c.get("name") == "big"), None)

            assert test_cookie is not None
            assert test_cookie.get("value") == "cookie"
            assert test_cookie.get("path") == "/"  # Default path should be set
            assert test_cookie.get("sameSite") == "Lax"  # Default sameSite should be set

            # Verify headers were added (Playwright reports header names
            # lower-cased).
            assert captured_headers.get("user-agent") == "test-agent"
        finally:
            # Close even when an assertion fails — previously a failing test
            # leaked the headless Chromium process.
            await browser.close()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_proxies(client: AsyncClient, db_session: AsyncSession):
    """Submit a scrape job with proxy options and verify it is persisted.

    Posts a job containing one proxy configuration to the
    ``/submit-scrape-job`` endpoint, then queries the database directly to
    confirm the stored row carries the expected url, status, elements and
    proxy settings.
    """
    job = Job(
        url="https://example.com",
        elements=[Element(xpath="//div", name="test")],
        job_options=JobOptions(
            proxies=[
                Proxy(
                    server="127.0.0.1:8080",
                    username="user",
                    password="pass",
                )
            ],
        ),
        # NOTE(review): naive local timestamp — confirm the API does not
        # expect UTC here.
        time_created=datetime.now().isoformat(),
    )

    response = await client.post("/submit-scrape-job", json=job.model_dump())
    assert response.status_code == 200

    stmt = select(JobModel)
    result = await db_session.execute(stmt)
    jobs = result.scalars().all()

    assert len(jobs) > 0
    job_from_db = jobs[0]

    # Convert the ORM row to a plain dict, dropping SQLAlchemy bookkeeping.
    job_dict = job_from_db.__dict__
    job_dict.pop("_sa_instance_state", None)

    # Debug output through the module logger (was a stray print()).
    LOG.debug("job row from db: %s", job_dict)

    assert job_dict["job_options"]["proxies"] == [
        {
            "server": "127.0.0.1:8080",
            "username": "user",
            "password": "pass",
        }
    ]

    # Verify the job was stored correctly in the database
    assert job_dict["url"] == "https://example.com"
    assert job_dict["status"] == "Queued"
    assert len(job_dict["elements"]) == 1
    assert job_dict["elements"][0]["xpath"] == "//div"
    assert job_dict["elements"][0]["name"] == "test"