Mirror of https://github.com/jaypyles/Scraperr.git (synced 2025-11-15 21:56:22 +00:00)

Compare commits (14 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 6b33723cac | |
| | 5c89e4d7d2 | |
| | ed0828a585 | |
| | 1b8c8c779a | |
| | 267cc73657 | |
| | 92ff16d9c3 | |
| | 8b2e5dc9c3 | |
| | 7f1bc295ac | |
| | 031572325f | |
| | 48d3bf9214 | |
| | e07abcd089 | |
| | 8a933b88a7 | |
| | 863dbcd044 | |
| | de40181a6f | |
.github/actions/push-to-helm/action.yaml (vendored, new file, 50 lines)
@@ -0,0 +1,50 @@
name: Publish Helm Chart
description: Publish a Helm chart to a target repository

inputs:
  app-repo-token:
    required: true
    description: "The token for the target repository"

runs:
  using: 'composite'
  steps:
    - name: Checkout app repo
      uses: actions/checkout@v4

    - name: Set up Helm
      uses: azure/setup-helm@v3

    - name: Package Helm chart
      run: |
        mkdir -p packaged
        helm package helm -d packaged
      shell: bash

    - name: Clone target Helm repo
      run: |
        git clone https://github.com/jaypyles/helm.git target-repo
        cd target-repo
        git config user.name "github-actions"
        git config user.email "github-actions@github.com"
        git fetch origin gh-pages # Fetch gh-pages explicitly
        git checkout gh-pages # Checkout gh-pages branch
        git pull origin gh-pages # Pull latest changes from gh-pages
      shell: bash

    - name: Copy package and update index
      run: |
        APP_NAME="scraperr"
        mkdir -p target-repo/charts/$APP_NAME
        cp packaged/*.tgz target-repo/charts/$APP_NAME/
        cd target-repo/charts/$APP_NAME
        helm repo index . --url https://jaypyles.github.io/helm/charts/$APP_NAME
      shell: bash

    - name: Commit and push to target repo
      run: |
        cd target-repo
        git add charts/
        git commit -m "Update $APP_NAME chart $(date +'%Y-%m-%d %H:%M:%S')" || echo "No changes"
        git push https://x-access-token:${{ inputs.app-repo-token }}@github.com/jaypyles/helm.git gh-pages
      shell: bash
.github/workflows/docker-image.yml (vendored, 29 lines changed)
@@ -8,12 +8,18 @@ on:

jobs:
  build:
    if: ${{ github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.head_branch == 'master' }}
    if: ${{ github.event.workflow_run.conclusion == 'success' && github.ref == 'refs/heads/master' }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Get version from helm chart
        run: |
          VERSION=$(grep "version:" ./helm/Chart.yaml | cut -d: -f2 | tr -d ' ')
          echo "VERSION=$VERSION" >> $GITHUB_ENV
          echo "Version is $VERSION"

      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
@@ -29,7 +35,9 @@ jobs:
          context: .
          file: ./docker/frontend/Dockerfile
          push: true
          tags: ${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:latest
          tags: |
            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:latest
            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:${{ env.VERSION }}

      - name: Build and push api
        uses: docker/build-push-action@v5
@@ -37,12 +45,27 @@ jobs:
          context: .
          file: ./docker/api/Dockerfile
          push: true
          tags: ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest
          tags: |
            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest
            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:${{ env.VERSION }}

  push-helm-chart:
    runs-on: ubuntu-latest
    needs:
      - build
    steps:
      - uses: actions/checkout@v4

      - name: Push Helm Chart
        uses: ./.github/actions/push-to-helm
        with:
          app-repo-token: ${{ secrets.GPAT_TOKEN }}

  success-message:
    runs-on: ubuntu-latest
    needs:
      - build
      - push-helm-chart
    steps:
      - name: Send Discord Message
        uses: jaypyles/discord-webhook-action@v1.0.0
.github/workflows/unit-tests.yml (vendored, 3 lines changed)
@@ -26,6 +26,9 @@ jobs:
      - name: Install project dependencies
        run: pdm install

      - name: Install playwright
        run: pdm run playwright install

      - name: Run tests
        run: PYTHONPATH=. pdm run pytest api/backend/tests
.prettierignore (new file, 2 lines)
@@ -0,0 +1,2 @@
*.yaml
*.yml
FUNDING.yml (new file, 1 line)
@@ -0,0 +1 @@
custom: ["https://www.buymeacoffee.com/jaypyles"]
README.md (129 lines changed)
@@ -1,104 +1,71 @@


<div align="center">
  <img src="https://img.shields.io/badge/MongoDB-%234ea94b.svg?style=for-the-badge&logo=mongodb&logoColor=white" alt="MongoDB" />
  <img src="https://img.shields.io/badge/FastAPI-005571?style=for-the-badge&logo=fastapi" alt="FastAPI" />
  <img src="https://img.shields.io/badge/Next-black?style=for-the-badge&logo=next.js&logoColor=white" alt="Next JS" />
  <img src="https://img.shields.io/badge/tailwindcss-%2338B2AC.svg?style=for-the-badge&logo=tailwind-css&logoColor=white" alt="TailwindCSS" />
  <img src="https://github.com/jaypyles/www-scrape/blob/master/docs/logo_picture.png" alt="Scraperr Logo" width="250px">

  **A powerful self-hosted web scraping solution**

  <div>
    <img src="https://img.shields.io/badge/MongoDB-%234ea94b.svg?style=for-the-badge&logo=mongodb&logoColor=white" alt="MongoDB" />
    <img src="https://img.shields.io/badge/FastAPI-005571?style=for-the-badge&logo=fastapi" alt="FastAPI" />
    <img src="https://img.shields.io/badge/Next-black?style=for-the-badge&logo=next.js&logoColor=white" alt="Next JS" />
    <img src="https://img.shields.io/badge/tailwindcss-%2338B2AC.svg?style=for-the-badge&logo=tailwind-css&logoColor=white" alt="TailwindCSS" />
  </div>
</div>

# Summary
## 📋 Overview

Scraperr is a self-hosted web application that allows users to scrape data from web pages by specifying elements via XPath. Users can submit URLs and the corresponding elements to be scraped, and the results will be displayed in a table.
Scraperr enables you to extract data from websites with precision using XPath selectors. This self-hosted application provides a clean interface to manage scraping jobs, view results, and export data.

From the table, users can download an Excel sheet of the job's results, along with an option to rerun the job.
> 📚 **[Check out the docs](https://scraperr-docs.pages.dev)** for a comprehensive quickstart guide and detailed information.

View the [docs](https://scraperr-docs.pages.dev) for a quickstart guide and more information.
<div align="center">
  <img src="https://github.com/jaypyles/www-scrape/blob/master/docs/main_page.png" alt="Scraperr Main Interface" width="800px">
</div>

## Features
## ✨ Key Features

### Submitting URLs for Scraping
- **XPath-Based Extraction**: Precisely target page elements
- **Queue Management**: Submit and manage multiple scraping jobs
- **Domain Spidering**: Option to scrape all pages within the same domain
- **Custom Headers**: Add JSON headers to your scraping requests
- **Media Downloads**: Automatically download images, videos, and other media
- **Results Visualization**: View scraped data in a structured table format
- **Data Export**: Export your results in various formats
- **Notification Channels**: Send completion notifications through various channels

- Submit/queue URLs for web scraping
- Add and manage elements to scrape using XPath
- Scrape all pages within the same domain
- Add custom JSON headers to send in requests to URLs
- Display results of scraped data
- Download media found on the page (images, videos, etc.)
## 🚀 Getting Started


### Docker

### Managing Previous Jobs

- Download CSV containing results
- Rerun jobs
- View status of queued jobs
- Favorite and view favorited jobs



### User Management

- User login/signup to organize jobs (optional)



### Log Viewing

- View app logs inside of the web UI



### Statistics View

- View a small statistics view of jobs run



### AI Integration

- Include the results of a selected job into the context of a conversation
- Currently supports:

1. Ollama
2. OpenAI



## API Endpoints

Use this service as an API for your own projects. Because it uses FastAPI, a docs page is available at `/docs` for the API.



## Troubleshooting

Q: When running Scraperr, I'm met with "404 Page not found".
A: This is probably an issue with MongoDB related to running Scraperr in a VM. You should see something like this in `make logs`:

```
WARNING: MongoDB 5.0+ requires a CPU with AVX support, and your current system does not appear to have that!
```

```bash
make up
```

To resolve this issue, simply set the CPU type to `host`. This can be done in Proxmox in the VM settings > Processor. [Related issue](https://github.com/jaypyles/Scraperr/issues/9).
### Helm

## Legal and Ethical Considerations
> Refer to the docs for Helm deployment: https://scraperr-docs.pages.dev/guides/helm-deployment

When using Scraperr, please ensure that you:
## ⚖️ Legal and Ethical Guidelines

1. **Check Robots.txt**: Verify allowed pages by reviewing the `robots.txt` file of the target website.
2. **Compliance**: Always comply with the website's Terms of Service (ToS) regarding web scraping.
When using Scraperr, please remember to:

**Disclaimer**: This tool is intended for use only on websites that permit scraping. The author is not responsible for any misuse of this tool.
1. **Respect `robots.txt`**: Always check a website's `robots.txt` file to verify which pages permit scraping
2. **Terms of Service**: Adhere to each website's Terms of Service regarding data extraction
3. **Rate Limiting**: Implement reasonable delays between requests to avoid overloading servers

## License
> **Disclaimer**: Scraperr is intended for use only on websites that explicitly permit scraping. The creator accepts no responsibility for misuse of this tool.

## 💬 Join the Community

Get support, report bugs, and chat with other users and contributors.

👉 [Join the Scraperr Discord](https://discord.gg/89q7scsGEK)

## 📄 License

This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.

### Contributions
## 👏 Contributions

Development made easy by developing from the [webapp template](https://github.com/jaypyles/webapp-template). View documentation for extra information.
Development made easier with the [webapp template](https://github.com/jaypyles/webapp-template).

Start development server:

`make deps build up-dev`
To get started, simply run `make build up-dev`.
@@ -43,6 +43,14 @@ async def llama_chat(chat_messages: list[Message]) -> AsyncGenerator[str, None]:
async def openai_chat(
    chat_messages: Iterable[ChatCompletionMessageParam],
) -> AsyncGenerator[str, None]:
    if openai_client and not open_ai_model:
        LOG.error("OpenAI model is not set")
        yield "An error occurred while processing your request."

    if not openai_client:
        LOG.error("OpenAI client is not set")
        yield "An error occurred while processing your request."

    if openai_client and open_ai_model:
        try:
            response = openai_client.chat.completions.create(
@@ -15,7 +15,6 @@ from api.backend.ai.ai_router import ai_router
from api.backend.auth.auth_router import auth_router
from api.backend.utils import get_log_level
from api.backend.routers.job_router import job_router
from api.backend.routers.log_router import log_router
from api.backend.routers.stats_router import stats_router
from api.backend.database.startup import init_database
from fastapi.responses import JSONResponse
@@ -48,7 +47,6 @@ app.add_middleware(
app.include_router(auth_router)
app.include_router(ai_router)
app.include_router(job_router)
app.include_router(log_router)
app.include_router(stats_router)
@@ -20,9 +20,9 @@ LOG = logging.getLogger(__name__)

_ = load_dotenv()

SECRET_KEY = os.getenv("SECRET_KEY") or ""
ALGORITHM = os.getenv("ALGORITHM") or ""
ACCESS_TOKEN_EXPIRE_MINUTES = os.getenv("ACCESS_TOKEN_EXPIRE_MINUTES")
SECRET_KEY = os.getenv("SECRET_KEY") or "secret"
ALGORITHM = os.getenv("ALGORITHM") or "HS256"
ACCESS_TOKEN_EXPIRE_MINUTES = os.getenv("ACCESS_TOKEN_EXPIRE_MINUTES") or 600

pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="auth/token")
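For orientation only, a minimal sketch of how the passlib `CryptContext` configured above is typically used for hashing and verifying passwords. The `get_password_hash` name appears elsewhere in this diff; the `verify_password` helper below is a hypothetical counterpart, not necessarily the project's own.

```python
from passlib.context import CryptContext

pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")

def get_password_hash(password: str) -> str:
    # bcrypt-hash the plaintext password before storing it
    return pwd_context.hash(password)

def verify_password(plain: str, hashed: str) -> bool:
    # check a login attempt against the stored hash
    return pwd_context.verify(plain, hashed)
```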
@@ -1,5 +1,5 @@
import os
from api.backend.database.common import connect, QUERIES
from api.backend.database.common import connect, QUERIES, insert
import logging

from api.backend.auth.auth_utils import get_password_hash
@@ -31,7 +31,7 @@ def init_database():
        exit(1)

    query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)"
    _ = cursor.execute(
    _ = insert(
        query,
        (
            default_user_email,
@@ -13,3 +13,4 @@ class JobOptions(BaseModel):
    proxies: list[str] = []
    site_map: Optional[SiteMap] = None
    collect_media: bool = False
    custom_cookies: list[dict[str, Any]] = []
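As a rough illustration of what the new field accepts, a job's options might now include custom cookies alongside the existing fields. The values below are hypothetical; only the field names come from the model shown above.

```python
# Hypothetical JobOptions payload exercising the new custom_cookies field
job_options = {
    "proxies": [],
    "site_map": None,
    "collect_media": True,
    "custom_cookies": [
        # each cookie is applied to the target domain by add_custom_cookies (added below)
        {"name": "session", "value": "abc123"},
    ],
}
```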
api/backend/job/scraping/add_custom.py (new file, 48 lines)
@@ -0,0 +1,48 @@
from typing import Any, Optional
from urllib.parse import urlparse

from playwright.async_api import Page, BrowserContext

import logging

LOG = logging.getLogger(__name__)


async def add_custom_cookies(
    custom_cookies: list[dict[str, Any]],
    url: str,
    context: BrowserContext,
) -> None:
    parsed_url = urlparse(url)
    domain = parsed_url.netloc

    for cookie in custom_cookies:
        cookie_dict = {
            "name": cookie.get("name", "default_name"),
            "value": cookie.get("value", "default_value"),
            "domain": domain,
            "path": "/",
        }

        LOG.info(f"Adding cookie: {cookie_dict}")
        await context.add_cookies([cookie_dict])  # type: ignore


async def add_custom_headers(
    custom_headers: dict[str, Any],
    page: Page,
) -> None:
    await page.set_extra_http_headers(custom_headers)


async def add_custom_items(
    url: str,
    page: Page,
    cookies: Optional[list[dict[str, Any]]] = None,
    headers: Optional[dict[str, Any]] = None,
) -> None:
    if cookies:
        await add_custom_cookies(cookies, url, page.context)

    if headers:
        await add_custom_headers(headers, page)
@@ -1,14 +1,15 @@
import os
import requests
from pathlib import Path
from selenium.webdriver.common.by import By
from seleniumwire import webdriver
from urllib.parse import urlparse
from typing import Dict, List

import aiohttp
from playwright.async_api import Page

from api.backend.utils import LOG


def collect_media(driver: webdriver.Chrome):
async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
    media_types = {
        "images": "img",
        "videos": "video",
@@ -24,62 +25,69 @@ def collect_media(driver: webdriver.Chrome):

    media_urls = {}

    for media_type, selector in media_types.items():
        elements = driver.find_elements(By.CSS_SELECTOR, selector)
        urls: list[dict[str, str]] = []
    async with aiohttp.ClientSession() as session:
        for media_type, selector in media_types.items():
            elements = await page.query_selector_all(selector)
            urls: List[Dict[str, str]] = []

        media_dir = base_dir / media_type
        media_dir.mkdir(exist_ok=True)
            media_dir = base_dir / media_type
            media_dir.mkdir(exist_ok=True)

        for element in elements:
            if media_type == "images":
                url = element.get_attribute("src")
            elif media_type == "videos":
                url = element.get_attribute("src") or element.get_attribute("data-src")
            else:
                url = element.get_attribute("href")
            for element in elements:
                if media_type == "images":
                    url = await element.get_attribute("src")
                elif media_type == "videos":
                    url = await element.get_attribute(
                        "src"
                    ) or await element.get_attribute("data-src")
                else:
                    url = await element.get_attribute("href")

            if url and url.startswith(("http://", "https://")):
                try:
                    filename = os.path.basename(urlparse(url).path)
                if url and url.startswith("/"):
                    root_url = urlparse(page.url)
                    root_domain = f"{root_url.scheme}://{root_url.netloc}"
                    url = f"{root_domain}{url}"

                    if not filename:
                        filename = f"{media_type}_{len(urls)}"
                if url and url.startswith(("http://", "https://")):
                    try:
                        parsed = urlparse(url)
                        filename = (
                            os.path.basename(parsed.path) or f"{media_type}_{len(urls)}"
                        )

                    if media_type == "images":
                        filename += ".jpg"
                    elif media_type == "videos":
                        filename += ".mp4"
                    elif media_type == "audio":
                        filename += ".mp3"
                    elif media_type == "pdfs":
                        filename += ".pdf"
                    elif media_type == "documents":
                        filename += ".doc"
                    elif media_type == "presentations":
                        filename += ".ppt"
                    elif media_type == "spreadsheets":
                        filename += ".xls"
                        if "." not in filename:
                            ext = {
                                "images": ".jpg",
                                "videos": ".mp4",
                                "audio": ".mp3",
                                "pdfs": ".pdf",
                                "documents": ".doc",
                                "presentations": ".ppt",
                                "spreadsheets": ".xls",
                            }.get(media_type, "")
                            filename += ext

                    response = requests.get(url, stream=True)
                    response.raise_for_status()
                        file_path = media_dir / filename

                    # Save the file
                    file_path = media_dir / filename
                    with open(file_path, "wb") as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)
                        async with session.get(url) as response:
                            response.raise_for_status()
                            with open(file_path, "wb") as f:
                                while True:
                                    chunk = await response.content.read(8192)
                                    if not chunk:
                                        break
                                    f.write(chunk)

                    urls.append({"url": url, "local_path": str(file_path)})
                    LOG.info(f"Downloaded {filename} to {file_path}")
                        urls.append({"url": url, "local_path": str(file_path)})
                        LOG.info(f"Downloaded {filename} to {file_path}")

                except Exception as e:
                    LOG.error(f"Error downloading {url}: {str(e)}")
                    continue
                    except Exception as e:
                        LOG.error(f"Error downloading {url}: {str(e)}")
                        continue

        media_urls[media_type] = urls
            media_urls[media_type] = urls

    # Write summary
    with open(base_dir / "download_summary.txt", "w") as f:
        for media_type, downloads in media_urls.items():
            if downloads:
@@ -1,41 +1,32 @@
import time
from typing import cast

from seleniumwire import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import asyncio
from typing import Set, Tuple
from playwright.async_api import Page

from api.backend.utils import LOG

from api.backend.job.scraping.collect_media import collect_media as collect_media_utils


def scrape_content(
    driver: webdriver.Chrome, pages: set[tuple[str, str]], collect_media: bool
):
    _ = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )
async def scrape_content(
    page: Page, pages: Set[Tuple[str, str]], collect_media: bool
) -> str:
    last_height = await page.evaluate("document.body.scrollHeight")

    last_height = cast(str, driver.execute_script("return document.body.scrollHeight"))
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        time.sleep(3)  # Wait for the page to load
        new_height = cast(
            str, driver.execute_script("return document.body.scrollHeight")
        )
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        await asyncio.sleep(3)
        new_height = await page.evaluate("document.body.scrollHeight")

        if new_height == last_height:
            break

        last_height = new_height

    pages.add((driver.page_source, driver.current_url))
    html = await page.content()
    pages.add((html, page.url))

    if collect_media:
        LOG.info("Collecting media")
        collect_media_utils(driver)
        await collect_media_utils(page)

    return driver.page_source
    return html
@@ -1,25 +1,19 @@
from api.backend.job.models.site_map import Action, SiteMap
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from typing import Any
import logging
import time
import asyncio
from copy import deepcopy
from typing import Any

from playwright.async_api import Page

from api.backend.job.models.site_map import Action, SiteMap
from api.backend.job.scraping.scraping_utils import scrape_content
from selenium.webdriver.support.ui import WebDriverWait
from seleniumwire.inspect import TimeoutException
from seleniumwire.webdriver import Chrome
from selenium.webdriver.support import expected_conditions as EC

LOG = logging.getLogger(__name__)


def clear_done_actions(site_map: dict[str, Any]):
def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]:
    """Clear all actions that have been clicked."""
    cleared_site_map = deepcopy(site_map)

    cleared_site_map["actions"] = [
        action for action in cleared_site_map["actions"] if not action["do_once"]
    ]
@@ -27,43 +21,29 @@ def clear_done_actions(site_map: dict[str, Any]):
    return cleared_site_map


def handle_input(action: Action, driver: webdriver.Chrome):
async def handle_input(action: Action, page: Page) -> bool:
    try:
        element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, action.xpath))
        )
        LOG.info(f"Sending keys: {action.input} to element: {element}")

        element.send_keys(action.input)

    except NoSuchElementException:
        LOG.info(f"Element not found: {action.xpath}")
        return False

    except TimeoutException:
        LOG.info(f"Timeout waiting for element: {action.xpath}")
        return False

        element = page.locator(f"xpath={action.xpath}")
        await element.wait_for(state="visible", timeout=10000)
        LOG.info(f"Sending keys: {action.input} to element: {action.xpath}")
        await element.fill(action.input)
        return True
    except Exception as e:
        LOG.info(f"Error handling input: {e}")
        LOG.warning(f"Error handling input for xpath '{action.xpath}': {e}")
        return False

    return True


def handle_click(action: Action, driver: webdriver.Chrome):
async def handle_click(action: Action, page: Page) -> bool:
    try:
        element = driver.find_element(By.XPATH, action.xpath)
        LOG.info(f"Clicking element: {element}")

        element.click()

    except NoSuchElementException:
        LOG.info(f"Element not found: {action.xpath}")
        element = page.locator(f"xpath={action.xpath}")
        await element.wait_for(state="visible", timeout=10000)
        LOG.info(f"Clicking element: {action.xpath}")
        await element.click()
        return True
    except Exception as e:
        LOG.warning(f"Error clicking element at xpath '{action.xpath}': {e}")
        return False

    return True


ACTION_MAP = {
    "click": handle_click,
@@ -73,21 +53,26 @@ ACTION_MAP = {

async def handle_site_mapping(
    site_map_dict: dict[str, Any],
    driver: Chrome,
    page: Page,
    pages: set[tuple[str, str]],
    collect_media: bool = False,
):
    site_map = SiteMap(**site_map_dict)

    for action in site_map.actions:
        action_handler = ACTION_MAP[action.type]
        if not action_handler(action, driver):
        success = await action_handler(action, page)

        if not success:
            return

        time.sleep(2)
        await asyncio.sleep(2)

    _ = scrape_content(driver, pages)
    await scrape_content(page, pages, collect_media=collect_media)

    cleared_site_map_dict = clear_done_actions(site_map_dict)

    if cleared_site_map_dict["actions"]:
        await handle_site_mapping(cleared_site_map_dict, driver, pages)
        await handle_site_mapping(
            cleared_site_map_dict, page, pages, collect_media=collect_media
        )
api/backend/job/utils/clean_job_format.py (new file, 36 lines)
@@ -0,0 +1,36 @@
from typing import Any

from api.backend.utils import clean_text


def clean_job_format(jobs: list[dict[str, Any]]) -> dict[str, Any]:
    """
    Convert a single job to a dictionary format.
    """
    headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]

    cleaned_rows = []

    for job in jobs:
        for res in job["result"]:
            for url, elements in res.items():
                for element_name, values in elements.items():
                    for value in values:
                        text = clean_text(value.get("text", "")).strip()
                        if text:
                            cleaned_rows.append(
                                {
                                    "id": job.get("id", ""),
                                    "url": url,
                                    "element_name": element_name,
                                    "xpath": value.get("xpath", ""),
                                    "text": text,
                                    "user": job.get("user", ""),
                                    "time_created": job.get("time_created", ""),
                                }
                            )

    return {
        "headers": headers,
        "rows": cleaned_rows,
    }
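For orientation, a small hypothetical input/output sketch for the helper above; the field values are made up, only the nesting mirrors the job result shape used throughout this diff.

```python
# Hypothetical job record shaped like the results clean_job_format expects
jobs = [
    {
        "id": "job-1",
        "user": "user@example.com",
        "time_created": "2025-01-01T00:00:00",
        "result": [
            {"https://example.com": {"title": [{"xpath": "//h1", "text": "Hello"}]}}
        ],
    }
]

table = clean_job_format(jobs)
# table["headers"] -> ["id", "url", "element_name", "xpath", "text", "user", "time_created"]
# table["rows"][0]["text"] -> "Hello"
```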
api/backend/job/utils/stream_md_from_job_results.py (new file, 24 lines)
@@ -0,0 +1,24 @@
from typing import Any

from api.backend.utils import clean_text


def stream_md_from_job_results(jobs: list[dict[str, Any]]):
    md = "# Job Results Summary\n\n"
    for i, job in enumerate(jobs, start=1):
        md += f"## Job #{i}\n"
        yield f"- **Job URL:** {job.get('url', 'N/A')}\n"
        yield f"- **Timestamp:** {job.get('time_created', 'N/A')}\n"
        yield f"- **ID:** {job.get('id', 'N/A')}\n"
        yield "### Extracted Results:\n"

        for res in job.get("result", []):
            for url, elements in res.items():
                yield f"\n#### URL: {url}\n"
                for element_name, values in elements.items():
                    for value in values:
                        text = clean_text(value.get("text", "")).strip()
                        if text:
                            yield f"- **Element:** `{element_name}`\n"
                            yield f"  - **Text:** {text}\n"
        yield "\n---\n"
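A quick, hypothetical sketch of consuming this generator by streaming its chunks straight into a Markdown file; the `jobs` list is the same shape as in the sketch above.

```python
# Write the streamed Markdown fragments to a file as they are yielded
with open("export.md", "w", encoding="utf-8") as fh:
    for chunk in stream_md_from_job_results(jobs):
        fh.write(chunk)
```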
@@ -1,5 +1,5 @@
# STL
from typing import Any, Optional, Union
from typing import Any, Literal, Optional, Union
from datetime import datetime

# LOCAL
@@ -27,6 +27,7 @@ class RetrieveScrapeJobs(pydantic.BaseModel):

class DownloadJob(pydantic.BaseModel):
    ids: list[str]
    job_format: Literal["csv", "md"]


class DeleteScrapeJobs(pydantic.BaseModel):
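As a rough illustration of how a client might exercise the new `job_format` field against the `/download` route shown further down in this diff; the job ID and host are hypothetical (port 8000 matches the compose file later in the diff).

```python
import requests

# Hypothetical request; the backend streams back a CSV or Markdown attachment
payload = {"ids": ["some-job-id"], "job_format": "md"}
resp = requests.post("http://localhost:8000/download", json=payload)
resp.raise_for_status()

with open("export.md", "wb") as fh:
    fh.write(resp.content)
```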
@@ -39,6 +39,9 @@ from api.backend.job.cron_scheduling.cron_scheduling import (
    insert_job_from_cron_job,
)

from api.backend.job.utils.clean_job_format import clean_job_format
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results

LOG = logging.getLogger(__name__)

job_router = APIRouter()
@@ -104,41 +107,72 @@ async def download(download_job: DownloadJob):
        )
        results = query(job_query, tuple(download_job.ids))

        csv_buffer = StringIO()
        csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
        if download_job.job_format == "csv":
            csv_buffer = StringIO()
            csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)

        headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]
        csv_writer.writerow(headers)
            headers = [
                "id",
                "url",
                "element_name",
                "xpath",
                "text",
                "user",
                "time_created",
            ]
            csv_writer.writerow(headers)

        for result in results:
            for res in result["result"]:
                for url, elements in res.items():
                    for element_name, values in elements.items():
                        for value in values:
                            text = clean_text(value.get("text", "")).strip()
                            if text:
                                csv_writer.writerow(
                                    [
                                        result.get("id", "")
                                        + "-"
                                        + str(random.randint(0, 1000000)),
                                        url,
                                        element_name,
                                        value.get("xpath", ""),
                                        text,
                                        result.get("user", ""),
                                        result.get("time_created", ""),
                                    ]
                                )
            for result in results:
                for res in result["result"]:
                    for url, elements in res.items():
                        for element_name, values in elements.items():
                            for value in values:
                                text = clean_text(value.get("text", "")).strip()
                                if text:
                                    csv_writer.writerow(
                                        [
                                            result.get("id", "")
                                            + "-"
                                            + str(random.randint(0, 1000000)),
                                            url,
                                            element_name,
                                            value.get("xpath", ""),
                                            text,
                                            result.get("user", ""),
                                            result.get("time_created", ""),
                                        ]
                                    )

        _ = csv_buffer.seek(0)
        response = StreamingResponse(
            csv_buffer,
            media_type="text/csv",
        )
        response.headers["Content-Disposition"] = "attachment; filename=export.csv"
        return response
            _ = csv_buffer.seek(0)
            response = StreamingResponse(
                csv_buffer,
                media_type="text/csv",
            )
            response.headers["Content-Disposition"] = "attachment; filename=export.csv"
            return response

        elif download_job.job_format == "md":
            response = StreamingResponse(
                stream_md_from_job_results(results),
                media_type="text/markdown",
            )

            response.headers["Content-Disposition"] = "attachment; filename=export.md"
            return response

    except Exception as e:
        LOG.error(f"Exception occurred: {e}")
        traceback.print_exc()
        return {"error": str(e)}


@job_router.get("/job/{id}/convert-to-csv")
async def convert_to_csv(id: str):
    try:
        job_query = f"SELECT * FROM jobs WHERE id = ?"
        results = query(job_query, (id,))

        return JSONResponse(content=clean_job_format(results))
    except Exception as e:
        LOG.error(f"Exception occurred: {e}")
        traceback.print_exc()
@@ -1,46 +0,0 @@
# STL
import logging
import docker

# PDM
from fastapi import APIRouter, HTTPException
from fastapi.responses import JSONResponse, StreamingResponse


LOG = logging.getLogger(__name__)

log_router = APIRouter()

client = docker.from_env()


@log_router.get("/initial_logs")
async def get_initial_logs():
    container_id = "scraperr_api"

    try:
        container = client.containers.get(container_id)
        log_stream = container.logs(stream=False).decode("utf-8")
        return JSONResponse(content={"logs": log_stream})
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")


@log_router.get("/logs")
async def get_own_logs():
    container_id = "scraperr_api"

    try:
        container = client.containers.get(container_id)
        log_stream = container.logs(stream=True, follow=True)

        def log_generator():
            try:
                for log in log_stream:
                    yield f"data: {log.decode('utf-8')}\n\n"
            except Exception as e:
                yield f"data: {str(e)}\n\n"

        return StreamingResponse(log_generator(), media_type="text/event-stream")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@@ -1,28 +1,22 @@
import logging
from typing import Any, Optional
import random
from typing import Any, Optional, cast

from bs4 import BeautifulSoup, Tag
from lxml import etree
from seleniumwire import webdriver
from lxml.etree import _Element
from fake_useragent import UserAgent
from selenium.webdriver.chrome.options import Options as ChromeOptions
from camoufox import AsyncCamoufox
from playwright.async_api import Page
from urllib.parse import urlparse, urljoin

from api.backend.models import Element, CapturedElement
from api.backend.job.site_mapping.site_mapping import (
    handle_site_mapping,
)
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from api.backend.job.scraping.scraping_utils import scrape_content
from api.backend.job.site_mapping.site_mapping import handle_site_mapping

from api.backend.job.scraping.add_custom import add_custom_items

LOG = logging.getLogger(__name__)


class HtmlElement(_Element): ...


def is_same_domain(url: str, original_url: str) -> bool:
    parsed_url = urlparse(url)
    parsed_original_url = urlparse(original_url)
@@ -31,68 +25,15 @@ def is_same_domain(url: str, original_url: str) -> bool:

def clean_xpath(xpath: str) -> str:
    parts = xpath.split("/")
    clean_parts: list[str] = []
    for part in parts:
        if part == "":
            clean_parts.append("/")
        else:
            clean_parts.append(part)
    clean_xpath = "//".join(clean_parts).replace("////", "//")
    clean_xpath = clean_xpath.replace("'", "\\'")
    clean_parts = ["/" if part == "" else part for part in parts]
    clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'")
    LOG.info(f"Cleaned xpath: {clean_xpath}")

    return clean_xpath


def sxpath(context: _Element, xpath: str) -> list[HtmlElement]:
    return context.xpath(xpath)  # pyright: ignore [reportReturnType]


def interceptor(headers: dict[str, Any]):
    def _interceptor(request: Any):
        for key, val in headers.items():
            if request.headers.get(key):
                del request.headers[key]
            request.headers[key] = val
        if "sec-ch-ua" in request.headers:
            original_value = request.headers["sec-ch-ua"]
            del request.headers["sec-ch-ua"]
            modified_value = original_value.replace("HeadlessChrome", "Chrome")
            request.headers["sec-ch-ua"] = modified_value

    return _interceptor


def create_driver(proxies: Optional[list[str]] = []):
    ua = UserAgent()
    chrome_options = ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument(f"user-agent={ua.random}")

    sw_options = {}

    if proxies:
        selected_proxy = random.choice(proxies)
        LOG.info(f"Using proxy: {selected_proxy}")

        sw_options = {
            "proxy": {
                "https": f"https://{selected_proxy}",
                "http": f"http://{selected_proxy}",
                "no_proxy": "localhost,127.0.0.1",
            }
        }

    service = Service(ChromeDriverManager().install())

    driver = webdriver.Chrome(
        service=service,
        options=chrome_options,
        seleniumwire_options=sw_options,
    )

    return driver
def sxpath(context: etree._Element, xpath: str):
    return context.xpath(xpath)


async def make_site_request(
@@ -102,91 +43,108 @@
    visited_urls: set[str] = set(),
    pages: set[tuple[str, str]] = set(),
    original_url: str = "",
    proxies: Optional[list[str]] = [],
    proxies: Optional[list[str]] = None,
    site_map: Optional[dict[str, Any]] = None,
    collect_media: bool = False,
) -> None:
    """Make basic `GET` request to site using Selenium."""
    # Check if URL has already been visited
    custom_cookies: Optional[list[dict[str, Any]]] = None,
):
    if url in visited_urls:
        return

    driver = create_driver(proxies)
    driver.implicitly_wait(10)
    proxy = None

    if headers:
        driver.request_interceptor = interceptor(headers)
    if proxies:
        proxy = random.choice(proxies)
        LOG.info(f"Using proxy: {proxy}")

    async with AsyncCamoufox(headless=True, proxy=proxy) as browser:
        page: Page = await browser.new_page()

        # Add cookies and headers
        await add_custom_items(url, page, custom_cookies, headers)

    try:
        LOG.info(f"Visiting URL: {url}")
        driver.get(url)

        final_url = driver.current_url
        visited_urls.add(url)
        visited_urls.add(final_url)
        try:
            await page.goto(url, timeout=60000)
            await page.wait_for_load_state("networkidle", timeout=10000)

        page_source = scrape_content(driver, pages, collect_media)
            final_url = page.url

        if site_map:
            LOG.info("Site map: %s", site_map)
            _ = await handle_site_mapping(
                site_map,
                driver,
                pages,
            )
    finally:
        driver.quit()
            visited_urls.add(url)
            visited_urls.add(final_url)

            html_content = await scrape_content(page, pages, collect_media)

            html_content = await page.content()
            pages.add((html_content, final_url))

            if site_map:
                await handle_site_mapping(
                    site_map, page, pages, collect_media=collect_media
                )

        finally:
            await page.close()
            await browser.close()

    if not multi_page_scrape:
        return

    soup = BeautifulSoup(page_source, "html.parser")
    soup = BeautifulSoup(html_content, "html.parser")

    for a_tag in soup.find_all("a"):
        if not isinstance(a_tag, Tag):
            continue

        link = str(a_tag.get("href", ""))
        link = cast(str, a_tag.get("href", ""))

        if link:
            if not urlparse(link).netloc:
                base_url = "{0.scheme}://{0.netloc}".format(urlparse(final_url))
                link = urljoin(base_url, link)
        if not link:
            continue

            if link not in visited_urls and is_same_domain(link, original_url):
                await make_site_request(
                    link,
                    headers=headers,
                    multi_page_scrape=multi_page_scrape,
                    visited_urls=visited_urls,
                    pages=pages,
                    original_url=original_url,
                )
        if not urlparse(link).netloc:
            base_url = "{0.scheme}://{0.netloc}".format(urlparse(final_url))
            link = urljoin(base_url, link)

        if link not in visited_urls and is_same_domain(link, original_url):
            await make_site_request(
                link,
                headers=headers,
                multi_page_scrape=multi_page_scrape,
                visited_urls=visited_urls,
                pages=pages,
                original_url=original_url,
                proxies=proxies,
                site_map=site_map,
                collect_media=collect_media,
                custom_cookies=custom_cookies,
            )


async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element]):
    soup = BeautifulSoup(page[0], "lxml")
    root = etree.HTML(str(soup))

    elements: dict[str, list[CapturedElement]] = dict()
    elements: dict[str, list[CapturedElement]] = {}

    for elem in xpaths:
        el = sxpath(root, elem.xpath)

        for e in el:
            if isinstance(e, etree._Element):  # type: ignore
                text = "\t".join(str(t) for t in e.itertext())
            else:
                text = str(e)
        for e in el:  # type: ignore
            text = (
                "\t".join(str(t) for t in e.itertext())
                if isinstance(e, etree._Element)
                else str(e)  # type: ignore
            )

            captured_element = CapturedElement(
                xpath=elem.xpath, text=text, name=elem.name
            )

            if elem.name in elements:
                elements[elem.name].append(captured_element)
                continue

            elements[elem.name] = [captured_element]
            else:
                elements[elem.name] = [captured_element]

    return {page[1]: elements}

@@ -194,18 +152,19 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])

async def scrape(
    url: str,
    xpaths: list[Element],
    headers: Optional[dict[str, Any]],
    headers: Optional[dict[str, Any]] = None,
    multi_page_scrape: bool = False,
    proxies: Optional[list[str]] = [],
    proxies: Optional[list[str]] = None,
    site_map: Optional[dict[str, Any]] = None,
    collect_media: bool = False,
    custom_cookies: Optional[list[dict[str, Any]]] = None,
):
    visited_urls: set[str] = set()
    pages: set[tuple[str, str]] = set()

    _ = await make_site_request(
    await make_site_request(
        url,
        headers,
        headers=headers,
        multi_page_scrape=multi_page_scrape,
        visited_urls=visited_urls,
        pages=pages,
@@ -213,9 +172,10 @@ async def scrape(
        proxies=proxies,
        site_map=site_map,
        collect_media=collect_media,
        custom_cookies=custom_cookies,
    )

    elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()
    elements: list[dict[str, dict[str, list[CapturedElement]]]] = []

    for page in pages:
        elements.append(await collect_scraped_elements(page, xpaths))
@@ -21,7 +21,7 @@ async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
    mock_randint.return_value = mocked_random_int

    # Create a DownloadJob instance
    download_job = DownloadJob(ids=[mocked_job["id"]])
    download_job = DownloadJob(ids=[mocked_job["id"]], job_format="csv")

    # Make a POST request to the /download endpoint
    response = client.post("/download", json=download_job.model_dump())
@@ -1,27 +1,53 @@
import pytest
import logging
from unittest.mock import AsyncMock, patch, MagicMock
from api.backend.scraping import create_driver
from typing import Dict
from playwright.async_api import async_playwright, Cookie, Route
from api.backend.job.scraping.add_custom import add_custom_items

logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger(__name__)


@pytest.mark.asyncio
@patch("seleniumwire.webdriver.Chrome.get")
async def test_proxy(mock_get: AsyncMock):
    # Mock the response of the requests.get call
    mock_response = MagicMock()
    mock_get.return_value = mock_response
async def test_add_custom_items():
    test_cookies = [{"name": "big", "value": "cookie"}]
    test_headers = {"User-Agent": "test-agent", "Accept": "application/json"}

    driver = create_driver(proxies=["127.0.0.1:8080"])
    assert driver is not None
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()

        # Simulate a request
        driver.get("http://example.com")
        response = driver.last_request
        # Set up request interception
        captured_headers: Dict[str, str] = {}

        if response:
            assert response.headers["Proxy-Connection"] == "keep-alive"
        async def handle_route(route: Route) -> None:
            nonlocal captured_headers
            captured_headers = route.request.headers
            await route.continue_()

        driver.quit()
        await page.route("**/*", handle_route)

        await add_custom_items(
            url="http://example.com",
            page=page,
            cookies=test_cookies,
            headers=test_headers,
        )

        # Navigate to example.com
        await page.goto("http://example.com")

        # Verify cookies were added
        cookies: list[Cookie] = await page.context.cookies()
        test_cookie = next((c for c in cookies if c.get("name") == "big"), None)

        assert test_cookie is not None
        assert test_cookie.get("value") == "cookie"
        assert test_cookie.get("path") == "/"  # Default path should be set
        assert test_cookie.get("sameSite") == "Lax"  # Default sameSite should be set

        # Verify headers were added
        assert captured_headers.get("user-agent") == "test-agent"

        await browser.close()
@@ -1,33 +1,59 @@
import os
import json

from api.backend.job import get_queued_job, update_job
from api.backend.scraping import scrape
from api.backend.models import Element
from fastapi.encoders import jsonable_encoder

import asyncio
import logging
import sys
import traceback

from api.backend.database.startup import init_database

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
LOG = logging.getLogger(__name__)
from api.backend.worker.post_job_complete.post_job_complete import post_job_complete
from api.backend.worker.logger import LOG


NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "")
NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "")
SCRAPERR_FRONTEND_URL = os.getenv("SCRAPERR_FRONTEND_URL", "")
EMAIL = os.getenv("EMAIL", "")
TO = os.getenv("TO", "")
SMTP_HOST = os.getenv("SMTP_HOST", "")
SMTP_PORT = int(os.getenv("SMTP_PORT", 587))
SMTP_USER = os.getenv("SMTP_USER", "")
SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
USE_TLS = os.getenv("USE_TLS", "false").lower() == "true"


async def process_job():
    job = await get_queued_job()
    status = "Queued"

    if job:
        LOG.info(f"Beginning processing job: {job}.")
        try:
            _ = await update_job([job["id"]], field="status", value="Scraping")

            proxies = job["job_options"]["proxies"]

            if proxies and isinstance(proxies[0], str) and proxies[0].startswith("{"):
                try:
                    proxies = [json.loads(p) for p in proxies]
                except json.JSONDecodeError:
                    LOG.error(f"Failed to parse proxy JSON: {proxies}")
                    proxies = []

            scraped = await scrape(
                job["url"],
                [Element(**j) for j in job["elements"]],
                job["job_options"]["custom_headers"],
                job["job_options"]["multi_page_scrape"],
                job["job_options"]["proxies"],
                proxies,
                job["job_options"]["site_map"],
                job["job_options"]["collect_media"],
                job["job_options"]["custom_cookies"],
            )
            LOG.info(
                f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
@@ -36,10 +62,30 @@ async def process_job():
                [job["id"]], field="result", value=jsonable_encoder(scraped)
            )
            _ = await update_job([job["id"]], field="status", value="Completed")
            status = "Completed"

        except Exception as e:
            _ = await update_job([job["id"]], field="status", value="Failed")
            _ = await update_job([job["id"]], field="result", value=e)
            LOG.error(f"Exception as occured: {e}\n{traceback.print_exc()}")
            status = "Failed"
        finally:
            job["status"] = status
            await post_job_complete(
                job,
                {
                    "channel": NOTIFICATION_CHANNEL,
                    "webhook_url": NOTIFICATION_WEBHOOK_URL,
                    "scraperr_frontend_url": SCRAPERR_FRONTEND_URL,
                    "email": EMAIL,
                    "to": TO,
                    "smtp_host": SMTP_HOST,
                    "smtp_port": SMTP_PORT,
                    "smtp_user": SMTP_USER,
                    "smtp_password": SMTP_PASSWORD,
                    "use_tls": USE_TLS,
                },
            )


async def main():
api/backend/worker/logger.py (new file, 12 lines)
@@ -0,0 +1,12 @@
import logging
import os

from api.backend.utils import get_log_level

logging.basicConfig(
    level=get_log_level(os.getenv("LOG_LEVEL")),
    format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
    handlers=[logging.StreamHandler()],
)

LOG = logging.getLogger(__name__)
api/backend/worker/post_job_complete/discord_notification.py (new file, 56 lines)
@@ -0,0 +1,56 @@
import json
from typing import Any

import requests

from api.backend.worker.logger import LOG
from api.backend.worker.post_job_complete.models import (
    PostJobCompleteOptions,
    JOB_COLOR_MAP,
)


def discord_notification(job: dict[str, Any], options: PostJobCompleteOptions):
    webhook_url = options["webhook_url"]
    scraperr_frontend_url = options["scraperr_frontend_url"]

    LOG.info(f"Sending discord notification to {webhook_url}")

    embed = {
        "title": "Job Completed",
        "description": "Scraping job has been completed.",
        "color": JOB_COLOR_MAP[job["status"]],
        "url": f"{scraperr_frontend_url}/jobs?search={job['id']}&type=id",
        "image": {
            "url": "https://github.com/jaypyles/Scraperr/raw/master/docs/logo_picture.png",
        },
        "author": {
            "name": "Scraperr",
            "url": "https://github.com/jaypyles/Scraperr",
        },
        "fields": [
            {
                "name": "Status",
                "value": "Completed",
                "inline": True,
            },
            {
                "name": "URL",
                "value": job["url"],
                "inline": True,
            },
            {
                "name": "ID",
                "value": job["id"],
                "inline": False,
            },
            {
                "name": "Options",
                "value": f"```json\n{json.dumps(job['job_options'], indent=4)}\n```",
                "inline": False,
            },
        ],
    }

    payload = {"embeds": [embed]}
    requests.post(webhook_url, json=payload)
api/backend/worker/post_job_complete/email_notifcation.py (new file, 97 lines)
@@ -0,0 +1,97 @@
import smtplib
import ssl
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import json
from typing import Any

from api.backend.worker.logger import LOG

from api.backend.worker.post_job_complete.models import (
    JOB_COLOR_MAP,
    PostJobCompleteOptions,
)


def send_job_complete_email(
    job: dict[str, Any],
    options: PostJobCompleteOptions,
):
    status = job["status"]
    status_color = JOB_COLOR_MAP.get(status, 0x808080)
    job_url = job["url"]
    job_id = job["id"]
    job_options_json = json.dumps(job["job_options"], indent=4)
    frontend_url = options["scraperr_frontend_url"]

    subject = "📦 Job Completed - Scraperr Notification"

    html = f"""
    <html>
    <body style="font-family: Arial, sans-serif;">
        <h2 style="color: #{status_color:06x};">✅ Job Completed</h2>
        <p>Scraping job has been completed successfully.</p>

        <a href="{frontend_url}/jobs?search={job_id}&type=id" target="_blank">
            <img src="https://github.com/jaypyles/Scraperr/raw/master/docs/logo_picture.png" alt="Scraperr Logo" width="200">
        </a>

        <h3>Job Info:</h3>
        <ul>
            <li><strong>Status:</strong> {status}</li>
            <li><strong>Job URL:</strong> <a href="{job_url}">{job_url}</a></li>
            <li><strong>Job ID:</strong> {job_id}</li>
        </ul>

        <h3>Options:</h3>
        <pre style="background-color:#f4f4f4; padding:10px; border-radius:5px;">
{job_options_json}
        </pre>

        <h3>View your job here:</h3>
        <a href="{options['scraperr_frontend_url']}/jobs?search={job_id}&type=id">Scraperr Job</a>

        <p style="font-size: 12px; color: gray;">
            Sent by <a href="https://github.com/jaypyles/Scraperr" target="_blank">Scraperr</a>
        </p>
    </body>
    </html>
    """

    # Create email
    message = MIMEMultipart("alternative")
    message["From"] = options["email"]
    message["To"] = options["to"]
    message["Subject"] = subject
    message.attach(
        MIMEText(
            "Job completed. View this email in HTML format for full details.", "plain"
        )
    )
    message.attach(MIMEText(html, "html"))

    context = ssl.create_default_context()

    try:
        if options["use_tls"]:
            with smtplib.SMTP(options["smtp_host"], options["smtp_port"]) as server:
                server.starttls(context=context)
                server.login(options["smtp_user"], options["smtp_password"])
                server.sendmail(
                    from_addr=options["email"],
                    to_addrs=options["to"],
                    msg=message.as_string(),
                )
        else:
            with smtplib.SMTP_SSL(
                options["smtp_host"], options["smtp_port"], context=context
            ) as server:
                server.login(options["smtp_user"], options["smtp_password"])
                server.sendmail(
                    from_addr=options["email"],
                    to_addrs=options["to"],
                    msg=message.as_string(),
                )
        LOG.info("✅ Email sent successfully!")
    except Exception as e:
        LOG.error(f"❌ Failed to send email: {e}")
api/backend/worker/post_job_complete/models.py (new file, 22 lines)
@@ -0,0 +1,22 @@
|
||||
from typing import TypedDict
|
||||
|
||||
|
||||
class PostJobCompleteOptions(TypedDict):
|
||||
channel: str
|
||||
webhook_url: str
|
||||
scraperr_frontend_url: str
|
||||
email: str
|
||||
to: str
|
||||
smtp_host: str
|
||||
smtp_port: int
|
||||
smtp_user: str
|
||||
smtp_password: str
|
||||
use_tls: bool
|
||||
|
||||
|
||||
JOB_COLOR_MAP = {
|
||||
"Queued": 0x0000FF,
|
||||
"Scraping": 0x0000FF,
|
||||
"Completed": 0x00FF00,
|
||||
"Failed": 0xFF0000,
|
||||
}
|
||||
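(Side note: the integer color codes above are rendered into CSS hex colors by the email template's "#{...:06x}" format spec; a quick sanity check, not part of the commit:)

# Demonstrates the hex formatting used in the email HTML above.
assert f"#{JOB_COLOR_MAP['Completed']:06x}" == "#00ff00"
assert f"#{0x808080:06x}" == "#808080"  # fallback used for unknown statuses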
24
api/backend/worker/post_job_complete/post_job_complete.py
Normal file
@@ -0,0 +1,24 @@
from typing import Any

from api.backend.worker.post_job_complete.models import PostJobCompleteOptions
from api.backend.worker.post_job_complete.email_notifcation import (
    send_job_complete_email,
)
from api.backend.worker.post_job_complete.discord_notification import (
    discord_notification,
)


async def post_job_complete(job: dict[str, Any], options: PostJobCompleteOptions):
    if options["channel"] == "":
        return

    if not options.values():
        return

    if options["channel"] == "discord":
        discord_notification(job, options)
    elif options["channel"] == "email":
        send_job_complete_email(job, options)
    else:
        raise ValueError(f"Invalid channel: {options['channel']}")
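(A hypothetical call site in the scrape worker, sketched only to show how the dispatcher above is meant to be awaited; the names and option values are illustrative and do not appear in this diff.)

# Hypothetical worker-side usage; the real option wiring is not shown in this commit.
async def notify_job_complete(job: dict) -> None:
    options: PostJobCompleteOptions = {
        "channel": "discord",
        "webhook_url": "https://discord.com/api/webhooks/<id>/<token>",  # placeholder
        "scraperr_frontend_url": "http://localhost:3000",
        "email": "",
        "to": "",
        "smtp_host": "",
        "smtp_port": 0,
        "smtp_user": "",
        "smtp_password": "",
        "use_tls": False,
    }
    await post_job_complete(job, options)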
@@ -24,7 +24,7 @@ describe.only("Job", () => {
      expect(interception.response.statusCode).to.eq(200);
    });

    cy.get("li").contains("Previous Jobs").click();
    cy.get("li").contains("Jobs").click();

    cy.contains("div", "https://example.com", { timeout: 10000 }).should(
      "exist"
@@ -13,5 +13,4 @@ services:
    environment:
      - LOG_LEVEL=INFO
    volumes:
      - "$PWD/api:/project/api"
      - "$PWD/scraping:/project/scraping"
      - "$PWD/api:/project/app/api"
@@ -2,7 +2,7 @@ services:
  scraperr:
    depends_on:
      - scraperr_api
    image: jpyles0524/scraperr:latest
    image: jpyles0524/scraperr:1.0.13
    build:
      context: .
      dockerfile: docker/frontend/Dockerfile
@@ -23,17 +23,14 @@ services:
      dockerfile: docker/api/Dockerfile
    environment:
      - LOG_LEVEL=INFO
      - SECRET_KEY=MRo9PfasPibnqFeK4Oswb6Z+PhFmjzdvxZzwdAkbf/Y= # used to encode authentication tokens (can be a random string)
      - ALGORITHM=HS256 # authentication encoding algorithm
      - ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token expire minutes
    container_name: scraperr_api
    ports:
      - 8000:8000
    volumes:
      - "$PWD/data:/project/data"
      - "$PWD/media:/project/media"
      - /var/run/docker.sock:/var/run/docker.sock
      - "$PWD/data:/project/app/data"
      - "$PWD/media:/project/app/media"
    networks:
      - web

networks:
  web:
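(The SECRET_KEY above can be any random string, per the inline comment; one way to generate a comparable value, shown only as an illustration:)

import base64
import os

# Prints a random, base64-encoded 32-byte key suitable for use as SECRET_KEY.
print(base64.b64encode(os.urandom(32)).decode())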
@@ -1,36 +1,36 @@
# Build python dependencies
FROM python:3.10.12-slim as pybuilder

RUN apt update && apt install -y uvicorn
RUN apt-get update && \
    apt-get install -y curl && \
    apt-get install -y uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 && \
    curl -LsSf https://astral.sh/uv/install.sh | sh && \
    apt-get remove -y curl && \
    apt-get autoremove -y && \
    rm -rf /var/lib/apt/lists/*

RUN python -m pip --no-cache-dir install pdm
RUN pdm config python.use_venv false

WORKDIR /project/app
COPY pyproject.toml pdm.lock /project/app/
RUN pdm install

RUN pdm run playwright install --with-deps

RUN pdm run camoufox fetch

COPY ./api/ /project/app/api

# Create final image
FROM python:3.10.12-slim

RUN apt-get update
RUN apt-get install -y wget gnupg supervisor
RUN wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add -
RUN sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list'
RUN apt-get update
RUN apt-get install -y google-chrome-stable

ENV PYTHONPATH=/project/pkgs
COPY --from=pybuilder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
COPY --from=pybuilder /usr/local/bin /usr/local/bin
COPY --from=pybuilder /project/app /project/

COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf

EXPOSE 8000

WORKDIR /project/
WORKDIR /project/app

RUN mkdir -p /project/app/data
RUN touch /project/app/data/database.db

CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ]
Binary file not shown.
Before: 46 KiB | After: 47 KiB
23
helm/.helmignore
Normal file
@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
24
helm/Chart.yaml
Normal file
@@ -0,0 +1,24 @@
apiVersion: v2
name: scraperr
description: A Helm chart for Kubernetes

# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.0.14

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.16.0"
56
helm/templates/deployment.yaml
Normal file
@@ -0,0 +1,56 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: scraperr
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      app: scraperr
  template:
    metadata:
      labels:
        app: scraperr
    spec:
      containers:
        - name: scraperr
          {{ if .Values.scraperr.image.repository }}
          image: "{{ .Values.scraperr.image.repository }}:{{ .Values.scraperr.image.tag }}"
          {{ else }}
          image: "{{ .Chart.Name }}:{{ .Chart.Version }}"
          {{ end }}
          imagePullPolicy: {{ .Values.scraperr.image.pullPolicy }}
          command: {{ .Values.scraperr.containerCommand | toJson }}
          ports:
            - containerPort: {{ .Values.scraperr.containerPort }}
          env: {{ toYaml .Values.scraperr.env | nindent 12 }}

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: scraperr-api
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      app: scraperr-api
  template:
    metadata:
      labels:
        app: scraperr-api
    spec:
      containers:
        - name: scraperr-api
          {{ if .Values.scraperrApi.image.repository }}
          image: "{{ .Values.scraperrApi.image.repository }}:{{ .Values.scraperrApi.image.tag }}"
          {{ else }}
          image: "{{ .Chart.Name }}:{{ .Chart.Version }}"
          {{ end }}
          imagePullPolicy: {{ .Values.scraperrApi.image.pullPolicy }}
          ports:
            - containerPort: {{ .Values.scraperrApi.containerPort }}
          env: {{ toYaml .Values.scraperrApi.env | nindent 12 }}
          volumeMounts: {{ toYaml .Values.scraperrApi.volumeMounts | nindent 12 }}
      volumes: {{ toYaml .Values.scraperrApi.volumes | nindent 12 }}
37
helm/templates/service.yaml
Normal file
@@ -0,0 +1,37 @@
---
apiVersion: v1
kind: Service
metadata:
  name: scraperr
spec:
  type: {{ .Values.scraperr.serviceType }}
  selector:
    app: scraperr
  ports:
    {{- range .Values.scraperr.ports }}
    - port: {{ .port }}
      targetPort: {{ .targetPort }}
      {{- if .nodePort }}
      nodePort: {{ .nodePort }}
      {{- end }}
      protocol: {{ .protocol | default "TCP" }}
    {{- end }}

---
apiVersion: v1
kind: Service
metadata:
  name: scraperr-api
spec:
  type: {{ .Values.scraperrApi.serviceType }}
  selector:
    app: scraperr-api
  ports:
    {{- range .Values.scraperrApi.ports }}
    - port: {{ .port }}
      targetPort: {{ .targetPort }}
      {{- if .nodePort }}
      nodePort: {{ .nodePort }}
      {{- end }}
      protocol: {{ .protocol | default "TCP" }}
    {{- end }}
47
helm/values.yaml
Normal file
@@ -0,0 +1,47 @@
scraperr:
  image:
    repository: jpyles0524/scraperr
    tag: latest
    pullPolicy: IfNotPresent
  containerCommand: ["npm", "run", "start"]
  containerPort: 3000
  serviceType: NodePort
  ports:
    - port: 80
      targetPort: 3000
      nodePort: 32300
      protocol: TCP
  env:
    - name: NEXT_PUBLIC_API_URL
      value: "http://scraperr-api:8000"
    - name: SERVER_URL
      value: "http://scraperr-api:8000"

scraperrApi:
  image:
    repository: jpyles0524/scraperr_api
    tag: latest
    pullPolicy: IfNotPresent
  containerPort: 8000
  serviceType: ClusterIP
  ports:
    - port: 8000
      targetPort: 8000
      protocol: TCP
  env:
    - name: LOG_LEVEL
      value: "INFO"
  volumeMounts:
    - name: data
      mountPath: /project/app/data
    - name: media
      mountPath: /project/app/media
  volumes:
    - name: data
      hostPath:
        path: /data/scraperr/data
        type: DirectoryOrCreate
    - name: media
      hostPath:
        path: /data/scraperr/media

replicaCount: 1
389
pdm.lock
generated
389
pdm.lock
generated
@@ -5,7 +5,7 @@
|
||||
groups = ["default", "dev"]
|
||||
strategy = ["inherit_metadata"]
|
||||
lock_version = "4.5.0"
|
||||
content_hash = "sha256:1d142e8b44e3a6a04135c54e1967b7c19c5c7ccd6b2ff8ec8bca8792bf961bb9"
|
||||
content_hash = "sha256:cb37fedd6d022515dde14e475588a8da2144ba22e41dfdfacfe3f7a7d14486ca"
|
||||
|
||||
[[metadata.targets]]
|
||||
requires_python = ">=3.10"
|
||||
@@ -457,6 +457,21 @@ files = [
|
||||
{file = "Brotli-1.1.0.tar.gz", hash = "sha256:81de08ac11bcb85841e440c13611c00b67d3bf82698314928d0b676362546724"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "browserforge"
|
||||
version = "1.2.3"
|
||||
requires_python = "<4.0,>=3.8"
|
||||
summary = "Intelligent browser header & fingerprint generator"
|
||||
groups = ["default"]
|
||||
dependencies = [
|
||||
"click",
|
||||
"typing-extensions; python_version < \"3.10\"",
|
||||
]
|
||||
files = [
|
||||
{file = "browserforge-1.2.3-py3-none-any.whl", hash = "sha256:a6c71ed4688b2f1b0bee757ca82ddad0007cbba68a71eca66ca607dde382f132"},
|
||||
{file = "browserforge-1.2.3.tar.gz", hash = "sha256:d5bec6dffd4748b30fbac9f9c1ef33b26c01a23185240bf90011843e174b7ecc"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bs4"
|
||||
version = "0.0.2"
|
||||
@@ -470,6 +485,34 @@ files = [
|
||||
{file = "bs4-0.0.2.tar.gz", hash = "sha256:a48685c58f50fe127722417bae83fe6badf500d54b55f7e39ffe43b798653925"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "camoufox"
|
||||
version = "0.4.11"
|
||||
requires_python = "<4.0,>=3.8"
|
||||
summary = "Wrapper around Playwright to help launch Camoufox"
|
||||
groups = ["default"]
|
||||
dependencies = [
|
||||
"browserforge<2.0.0,>=1.2.1",
|
||||
"click",
|
||||
"language-tags",
|
||||
"lxml",
|
||||
"numpy",
|
||||
"orjson",
|
||||
"platformdirs",
|
||||
"playwright",
|
||||
"pysocks",
|
||||
"pyyaml",
|
||||
"requests",
|
||||
"screeninfo",
|
||||
"tqdm",
|
||||
"typing-extensions",
|
||||
"ua-parser",
|
||||
]
|
||||
files = [
|
||||
{file = "camoufox-0.4.11-py3-none-any.whl", hash = "sha256:83864d434d159a7566990aa6524429a8d1a859cbf84d2f64ef4a9f29e7d2e5ff"},
|
||||
{file = "camoufox-0.4.11.tar.gz", hash = "sha256:0a2c9d24ac5070c104e7c2b125c0a3937f70efa416084ef88afe94c32a72eebe"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
version = "2025.1.31"
|
||||
@@ -688,6 +731,58 @@ files = [
|
||||
{file = "cssselect-1.3.0.tar.gz", hash = "sha256:57f8a99424cfab289a1b6a816a43075a4b00948c86b4dcf3ef4ee7e15f7ab0c7"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cython"
|
||||
version = "3.1.0"
|
||||
requires_python = ">=3.8"
|
||||
summary = "The Cython compiler for writing C extensions in the Python language."
|
||||
groups = ["default"]
|
||||
marker = "sys_platform == \"darwin\""
|
||||
files = [
|
||||
{file = "cython-3.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:335982ac0b71a75720b99b980570b9a8416fafd1989ccf4292c0f2e0e1902eac"},
|
||||
{file = "cython-3.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c9389b7941e333a1cc11074556adbf6a9f97ed3de141c1b45cc9f957cd7f7fa2"},
|
||||
{file = "cython-3.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:136c938f3c0fe91bea3eab32751b860ab7587285c5225436b76a98fe933c599a"},
|
||||
{file = "cython-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d722d311fee9f0dc80b17b8f9d1f46311be63b631b7aeed8530bf5f5e8849507"},
|
||||
{file = "cython-3.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:95eb189635a4542f1f8471bcf9756bffdac5294c41d4a4de935c77852d54e992"},
|
||||
{file = "cython-3.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:c063146c711751701ad662eefbdf5b396098d646f1239a2f5a6caea2d6707c5d"},
|
||||
{file = "cython-3.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:d78774a25c221fbda3855acbccb249989a04d334fb4ac8112ab5ffe4f1bcc65e"},
|
||||
{file = "cython-3.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:678e204230ece3205c17285383727b9e99097e7a80330fabb75cdd80d1f4c2ee"},
|
||||
{file = "cython-3.1.0-cp310-cp310-win32.whl", hash = "sha256:8029dffafa9ec5e83b6cc28f8b061f158132f2b1e800673716f7b9d9f85f2335"},
|
||||
{file = "cython-3.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:8dbefee67f3c9219cc9d2311e4ebf9f7b930e1db4b6eec2863df0c436e3c78d0"},
|
||||
{file = "cython-3.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c736405078dc376502617eb41c39e223ae176ebd1a4ddc18179d2517bc8c8658"},
|
||||
{file = "cython-3.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1215d3adb4e8691d03e712aed31206d21f387a8003d8de6a574ee75fe6d2e07c"},
|
||||
{file = "cython-3.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:522d4dae1fea71eee5c944fb7a8530de8bdd6df0ccb2bd001d0f75be228eac6c"},
|
||||
{file = "cython-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:462ad6142057e81715ada74e2d24b9a07bf36ae3da72bf973478b5c3e809c26d"},
|
||||
{file = "cython-3.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a8f00cdeb14f004ebeacf946e06bad2e3ed5776af96f5af95f92d822c4ba275f"},
|
||||
{file = "cython-3.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:37d62b8b8919126c75769e5470b288d76c83a1645e73c7aca4b7d7aecb3c1234"},
|
||||
{file = "cython-3.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:bea0b6bfde7493acb0529fc603abd4b3b13c3bb2fff7a889ae5a8d3ea7dc5a84"},
|
||||
{file = "cython-3.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:fe8c1db9ec03d9ef83e33c842c108e892577ade4c5f530c9435beced048e4698"},
|
||||
{file = "cython-3.1.0-cp311-cp311-win32.whl", hash = "sha256:5f6417d378bd11ca55f16e3c1c7c3bf6d7f0a0cc326c46e309fcba46c54ba4f1"},
|
||||
{file = "cython-3.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:dde3726aa5acbe879f849a09606b886491f950cfa993b435e50e9561fdf731c6"},
|
||||
{file = "cython-3.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:8f8c4753f6b926046c0cdf6037ba8560f6677730bf0ab9c1db4e0163b4bb30f9"},
|
||||
{file = "cython-3.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:db8e15c8eeee529468eab08528c9bf714a94354b34375be6c0c110f6012a4768"},
|
||||
{file = "cython-3.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a46b34defa672268474fbb5541f6297f45df9e4ecc4def6edd6fe1c44bfdb795"},
|
||||
{file = "cython-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8818446612461aca3978ebe8e3def817a120d91f85022540843ebe4f24818cd6"},
|
||||
{file = "cython-3.1.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe401e825b0fbeec75f8cc758c8cf32345c673bdb0edaf9585cd43b9d2798824"},
|
||||
{file = "cython-3.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c96908b302e87e99915b3b66481a976e32b864e95bf054dcd2cb859dffd8cb10"},
|
||||
{file = "cython-3.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:cdde5f25fdb8a5d50dbe5d418fe5bfb2260b1acdbd45b788e77b247e9adf2f56"},
|
||||
{file = "cython-3.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fe3320d13cde70fa8b1936e633b9e0fa68720cc61f97aa371d56d0f84fba3e02"},
|
||||
{file = "cython-3.1.0-cp312-cp312-win32.whl", hash = "sha256:d41d17d7cfcfbddf3b7dc0ceddb6361b8e749b0b3c5f8efa40c31c249127fa15"},
|
||||
{file = "cython-3.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:61eb67401bd6c977084fc789812bd40f96be500049adb2bab99921d696ae0c87"},
|
||||
{file = "cython-3.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:539828d14fbd95eff135e8dc9e93012f5b018657868f15a69cb475b8784efb9a"},
|
||||
{file = "cython-3.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:fd0003171ad84d4812fdb1eb9a4f678ed027e75fbc2b7bef5db482621b72137a"},
|
||||
{file = "cython-3.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4551f9ab91019b6b63cf8b16bf1abb519db67627c31162f604e370e596b8c60c"},
|
||||
{file = "cython-3.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c088ac33f4fa04b3589c4e5cfb8a81e9d9a990405409f9c8bfab0f5a9e8b724f"},
|
||||
{file = "cython-3.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8926651830ada313a04284e711c2cf11e4e800ca080e83012418208edd4334a2"},
|
||||
{file = "cython-3.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e03b3280c7ff99fae7b47327a4e2de7e117b069ce9183dc53774069c3e73d1c8"},
|
||||
{file = "cython-3.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:0605d364a2cc632c9269990777c2b266611724d1fccaa614fde335c2209b82da"},
|
||||
{file = "cython-3.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:856950b7c4282a713bcf4794aaae8f18d4a1ae177d3b63739604c91019ac4117"},
|
||||
{file = "cython-3.1.0-cp313-cp313-win32.whl", hash = "sha256:d6854c89d6c1ff472861376822a9df7a0c62b2be362147d313cf7f10bf230c69"},
|
||||
{file = "cython-3.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:9d6c88e8c86f2c582a2f9b460174ef86d9e01c8bfb12b8f7c44d697242285551"},
|
||||
{file = "cython-3.1.0-py3-none-any.whl", hash = "sha256:4e460bdf1d8742ddf4914959842f2f23ca4934df97f864be799ddf1912acd0ab"},
|
||||
{file = "cython-3.1.0.tar.gz", hash = "sha256:1097dd60d43ad0fff614a57524bfd531b35c13a907d13bee2cc2ec152e6bf4a1"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "decorator"
|
||||
version = "5.2.1"
|
||||
@@ -986,6 +1081,60 @@ files = [
|
||||
{file = "frozenlist-1.6.0.tar.gz", hash = "sha256:b99655c32c1c8e06d111e7f41c06c29a5318cb1835df23a45518e02a47c63b68"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "greenlet"
|
||||
version = "3.2.2"
|
||||
requires_python = ">=3.9"
|
||||
summary = "Lightweight in-process concurrent programming"
|
||||
groups = ["default"]
|
||||
files = [
|
||||
{file = "greenlet-3.2.2-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:c49e9f7c6f625507ed83a7485366b46cbe325717c60837f7244fc99ba16ba9d6"},
|
||||
{file = "greenlet-3.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3cc1a3ed00ecfea8932477f729a9f616ad7347a5e55d50929efa50a86cb7be7"},
|
||||
{file = "greenlet-3.2.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7c9896249fbef2c615853b890ee854f22c671560226c9221cfd27c995db97e5c"},
|
||||
{file = "greenlet-3.2.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7409796591d879425997a518138889d8d17e63ada7c99edc0d7a1c22007d4907"},
|
||||
{file = "greenlet-3.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7791dcb496ec53d60c7f1c78eaa156c21f402dda38542a00afc3e20cae0f480f"},
|
||||
{file = "greenlet-3.2.2-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d8009ae46259e31bc73dc183e402f548e980c96f33a6ef58cc2e7865db012e13"},
|
||||
{file = "greenlet-3.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:fd9fb7c941280e2c837b603850efc93c999ae58aae2b40765ed682a6907ebbc5"},
|
||||
{file = "greenlet-3.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:00cd814b8959b95a546e47e8d589610534cfb71f19802ea8a2ad99d95d702057"},
|
||||
{file = "greenlet-3.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:d0cb7d47199001de7658c213419358aa8937df767936506db0db7ce1a71f4a2f"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:dcb9cebbf3f62cb1e5afacae90761ccce0effb3adaa32339a0670fe7805d8068"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf3fc9145141250907730886b031681dfcc0de1c158f3cc51c092223c0f381ce"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:efcdfb9df109e8a3b475c016f60438fcd4be68cd13a365d42b35914cdab4bb2b"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bd139e4943547ce3a56ef4b8b1b9479f9e40bb47e72cc906f0f66b9d0d5cab3"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:71566302219b17ca354eb274dfd29b8da3c268e41b646f330e324e3967546a74"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3091bc45e6b0c73f225374fefa1536cd91b1e987377b12ef5b19129b07d93ebe"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:44671c29da26539a5f142257eaba5110f71887c24d40df3ac87f1117df589e0e"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c23ea227847c9dbe0b3910f5c0dd95658b607137614eb821e6cbaecd60d81cc6"},
|
||||
{file = "greenlet-3.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:0a16fb934fcabfdfacf21d79e6fed81809d8cd97bc1be9d9c89f0e4567143d7b"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:df4d1509efd4977e6a844ac96d8be0b9e5aa5d5c77aa27ca9f4d3f92d3fcf330"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da956d534a6d1b9841f95ad0f18ace637668f680b1339ca4dcfb2c1837880a0b"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9c7b15fb9b88d9ee07e076f5a683027bc3befd5bb5d25954bb633c385d8b737e"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:752f0e79785e11180ebd2e726c8a88109ded3e2301d40abced2543aa5d164275"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ae572c996ae4b5e122331e12bbb971ea49c08cc7c232d1bd43150800a2d6c65"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02f5972ff02c9cf615357c17ab713737cccfd0eaf69b951084a9fd43f39833d3"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4fefc7aa68b34b9224490dfda2e70ccf2131368493add64b4ef2d372955c207e"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a31ead8411a027c2c4759113cf2bd473690517494f3d6e4bf67064589afcd3c5"},
|
||||
{file = "greenlet-3.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:b24c7844c0a0afc3ccbeb0b807adeefb7eff2b5599229ecedddcfeb0ef333bec"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:3ab7194ee290302ca15449f601036007873028712e92ca15fc76597a0aeb4c59"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2dc5c43bb65ec3669452af0ab10729e8fdc17f87a1f2ad7ec65d4aaaefabf6bf"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:decb0658ec19e5c1f519faa9a160c0fc85a41a7e6654b3ce1b44b939f8bf1325"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fadd183186db360b61cb34e81117a096bff91c072929cd1b529eb20dd46e6c5"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1919cbdc1c53ef739c94cf2985056bcc0838c1f217b57647cbf4578576c63825"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3885f85b61798f4192d544aac7b25a04ece5fe2704670b4ab73c2d2c14ab740d"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:85f3e248507125bf4af607a26fd6cb8578776197bd4b66e35229cdf5acf1dfbf"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:1e76106b6fc55fa3d6fe1c527f95ee65e324a13b62e243f77b48317346559708"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313-win_amd64.whl", hash = "sha256:fe46d4f8e94e637634d54477b0cfabcf93c53f29eedcbdeecaf2af32029b4421"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba30e88607fb6990544d84caf3c706c4b48f629e18853fc6a646f82db9629418"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:055916fafad3e3388d27dd68517478933a97edc2fc54ae79d3bec827de2c64c4"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2593283bf81ca37d27d110956b79e8723f9aa50c4bcdc29d3c0543d4743d2763"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89c69e9a10670eb7a66b8cef6354c24671ba241f46152dd3eed447f79c29fb5b"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02a98600899ca1ca5d3a2590974c9e3ec259503b2d6ba6527605fcd74e08e207"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:b50a8c5c162469c3209e5ec92ee4f95c8231b11db6a04db09bbe338176723bb8"},
|
||||
{file = "greenlet-3.2.2-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:45f9f4853fb4cc46783085261c9ec4706628f3b57de3e68bae03e8f8b3c0de51"},
|
||||
{file = "greenlet-3.2.2-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:9ea5231428af34226c05f927e16fc7f6fa5e39e3ad3cd24ffa48ba53a47f4240"},
|
||||
{file = "greenlet-3.2.2.tar.gz", hash = "sha256:ad053d34421a2debba45aa3cc39acf454acbcd025b3fc1a9f8a0dee237abd485"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "h11"
|
||||
version = "0.16.0"
|
||||
@@ -1241,6 +1390,16 @@ files = [
|
||||
{file = "kaitaistruct-0.10.tar.gz", hash = "sha256:a044dee29173d6afbacf27bcac39daf89b654dd418cfa009ab82d9178a9ae52a"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "language-tags"
|
||||
version = "1.2.0"
|
||||
summary = "This project is a Python version of the language-tags Javascript project."
|
||||
groups = ["default"]
|
||||
files = [
|
||||
{file = "language_tags-1.2.0-py3-none-any.whl", hash = "sha256:d815604622242fdfbbfd747b40c31213617fd03734a267f2e39ee4bd73c88722"},
|
||||
{file = "language_tags-1.2.0.tar.gz", hash = "sha256:e934acba3e3dc85f867703eca421847a9ab7b7679b11b5d5cfd096febbf8bde6"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lxml"
|
||||
version = "5.4.0"
|
||||
@@ -1683,7 +1842,6 @@ version = "2.2.5"
|
||||
requires_python = ">=3.10"
|
||||
summary = "Fundamental package for array computing in Python"
|
||||
groups = ["default"]
|
||||
marker = "python_version <= \"3.11\" or python_version >= \"3.12\""
|
||||
files = [
|
||||
{file = "numpy-2.2.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1f4a922da1729f4c40932b2af4fe84909c7a6e167e6e99f71838ce3a29f3fe26"},
|
||||
{file = "numpy-2.2.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b6f91524d31b34f4a5fee24f5bc16dcd1491b668798b6d85585d836c1e633a6a"},
|
||||
@@ -1792,6 +1950,74 @@ files = [
|
||||
{file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "orjson"
|
||||
version = "3.10.18"
|
||||
requires_python = ">=3.9"
|
||||
summary = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy"
|
||||
groups = ["default"]
|
||||
files = [
|
||||
{file = "orjson-3.10.18-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:a45e5d68066b408e4bc383b6e4ef05e717c65219a9e1390abc6155a520cac402"},
|
||||
{file = "orjson-3.10.18-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be3b9b143e8b9db05368b13b04c84d37544ec85bb97237b3a923f076265ec89c"},
|
||||
{file = "orjson-3.10.18-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9b0aa09745e2c9b3bf779b096fa71d1cc2d801a604ef6dd79c8b1bfef52b2f92"},
|
||||
{file = "orjson-3.10.18-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:53a245c104d2792e65c8d225158f2b8262749ffe64bc7755b00024757d957a13"},
|
||||
{file = "orjson-3.10.18-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f9495ab2611b7f8a0a8a505bcb0f0cbdb5469caafe17b0e404c3c746f9900469"},
|
||||
{file = "orjson-3.10.18-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:73be1cbcebadeabdbc468f82b087df435843c809cd079a565fb16f0f3b23238f"},
|
||||
{file = "orjson-3.10.18-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe8936ee2679e38903df158037a2f1c108129dee218975122e37847fb1d4ac68"},
|
||||
{file = "orjson-3.10.18-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7115fcbc8525c74e4c2b608129bef740198e9a120ae46184dac7683191042056"},
|
||||
{file = "orjson-3.10.18-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:771474ad34c66bc4d1c01f645f150048030694ea5b2709b87d3bda273ffe505d"},
|
||||
{file = "orjson-3.10.18-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:7c14047dbbea52886dd87169f21939af5d55143dad22d10db6a7514f058156a8"},
|
||||
{file = "orjson-3.10.18-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:641481b73baec8db14fdf58f8967e52dc8bda1f2aba3aa5f5c1b07ed6df50b7f"},
|
||||
{file = "orjson-3.10.18-cp310-cp310-win32.whl", hash = "sha256:607eb3ae0909d47280c1fc657c4284c34b785bae371d007595633f4b1a2bbe06"},
|
||||
{file = "orjson-3.10.18-cp310-cp310-win_amd64.whl", hash = "sha256:8770432524ce0eca50b7efc2a9a5f486ee0113a5fbb4231526d414e6254eba92"},
|
||||
{file = "orjson-3.10.18-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:e0a183ac3b8e40471e8d843105da6fbe7c070faab023be3b08188ee3f85719b8"},
|
||||
{file = "orjson-3.10.18-cp311-cp311-macosx_15_0_arm64.whl", hash = "sha256:5ef7c164d9174362f85238d0cd4afdeeb89d9e523e4651add6a5d458d6f7d42d"},
|
||||
{file = "orjson-3.10.18-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afd14c5d99cdc7bf93f22b12ec3b294931518aa019e2a147e8aa2f31fd3240f7"},
|
||||
{file = "orjson-3.10.18-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7b672502323b6cd133c4af6b79e3bea36bad2d16bca6c1f645903fce83909a7a"},
|
||||
{file = "orjson-3.10.18-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:51f8c63be6e070ec894c629186b1c0fe798662b8687f3d9fdfa5e401c6bd7679"},
|
||||
{file = "orjson-3.10.18-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f9478ade5313d724e0495d167083c6f3be0dd2f1c9c8a38db9a9e912cdaf947"},
|
||||
{file = "orjson-3.10.18-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:187aefa562300a9d382b4b4eb9694806e5848b0cedf52037bb5c228c61bb66d4"},
|
||||
{file = "orjson-3.10.18-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9da552683bc9da222379c7a01779bddd0ad39dd699dd6300abaf43eadee38334"},
|
||||
{file = "orjson-3.10.18-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e450885f7b47a0231979d9c49b567ed1c4e9f69240804621be87c40bc9d3cf17"},
|
||||
{file = "orjson-3.10.18-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:5e3c9cc2ba324187cd06287ca24f65528f16dfc80add48dc99fa6c836bb3137e"},
|
||||
{file = "orjson-3.10.18-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:50ce016233ac4bfd843ac5471e232b865271d7d9d44cf9d33773bcd883ce442b"},
|
||||
{file = "orjson-3.10.18-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b3ceff74a8f7ffde0b2785ca749fc4e80e4315c0fd887561144059fb1c138aa7"},
|
||||
{file = "orjson-3.10.18-cp311-cp311-win32.whl", hash = "sha256:fdba703c722bd868c04702cac4cb8c6b8ff137af2623bc0ddb3b3e6a2c8996c1"},
|
||||
{file = "orjson-3.10.18-cp311-cp311-win_amd64.whl", hash = "sha256:c28082933c71ff4bc6ccc82a454a2bffcef6e1d7379756ca567c772e4fb3278a"},
|
||||
{file = "orjson-3.10.18-cp311-cp311-win_arm64.whl", hash = "sha256:a6c7c391beaedd3fa63206e5c2b7b554196f14debf1ec9deb54b5d279b1b46f5"},
|
||||
{file = "orjson-3.10.18-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:50c15557afb7f6d63bc6d6348e0337a880a04eaa9cd7c9d569bcb4e760a24753"},
|
||||
{file = "orjson-3.10.18-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:356b076f1662c9813d5fa56db7d63ccceef4c271b1fb3dd522aca291375fcf17"},
|
||||
{file = "orjson-3.10.18-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:559eb40a70a7494cd5beab2d73657262a74a2c59aff2068fdba8f0424ec5b39d"},
|
||||
{file = "orjson-3.10.18-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f3c29eb9a81e2fbc6fd7ddcfba3e101ba92eaff455b8d602bf7511088bbc0eae"},
|
||||
{file = "orjson-3.10.18-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6612787e5b0756a171c7d81ba245ef63a3533a637c335aa7fcb8e665f4a0966f"},
|
||||
{file = "orjson-3.10.18-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ac6bd7be0dcab5b702c9d43d25e70eb456dfd2e119d512447468f6405b4a69c"},
|
||||
{file = "orjson-3.10.18-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9f72f100cee8dde70100406d5c1abba515a7df926d4ed81e20a9730c062fe9ad"},
|
||||
{file = "orjson-3.10.18-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9dca85398d6d093dd41dc0983cbf54ab8e6afd1c547b6b8a311643917fbf4e0c"},
|
||||
{file = "orjson-3.10.18-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:22748de2a07fcc8781a70edb887abf801bb6142e6236123ff93d12d92db3d406"},
|
||||
{file = "orjson-3.10.18-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:3a83c9954a4107b9acd10291b7f12a6b29e35e8d43a414799906ea10e75438e6"},
|
||||
{file = "orjson-3.10.18-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:303565c67a6c7b1f194c94632a4a39918e067bd6176a48bec697393865ce4f06"},
|
||||
{file = "orjson-3.10.18-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:86314fdb5053a2f5a5d881f03fca0219bfdf832912aa88d18676a5175c6916b5"},
|
||||
{file = "orjson-3.10.18-cp312-cp312-win32.whl", hash = "sha256:187ec33bbec58c76dbd4066340067d9ece6e10067bb0cc074a21ae3300caa84e"},
|
||||
{file = "orjson-3.10.18-cp312-cp312-win_amd64.whl", hash = "sha256:f9f94cf6d3f9cd720d641f8399e390e7411487e493962213390d1ae45c7814fc"},
|
||||
{file = "orjson-3.10.18-cp312-cp312-win_arm64.whl", hash = "sha256:3d600be83fe4514944500fa8c2a0a77099025ec6482e8087d7659e891f23058a"},
|
||||
{file = "orjson-3.10.18-cp313-cp313-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:69c34b9441b863175cc6a01f2935de994025e773f814412030f269da4f7be147"},
|
||||
{file = "orjson-3.10.18-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:1ebeda919725f9dbdb269f59bc94f861afbe2a27dce5608cdba2d92772364d1c"},
|
||||
{file = "orjson-3.10.18-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5adf5f4eed520a4959d29ea80192fa626ab9a20b2ea13f8f6dc58644f6927103"},
|
||||
{file = "orjson-3.10.18-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7592bb48a214e18cd670974f289520f12b7aed1fa0b2e2616b8ed9e069e08595"},
|
||||
{file = "orjson-3.10.18-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f872bef9f042734110642b7a11937440797ace8c87527de25e0c53558b579ccc"},
|
||||
{file = "orjson-3.10.18-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0315317601149c244cb3ecef246ef5861a64824ccbcb8018d32c66a60a84ffbc"},
|
||||
{file = "orjson-3.10.18-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e0da26957e77e9e55a6c2ce2e7182a36a6f6b180ab7189315cb0995ec362e049"},
|
||||
{file = "orjson-3.10.18-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb70d489bc79b7519e5803e2cc4c72343c9dc1154258adf2f8925d0b60da7c58"},
|
||||
{file = "orjson-3.10.18-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e9e86a6af31b92299b00736c89caf63816f70a4001e750bda179e15564d7a034"},
|
||||
{file = "orjson-3.10.18-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:c382a5c0b5931a5fc5405053d36c1ce3fd561694738626c77ae0b1dfc0242ca1"},
|
||||
{file = "orjson-3.10.18-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8e4b2ae732431127171b875cb2668f883e1234711d3c147ffd69fe5be51a8012"},
|
||||
{file = "orjson-3.10.18-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2d808e34ddb24fc29a4d4041dcfafbae13e129c93509b847b14432717d94b44f"},
|
||||
{file = "orjson-3.10.18-cp313-cp313-win32.whl", hash = "sha256:ad8eacbb5d904d5591f27dee4031e2c1db43d559edb8f91778efd642d70e6bea"},
|
||||
{file = "orjson-3.10.18-cp313-cp313-win_amd64.whl", hash = "sha256:aed411bcb68bf62e85588f2a7e03a6082cc42e5a2796e06e72a962d7c6310b52"},
|
||||
{file = "orjson-3.10.18-cp313-cp313-win_arm64.whl", hash = "sha256:f54c1385a0e6aba2f15a40d703b858bedad36ded0491e55d35d905b2c34a4cc3"},
|
||||
{file = "orjson-3.10.18.tar.gz", hash = "sha256:e8da3947d92123eda795b68228cafe2724815621fe35e8e320a9e9593a4bcd53"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "outcome"
|
||||
version = "1.3.0.post0"
|
||||
@@ -1929,6 +2155,38 @@ files = [
|
||||
{file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "platformdirs"
|
||||
version = "4.3.8"
|
||||
requires_python = ">=3.9"
|
||||
summary = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
|
||||
groups = ["default"]
|
||||
files = [
|
||||
{file = "platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4"},
|
||||
{file = "platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "playwright"
|
||||
version = "1.52.0"
|
||||
requires_python = ">=3.9"
|
||||
summary = "A high-level API to automate web browsers"
|
||||
groups = ["default"]
|
||||
dependencies = [
|
||||
"greenlet<4.0.0,>=3.1.1",
|
||||
"pyee<14,>=13",
|
||||
]
|
||||
files = [
|
||||
{file = "playwright-1.52.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:19b2cb9d4794062008a635a99bd135b03ebb782d460f96534a91cb583f549512"},
|
||||
{file = "playwright-1.52.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:0797c0479cbdc99607412a3c486a3a2ec9ddc77ac461259fd2878c975bcbb94a"},
|
||||
{file = "playwright-1.52.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:7223960b7dd7ddeec1ba378c302d1d09733b8dac438f492e9854c85d3ca7144f"},
|
||||
{file = "playwright-1.52.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:d010124d24a321e0489a8c0d38a3971a7ca7656becea7656c9376bfea7f916d4"},
|
||||
{file = "playwright-1.52.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4173e453c43180acc60fd77ffe1ebee8d0efbfd9986c03267007b9c3845415af"},
|
||||
{file = "playwright-1.52.0-py3-none-win32.whl", hash = "sha256:cd0bdf92df99db6237a99f828e80a6a50db6180ef8d5352fc9495df2c92f9971"},
|
||||
{file = "playwright-1.52.0-py3-none-win_amd64.whl", hash = "sha256:dcbf75101eba3066b7521c6519de58721ea44379eb17a0dafa94f9f1b17f59e4"},
|
||||
{file = "playwright-1.52.0-py3-none-win_arm64.whl", hash = "sha256:9d0085b8de513de5fb50669f8e6677f0252ef95a9a1d2d23ccee9638e71e65cb"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pluggy"
|
||||
version = "1.5.0"
|
||||
@@ -2307,6 +2565,41 @@ files = [
|
||||
{file = "pymongo-4.12.0.tar.gz", hash = "sha256:d9f74a5cf3fccdb72211e33e07a6c05ac09cd0d7c99d21db5c2473fcfdd03152"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyobjc-core"
|
||||
version = "11.0"
|
||||
requires_python = ">=3.8"
|
||||
summary = "Python<->ObjC Interoperability Module"
|
||||
groups = ["default"]
|
||||
marker = "sys_platform == \"darwin\""
|
||||
files = [
|
||||
{file = "pyobjc_core-11.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:10866b3a734d47caf48e456eea0d4815c2c9b21856157db5917b61dee06893a1"},
|
||||
{file = "pyobjc_core-11.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:50675c0bb8696fe960a28466f9baf6943df2928a1fd85625d678fa2f428bd0bd"},
|
||||
{file = "pyobjc_core-11.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a03061d4955c62ddd7754224a80cdadfdf17b6b5f60df1d9169a3b1b02923f0b"},
|
||||
{file = "pyobjc_core-11.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c338c1deb7ab2e9436d4175d1127da2eeed4a1b564b3d83b9f3ae4844ba97e86"},
|
||||
{file = "pyobjc_core-11.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b4e9dc4296110f251a4033ff3f40320b35873ea7f876bd29a1c9705bb5e08c59"},
|
||||
{file = "pyobjc_core-11.0.tar.gz", hash = "sha256:63bced211cb8a8fb5c8ff46473603da30e51112861bd02c438fbbbc8578d9a70"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyobjc-framework-cocoa"
|
||||
version = "11.0"
|
||||
requires_python = ">=3.9"
|
||||
summary = "Wrappers for the Cocoa frameworks on macOS"
|
||||
groups = ["default"]
|
||||
marker = "sys_platform == \"darwin\""
|
||||
dependencies = [
|
||||
"pyobjc-core>=11.0",
|
||||
]
|
||||
files = [
|
||||
{file = "pyobjc_framework_Cocoa-11.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:fbc65f260d617d5463c7fb9dbaaffc23c9a4fabfe3b1a50b039b61870b8daefd"},
|
||||
{file = "pyobjc_framework_Cocoa-11.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3ea7be6e6dd801b297440de02d312ba3fa7fd3c322db747ae1cb237e975f5d33"},
|
||||
{file = "pyobjc_framework_Cocoa-11.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:280a577b83c68175a28b2b7138d1d2d3111f2b2b66c30e86f81a19c2b02eae71"},
|
||||
{file = "pyobjc_framework_Cocoa-11.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:15b2bd977ed340074f930f1330f03d42912d5882b697d78bd06f8ebe263ef92e"},
|
||||
{file = "pyobjc_framework_Cocoa-11.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:5750001db544e67f2b66f02067d8f0da96bb2ef71732bde104f01b8628f9d7ea"},
|
||||
{file = "pyobjc_framework_cocoa-11.0.tar.gz", hash = "sha256:00346a8cb81ad7b017b32ff7bf596000f9faa905807b1bd234644ebd47f692c5"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyopenssl"
|
||||
version = "25.0.0"
|
||||
@@ -2526,6 +2819,52 @@ files = [
|
||||
{file = "pywin32-310-cp313-cp313-win_arm64.whl", hash = "sha256:e308f831de771482b7cf692a1f308f8fca701b2d8f9dde6cc440c7da17e47b33"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyyaml"
|
||||
version = "6.0.2"
|
||||
requires_python = ">=3.8"
|
||||
summary = "YAML parser and emitter for Python"
|
||||
groups = ["default"]
|
||||
files = [
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"},
|
||||
{file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"},
|
||||
{file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"},
|
||||
{file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"},
|
||||
{file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "requests"
|
||||
version = "2.32.3"
|
||||
@@ -2605,9 +2944,25 @@ files = [
|
||||
{file = "s3transfer-0.12.0.tar.gz", hash = "sha256:8ac58bc1989a3fdb7c7f3ee0918a66b160d038a147c7b5db1500930a607e9a1c"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "screeninfo"
|
||||
version = "0.8.1"
|
||||
requires_python = ">=3.6.2,<4.0.0"
|
||||
summary = "Fetch location and size of physical screens."
|
||||
groups = ["default"]
|
||||
dependencies = [
|
||||
"Cython; sys_platform == \"darwin\"",
|
||||
"dataclasses; python_version < \"3.7\"",
|
||||
"pyobjc-framework-Cocoa; sys_platform == \"darwin\"",
|
||||
]
|
||||
files = [
|
||||
{file = "screeninfo-0.8.1-py3-none-any.whl", hash = "sha256:e97d6b173856edcfa3bd282f81deb528188aff14b11ec3e195584e7641be733c"},
|
||||
{file = "screeninfo-0.8.1.tar.gz", hash = "sha256:9983076bcc7e34402a1a9e4d7dabf3729411fd2abb3f3b4be7eba73519cd2ed1"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "selenium"
|
||||
version = "4.31.0"
|
||||
version = "4.32.0"
|
||||
requires_python = ">=3.9"
|
||||
summary = "Official Python bindings for Selenium WebDriver"
|
||||
groups = ["default"]
|
||||
@@ -2620,8 +2975,8 @@ dependencies = [
|
||||
"websocket-client~=1.8",
|
||||
]
|
||||
files = [
|
||||
{file = "selenium-4.31.0-py3-none-any.whl", hash = "sha256:7b8b8d5e424d7133cb7aa656263b19ac505ec26d65c0f921a696e7e2c5ccd95b"},
|
||||
{file = "selenium-4.31.0.tar.gz", hash = "sha256:441cffc436a2e6659fe3cfb012692435652efd38b0d368d16f661a5db47825f5"},
|
||||
{file = "selenium-4.32.0-py3-none-any.whl", hash = "sha256:c4d9613f8a45693d61530c9660560fadb52db7d730237bc788ddedf442391f97"},
|
||||
{file = "selenium-4.32.0.tar.gz", hash = "sha256:b9509bef4056f4083772abb1ae19ff57247d617a29255384b26be6956615b206"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2912,6 +3267,30 @@ files = [
|
||||
{file = "tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ua-parser"
|
||||
version = "1.0.1"
|
||||
requires_python = ">=3.9"
|
||||
summary = "Python port of Browserscope's user agent parser"
|
||||
groups = ["default"]
|
||||
dependencies = [
|
||||
"ua-parser-builtins",
|
||||
]
|
||||
files = [
|
||||
{file = "ua_parser-1.0.1-py3-none-any.whl", hash = "sha256:b059f2cb0935addea7e551251cbbf42e9a8872f86134163bc1a4f79e0945ffea"},
|
||||
{file = "ua_parser-1.0.1.tar.gz", hash = "sha256:f9d92bf19d4329019cef91707aecc23c6d65143ad7e29a233f0580fb0d15547d"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ua-parser-builtins"
|
||||
version = "0.18.0.post1"
|
||||
requires_python = ">=3.9"
|
||||
summary = "Precompiled rules for User Agent Parser"
|
||||
groups = ["default"]
|
||||
files = [
|
||||
{file = "ua_parser_builtins-0.18.0.post1-py3-none-any.whl", hash = "sha256:eb4f93504040c3a990a6b0742a2afd540d87d7f9f05fd66e94c101db1564674d"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "urllib3"
|
||||
version = "2.4.0"
|
||||
|
||||
@@ -16,7 +16,6 @@ dependencies = [
    "lxml-stubs>=0.5.1",
    "fake-useragent>=1.5.1",
    "requests-html>=0.10.0",
    "selenium>=4.22.0",
    "webdriver-manager>=4.0.1",
    "pydantic[email]>=2.9.2",
    "pandas>=2.2.2",
@@ -40,6 +39,8 @@ dependencies = [
    "python-multipart>=0.0.1",
    "bcrypt==4.0.1",
    "apscheduler>=3.11.0",
    "playwright>=1.52.0",
    "camoufox>=0.4.11",
]
requires-python = ">=3.10"
readme = "README.md"
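(The playwright and camoufox entries added above are what the scraping backend launches browsers with; below is a rough, standalone sketch of camoufox's documented sync API, included as an assumption for context rather than taken from this diff.)

# Sketch based on camoufox's documented sync API; not code from this repository.
from camoufox.sync_api import Camoufox

with Camoufox(headless=True) as browser:
    page = browser.new_page()
    page.goto("https://example.com")
    print(page.title())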
@@ -0,0 +1,45 @@
import { Box, Link, Typography } from "@mui/material";
import { SetStateAction, Dispatch, useState } from "react";
import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog";
import { RawJobOptions } from "@/types";

export type AdvancedJobOptionsProps = {
  jobOptions: RawJobOptions;
  setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
};

export const AdvancedJobOptions = ({
  jobOptions,
  setJobOptions,
}: AdvancedJobOptionsProps) => {
  const [open, setOpen] = useState(false);
  return (
    <Box sx={{ mb: 2 }}>
      <Link
        component="button"
        variant="body2"
        onClick={() => setOpen(true)}
        sx={{
          textDecoration: "none",
          color: "primary.main",
          "&:hover": {
            color: "primary.dark",
            textDecoration: "underline",
          },
          paddingLeft: 1,
          display: "inline-flex",
          alignItems: "center",
          gap: 0.5,
        }}
      >
        <Typography variant="body2">Advanced Job Options</Typography>
      </Link>
      <AdvancedJobOptionsDialog
        open={open}
        onClose={() => setOpen(false)}
        jobOptions={jobOptions}
        setJobOptions={setJobOptions}
      />
    </Box>
  );
};
@@ -0,0 +1,269 @@
import {
Accordion,
AccordionDetails,
AccordionSummary,
Box,
Checkbox,
Dialog,
DialogContent,
DialogTitle,
Divider,
FormControl,
FormControlLabel,
FormGroup,
IconButton,
TextField,
Tooltip,
Typography,
useTheme,
} from "@mui/material";
import {
ExpandMore as ExpandMoreIcon,
InfoOutlined,
Code as CodeIcon,
Settings,
} from "@mui/icons-material";
import { Dispatch, SetStateAction } from "react";
import { RawJobOptions } from "@/types";
import { ExpandedTableInput } from "../../expanded-table-input";

export type AdvancedJobOptionsDialogProps = {
open: boolean;
onClose: () => void;
jobOptions: RawJobOptions;
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
};

export const AdvancedJobOptionsDialog = ({
open,
onClose,
jobOptions,
setJobOptions,
}: AdvancedJobOptionsDialogProps) => {
const theme = useTheme();
const handleMultiPageScrapeChange = () => {
setJobOptions((prevJobOptions) => ({
...prevJobOptions,
multi_page_scrape: !prevJobOptions.multi_page_scrape,
}));
};

const handleProxiesChange = (e: React.ChangeEvent<HTMLInputElement>) => {
setJobOptions((prevJobOptions) => ({
...prevJobOptions,
proxies: e.target.value,
}));
};

const handleCollectMediaChange = () => {
setJobOptions((prevJobOptions) => ({
...prevJobOptions,
collect_media: !prevJobOptions.collect_media,
}));
};

return (
<Dialog
open={open}
onClose={onClose}
maxWidth="md"
fullWidth
PaperProps={{
sx: {
borderRadius: 2,
boxShadow: "0 8px 32px rgba(0, 0, 0, 0.1)",
},
}}
>
<DialogTitle
sx={{
borderBottom: `1px solid ${theme.palette.divider}`,
backgroundColor: theme.palette.background.default,
color: theme.palette.primary.contrastText,
borderRadius: 2,
display: "flex",
alignItems: "center",
justifyContent: "space-between",
padding: "1rem 2rem",
marginRight: 2,
marginLeft: 2,
}}
>
<Typography variant="h6" component="div">
Advanced Job Options
</Typography>
<Settings
sx={{
color: theme.palette.primary.contrastText,
}}
/>
</DialogTitle>

<DialogContent
sx={{ padding: 3, overflowY: "auto", marginTop: 2, height: "60rem" }}
>
<FormControl fullWidth>
<Box sx={{ mb: 3 }}>
<Typography
variant="subtitle1"
sx={{
mb: 1,
fontWeight: "bold",
color: theme.palette.text.primary,
}}
>
Collection Options
</Typography>
<Divider sx={{ mb: 2, backgroundColor: theme.palette.divider }} />

<FormGroup row sx={{ gap: 4, mb: 1 }}>
<FormControlLabel
control={
<Checkbox
checked={jobOptions.multi_page_scrape}
onChange={handleMultiPageScrapeChange}
/>
}
label={
<Box sx={{ display: "flex", alignItems: "center" }}>
<Typography>Multi Page Scrape</Typography>
<Tooltip title="Enable crawling through multiple pages">
<IconButton size="small">
<InfoOutlined fontSize="small" />
</IconButton>
</Tooltip>
</Box>
}
/>
<FormControlLabel
control={
<Checkbox
checked={jobOptions.collect_media}
onChange={handleCollectMediaChange}
/>
}
label={
<Box sx={{ display: "flex", alignItems: "center" }}>
<Typography>Collect Media</Typography>
<Tooltip title="Download images and other media">
<IconButton size="small">
<InfoOutlined fontSize="small" />
</IconButton>
</Tooltip>
</Box>
}
/>
</FormGroup>
</Box>

<Box sx={{ mb: 3 }}>
<Typography
variant="subtitle1"
sx={{
mb: 1,
fontWeight: "bold",
color: theme.palette.text.primary,
}}
>
Custom Options
</Typography>
<Divider sx={{ mb: 2, backgroundColor: theme.palette.divider }} />

{/* Proxies Section */}
<Accordion
defaultExpanded
elevation={0}
sx={{
mb: 2,
border: `1px solid ${theme.palette.divider}`,
"&:before": { display: "none" },
borderRadius: 1,
overflow: "hidden",
padding: 1,
}}
>
<AccordionSummary
expandIcon={<ExpandMoreIcon />}
sx={{
backgroundColor: theme.palette.background.paper,
borderBottom: `1px solid ${theme.palette.divider}`,
"&.Mui-expanded": {
borderBottom: `1px solid ${theme.palette.divider}`,
},
}}
>
<Box sx={{ display: "flex", alignItems: "center" }}>
<div
style={{
display: "flex",
alignItems: "center",
gap: "0.5rem",
}}
>
<Typography
sx={{
fontWeight: 500,
color: theme.palette.text.primary,
}}
>
Proxies
</Typography>

<Tooltip title="Comma separated list of proxies that should follow Playwright proxy format">
<InfoOutlined fontSize="small" />
</Tooltip>
</div>
</Box>
</AccordionSummary>
<AccordionDetails
sx={{ p: 2, backgroundColor: theme.palette.background.default }}
>
<TextField
placeholder='Proxies ([{"server": "proxy.example.com:8080", "username": "username", "password": "password"}])'
fullWidth
variant="outlined"
size="small"
value={jobOptions.proxies}
onChange={handleProxiesChange}
InputProps={{
startAdornment: (
<CodeIcon
sx={{ color: theme.palette.text.secondary, mr: 1 }}
/>
),
}}
/>
</AccordionDetails>
</Accordion>

{/* Custom Headers Section */}
<ExpandedTableInput
label="Custom Headers"
placeholder='{"User-Agent": "CustomAgent", "Accept": "*/*"}'
urlParam="custom_headers"
onChange={(value) => {
setJobOptions((prevJobOptions) => ({
...prevJobOptions,
custom_headers: value,
}));
}}
/>

{/* Custom Cookies Section */}
<ExpandedTableInput
label="Custom Cookies"
placeholder='[{"name": "value", "name2": "value2"}]'
urlParam="custom_cookies"
onChange={(value) => {
setJobOptions((prevJobOptions) => ({
...prevJobOptions,
custom_cookies: value,
}));
}}
/>
</Box>
</FormControl>
</DialogContent>
</Dialog>
);
};
@@ -0,0 +1 @@
export * from "./advanced-job-options-dialog";

1  src/components/common/advanced-job-options/index.ts  Normal file
@@ -0,0 +1 @@
export * from "./advanced-job-options";

165  src/components/common/csv-table/csv-table.tsx  Normal file
@@ -0,0 +1,165 @@
import React, { useState } from "react";
import {
Table,
TableBody,
TableCell,
TableContainer,
TableHead,
TableRow,
Paper,
Box,
Typography,
useTheme,
alpha,
} from "@mui/material";

export type CsvRow = {
[key: string]: string;
};

export type CsvTableProps = {
csv: {
rows: CsvRow[];
headers: string[];
};
className?: string;
};

export const CsvTable: React.FC<CsvTableProps> = ({ csv, className }) => {
const [expandedRow, setExpandedRow] = useState<number | null>(null);
const theme = useTheme();

const handleRowClick = (rowIndex: number) => {
setExpandedRow((prevRow) => (prevRow === rowIndex ? null : rowIndex));
};

return (
<Box
sx={{
height: "100%",
display: "flex",
flexDirection: "column",
overflow: "hidden",
width: "100%",
}}
className={className}
>
{csv.rows.length > 0 ? (
<TableContainer
sx={{
flex: 1,
overflow: "auto",
borderRadius: theme.shape.borderRadius,
boxShadow: theme.shadows[1],
}}
>
<Table stickyHeader size="small" aria-label="csv data table">
<TableHead>
<TableRow>
{csv.headers.map((header, idx) => (
<TableCell
key={idx}
sx={{
fontWeight: "bold",
cursor: "pointer",
whiteSpace: "nowrap",
backgroundColor: theme.palette.background.paper,
color: theme.palette.text.primary,
"&:hover": {
backgroundColor: alpha(theme.palette.primary.main, 0.1),
},
p: { xs: 1, sm: 2 },
}}
>
{header}
</TableCell>
))}
</TableRow>
</TableHead>
<TableBody>
{csv.rows.map((row, rowIndex) => (
<React.Fragment key={rowIndex}>
<TableRow
onClick={() => handleRowClick(rowIndex)}
sx={{
"&:nth-of-type(odd)": {
backgroundColor: alpha(
theme.palette.primary.main,
0.02
),
},
"&:hover": {
backgroundColor: alpha(
theme.palette.primary.main,
0.04
),
},
cursor: "pointer",
}}
>
{Object.values(row).map((col, colIndex) => (
<TableCell
key={colIndex}
sx={{
whiteSpace: "nowrap",
maxWidth: { xs: "150px", sm: "200px", md: "200px" },
overflow: "hidden",
textOverflow: "ellipsis",
p: { xs: 1, sm: 2 },
}}
>
{col}
</TableCell>
))}
</TableRow>

{expandedRow === rowIndex && (
<TableRow>
<TableCell
colSpan={csv.headers.length}
sx={{ padding: 2 }}
>
<Paper
sx={{
padding: 2,
backgroundColor: alpha(
theme.palette.background.paper,
0.5
),
}}
>
<Typography variant="body2" color="text.secondary">
{row.text
? row.text
.replace(/(\r\n|\n|\r)/g, " ")
.replace(/\t/g, " ")
: "No text available"}
</Typography>
</Paper>
</TableCell>
</TableRow>
)}
</React.Fragment>
))}
</TableBody>
</Table>
</TableContainer>
) : (
<Paper
sx={{
p: 4,
display: "flex",
justifyContent: "center",
alignItems: "center",
height: "100%",
borderRadius: theme.shape.borderRadius,
backgroundColor: alpha(theme.palette.background.paper, 0.5),
border: `1px dashed ${theme.palette.divider}`,
}}
>
<Typography color="text.secondary">No data available</Typography>
</Paper>
)}
</Box>
);
};
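For orientation, a minimal usage sketch of the CsvTable component added above. The data values here are made up for illustration; only the CsvTableProps shape (csv.headers plus csv.rows keyed by those headers) comes from the diff.

import { CsvTable } from "@/components/common/csv-table";

// Hypothetical example data; CsvTable only assumes headers and string-valued rows.
const exampleCsv = {
  headers: ["url", "text"],
  rows: [
    { url: "https://example.com", text: "Example Domain" },
    { url: "https://example.com/about", text: "About page" },
  ],
};

export const CsvTableExample = () => <CsvTable csv={exampleCsv} />;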
1  src/components/common/csv-table/index.ts  Normal file
@@ -0,0 +1 @@
export * from "./csv-table";
@@ -0,0 +1,204 @@
import {
Accordion,
AccordionSummary,
TableCell,
TableRow,
Paper,
TableBody,
useTheme,
TextField,
Box,
Typography,
AccordionDetails,
TableHead,
TableContainer,
Table,
} from "@mui/material";
import { useEffect, useState } from "react";
import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
import { parseJsonToEntries } from "@/lib/helpers/parse-json-to-entries";

export type ExpandedTableInputProps = {
label: string;
onChange: (value: any) => void;
placeholder: string;
urlParam: string;
};

export const ExpandedTableInput = ({
label,
onChange,
placeholder,
urlParam,
}: ExpandedTableInputProps) => {
const theme = useTheme();
const [value, setValue] = useState("");
const [parsedHeaders, setParsedHeaders] = useState<[string, string][] | null>(
null
);

const [jsonError, setJsonError] = useState<string | null>(null);

const urlParams = new URLSearchParams(window.location.search);

const validateAndParse = (val: string) => {
if (val.trim() === "") {
setParsedHeaders(null);
setJsonError(null);
return null;
}

try {
const parsed = JSON.parse(val);
const entries = parseJsonToEntries(val);

if (entries === null) {
setParsedHeaders(null);
setJsonError("Invalid JSON object");
return null;
} else {
setParsedHeaders(entries);
setJsonError(null);
return parsed;
}
} catch (e) {
setParsedHeaders(null);
setJsonError("Invalid JSON format");
return null;
}
};

const handleChange = (e: React.ChangeEvent<HTMLInputElement>) => {
const val = e.target.value;
setValue(val);
const parsed = validateAndParse(val);
onChange(parsed);
};

useEffect(() => {
const jobOptions = urlParams.get("job_options");

if (!jobOptions) {
setParsedHeaders(null);
setJsonError(null);
return;
}

const jobOptionsObject = JSON.parse(jobOptions || "{}");
let val = jobOptionsObject[urlParam];

if (val.length === 0 || Object.keys(val).length === 0) {
setParsedHeaders(null);
setJsonError(null);
return;
}

if (typeof val === "string") {
try {
val = JSON.parse(val);
} catch {}
}

const finalVal =
typeof val === "string" ? val : val != null ? JSON.stringify(val) : "";

setValue(finalVal);
const parsed = validateAndParse(finalVal);
onChange(parsed);
}, [urlParam]);

return (
<Accordion
defaultExpanded
elevation={0}
sx={{
mb: 2,
border: `1px solid ${theme.palette.divider}`,
"&:before": { display: "none" },
borderRadius: 1,
overflow: "hidden",
padding: 1,
}}
>
<AccordionSummary
expandIcon={<ExpandMoreIcon />}
sx={{
backgroundColor: theme.palette.background.paper,
borderBottom: `1px solid ${theme.palette.divider}`,
"&.Mui-expanded": {
borderBottom: `1px solid ${theme.palette.divider}`,
},
}}
>
<Box sx={{ display: "flex", alignItems: "center" }}>
<Typography
sx={{ fontWeight: 500, color: theme.palette.text.primary }}
>
{label}
</Typography>
</Box>
</AccordionSummary>
<AccordionDetails
sx={{ p: 2, backgroundColor: theme.palette.background.default }}
>
<TextField
placeholder={placeholder}
value={value}
onChange={handleChange}
fullWidth
variant="outlined"
size="small"
error={jsonError !== null}
helperText={jsonError ?? ""}
/>

{parsedHeaders && parsedHeaders.length > 0 && (
<Paper
variant="outlined"
sx={{
marginTop: 1,
border: `1px solid ${theme.palette.divider}`,
borderRadius: 1,
overflow: "hidden",
padding: 0,
}}
>
<TableContainer sx={{ maxHeight: 200 }}>
<Table size="small" stickyHeader>
<TableHead>
<TableRow
sx={{
backgroundColor: theme.palette.background.paper,
}}
>
<TableCell sx={{ fontWeight: "bold" }}>Header</TableCell>
<TableCell sx={{ fontWeight: "bold" }}>Value</TableCell>
</TableRow>
</TableHead>
<TableBody>
{parsedHeaders.map(([key, val]) => (
<TableRow
key={key}
hover
sx={{
"&:nth-of-type(odd)": {
backgroundColor:
theme.palette.mode === "light"
? "rgba(0, 0, 0, 0.02)"
: "rgba(255, 255, 255, 0.02)",
},
}}
>
<TableCell sx={{ fontWeight: 500 }}>{key}</TableCell>
<TableCell>{val}</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</TableContainer>
</Paper>
)}
</AccordionDetails>
</Accordion>
);
};
1  src/components/common/expanded-table-input/index.ts  Normal file
@@ -0,0 +1 @@
export * from "./expanded-table-input";

1  src/components/common/job-download-dialog/index.ts  Normal file
@@ -0,0 +1 @@
export * from "./job-download-dialog";
@@ -0,0 +1,95 @@
import {
Dialog,
DialogTitle,
DialogContent,
DialogActions,
Button,
FormControl,
RadioGroup,
FormControlLabel,
Radio,
FormLabel,
Typography,
Box,
} from "@mui/material";
import { useState } from "react";

export type JobDownloadDialogProps = {
open: boolean;
onClose: () => void;
ids: string[];
};

export const JobDownloadDialog = ({
open,
onClose,
ids,
}: JobDownloadDialogProps) => {
const [jobFormat, setJobFormat] = useState<string>("csv");
const handleDownload = async () => {
const response = await fetch("/api/download", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ data: { ids: ids, job_format: jobFormat } }),
});

if (response.ok) {
const blob = await response.blob();
const url = window.URL.createObjectURL(blob);
const a = document.createElement("a");
a.style.display = "none";
a.href = url;
a.download = `job_${ids[0]}.${jobFormat}`;
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
} else {
console.error("Failed to download the file.");
}
};

return (
<Dialog open={open} onClose={onClose}>
<DialogTitle>Download Job</DialogTitle>
<DialogContent>
<FormControl>
<Typography variant="body1">
You are about to download {ids.length} job(s). Please select the
format that you would like to download them in.
</Typography>
<br />
<Box
sx={{
display: "flex",
flexDirection: "column",
backgroundColor: "background.paper",
padding: 2,
border: "1px solid",
}}
>
<FormLabel>Format</FormLabel>
<hr style={{ width: "100%", margin: "10px 0" }} />
<RadioGroup
aria-labelledby="job-download-format-radio-buttons"
name="job-download-format-radio-buttons"
value={jobFormat}
onChange={(e) => setJobFormat(e.target.value)}
>
<FormControlLabel value="csv" control={<Radio />} label="CSV" />
<FormControlLabel
value="md"
control={<Radio />}
label="Markdown"
/>
</RadioGroup>
</Box>
<br />
<Button onClick={handleDownload} size="small">
Download
</Button>
</FormControl>
</DialogContent>
</Dialog>
);
};
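A minimal sketch of how a parent component might wire up the new JobDownloadDialog. The open-state handling and the example ids below are illustrative assumptions, not part of the diff; in the app the ids come from the jobs table selection.

import { useState } from "react";
import { JobDownloadDialog } from "@/components/common/job-download-dialog";

// Hypothetical parent component that opens the dialog for one selected job id.
export const DownloadButtonExample = () => {
  const [open, setOpen] = useState(false);
  const selectedIds = ["123e4567-e89b-12d3-a456-426614174000"]; // made-up id

  return (
    <>
      <button onClick={() => setOpen(true)}>Download selected</button>
      <JobDownloadDialog
        open={open}
        onClose={() => setOpen(false)}
        ids={selectedIds}
      />
    </>
  );
};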
@@ -17,7 +17,7 @@ const items = [
},
{
icon: <HttpIcon />,
text: "Previous Jobs",
text: "Jobs",
href: "/jobs",
},
{
@@ -30,11 +30,6 @@ const items = [
text: "Statistics",
href: "/statistics",
},
{
icon: <TerminalIcon />,
text: "View App Logs",
href: "/logs",
},
{
icon: <Schedule />,
text: "Cron Jobs",
@@ -38,6 +38,7 @@ interface Props {
onDownload: (job: string[]) => void;
onNavigate: (elements: Object[], url: string, options: any) => void;
onFavorite: (ids: string[], field: string, value: any) => void;
onJobClick: (job: Job) => void;
stateProps: stateProps;
}

@@ -48,6 +49,7 @@ export const JobQueue = ({
onDownload,
onNavigate,
onFavorite,
onJobClick,
}: Props) => {
const { selectedJobs, filteredJobs } = stateProps;
const router = useRouter();
@@ -106,7 +108,14 @@ export const JobQueue = ({
</Tooltip>
</TableCell>
<TableCell sx={{ maxWidth: 100, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.id}</Box>
<Box
sx={{
maxHeight: 100,
overflow: "auto",
}}
>
{row.id}
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 200, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.url}</Box>
@@ -117,41 +126,24 @@ export const JobQueue = ({
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto", padding: 0 }}>
<Accordion sx={{ margin: 0, padding: 0.5 }}>
<AccordionSummary
expandIcon={<ExpandMoreIcon />}
aria-controls="panel1a-content"
id="panel1a-header"
<Box
sx={{
maxHeight: 100,
overflow: "auto",
display: "flex",
alignItems: "center",
justifyContent: "center",
}}
>
<Button
sx={{
minHeight: 0,
"&.Mui-expanded": { minHeight: 0 },
fontSize: "0.875rem",
}}
onClick={() => onJobClick(row)}
>
<Box
sx={{
maxHeight: 150,
overflow: "auto",
width: "100%",
}}
>
<Typography sx={{ fontSize: "0.875rem" }}>
Show Result
</Typography>
</Box>
</AccordionSummary>
<AccordionDetails sx={{ padding: 1 }}>
<Box sx={{ maxHeight: 200, overflow: "auto" }}>
<Typography
sx={{
fontSize: "0.875rem",
whiteSpace: "pre-wrap",
}}
>
{JSON.stringify(row.result, null, 2)}
</Typography>
</Box>
</AccordionDetails>
</Accordion>
Show Result
</Button>
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
@@ -1,4 +1,4 @@
import React, { Dispatch, SetStateAction, useState } from "react";
import React, { SetStateAction, useState } from "react";
import {
IconButton,
Box,
@@ -18,8 +18,9 @@ import StarIcon from "@mui/icons-material/Star";
import { useRouter } from "next/router";
import { Favorites, JobQueue } from ".";
import { Job } from "../../types";
import { Constants } from "../../lib";
import Cookies from "js-cookie";
import { useSearchParams } from "next/navigation";
import { JobDownloadDialog } from "../common/job-download-dialog";

interface JobTableProps {
jobs: Job[];
@@ -38,36 +39,24 @@ const COLOR_MAP: ColorMap = {
};

export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
const searchParams = useSearchParams();
const search = searchParams.get("search");
const type = searchParams.get("type");

const [selectedJobs, setSelectedJobs] = useState<Set<string>>(new Set());
const [allSelected, setAllSelected] = useState(false);
const [searchQuery, setSearchQuery] = useState<string>("");
const [searchMode, setSearchMode] = useState<string>("url");
const [searchQuery, setSearchQuery] = useState<string>(search || "");
const [searchMode, setSearchMode] = useState<string>(type || "url");
const [favoriteView, setFavoriteView] = useState<boolean>(false);
const [jobDownloadDialogOpen, setJobDownloadDialogOpen] =
useState<boolean>(false);

const token = Cookies.get("token");
const router = useRouter();

const handleDownload = async (ids: string[]) => {
const response = await fetch("/api/download", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ data: { ids: ids } }),
});

if (response.ok) {
const blob = await response.blob();
const url = window.URL.createObjectURL(blob);
const a = document.createElement("a");
a.style.display = "none";
a.href = url;
a.download = `job_${ids[0]}.csv`;
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
} else {
console.error("Failed to download the file.");
}
const handleDownload = (ids: string[]) => {
setSelectedJobs(new Set(ids));
setJobDownloadDialogOpen(true);
};

const handleNavigate = (elements: Object[], url: string, options: any) => {
@@ -152,6 +141,10 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
});
};

const handleJobClick = (job: Job) => {
router.push(`/job/csv/${job.id}`);
};

return (
<Box
width="100%"
@@ -250,17 +243,23 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
onNavigate={handleNavigate}
onSelectJob={handleSelectJob}
onFavorite={favoriteJob}
></JobQueue>
onJobClick={handleJobClick}
/>
) : (
<Favorites
stateProps={{ selectedJobs, filteredJobs }}
onNavigate={handleNavigate}
onSelectJob={handleSelectJob}
onFavorite={favoriteJob}
></Favorites>
/>
)}
</Box>
</Box>
<JobDownloadDialog
open={jobDownloadDialogOpen}
onClose={() => setJobDownloadDialogOpen(false)}
ids={Array.from(selectedJobs)}
/>
</Box>
);
};
@@ -1 +0,0 @@
export * from "./log-container";
@@ -1,3 +0,0 @@
.logContainer {
max-width: none !important;
}
@@ -1,103 +0,0 @@
import React, { useState, useEffect, useRef } from "react";
import { Container, IconButton } from "@mui/material";
import { ArrowUpward, ArrowDownward } from "@mui/icons-material";
import { Constants } from "../../../lib/constants";

import classes from "./log-container.module.css";

interface LogContainerProps {
initialLogs: string;
}

export const LogContainer: React.FC<LogContainerProps> = ({ initialLogs }) => {
const [logs, setLogs] = useState<string>(initialLogs);
const logsContainerRef = useRef<HTMLDivElement | null>(null);

useEffect(() => {
const eventSource = new EventSource(`/api/logs`);

setLogs("");

eventSource.onmessage = (event) => {
setLogs((prevLogs) => prevLogs + event.data + "\n");

if (logsContainerRef.current) {
logsContainerRef.current.scrollTop =
logsContainerRef.current.scrollHeight;
}
};

eventSource.onopen = (e) => {
};

eventSource.onerror = (error) => {
console.error("EventSource failed:", error);
eventSource.close();
};

return () => {
eventSource.close();
};
}, []);

const scrollToTop = () => {
if (logsContainerRef.current) {
logsContainerRef.current.scrollTop = 0;
}
};

const scrollToBottom = () => {
if (logsContainerRef.current) {
logsContainerRef.current.scrollTop =
logsContainerRef.current.scrollHeight;
}
};
return (
<Container
sx={{
position: "relative",
backgroundColor: "black",
color: "white",
padding: "10px",
overflowY: "scroll",
whiteSpace: "pre-wrap",
overflowWrap: "normal",
maxHeight: "95vh",
}}
className={classes.logContainer}
ref={logsContainerRef}
>
<pre
style={{
whiteSpace: "pre-wrap",
wordWrap: "break-word",
margin: 0,
}}
>
{logs}
</pre>
<IconButton
sx={{
position: "fixed",
top: 20,
right: 20,
backgroundColor: "rgba(255, 255, 255, 0.1)",
}}
onClick={scrollToTop}
>
<ArrowUpward style={{ color: "white" }} />
</IconButton>
<IconButton
sx={{
position: "fixed",
bottom: 20,
right: 20,
backgroundColor: "rgba(255, 255, 255, 0.1)",
}}
onClick={scrollToBottom}
>
<ArrowDownward style={{ color: "white" }} />
</IconButton>
</Container>
);
};
@@ -10,6 +10,7 @@ import {
Button,
Box,
Typography,
useTheme,
} from "@mui/material";
import Cookies from "js-cookie";

@@ -27,6 +28,7 @@ export const CronJobs = ({
const [jobs, setJobs] = useState<Job[]>(initialJobs);
const [cronJobs, setCronJobs] = useState<CronJob[]>(initialCronJobs);
const [user, setUser] = useState<any>(initialUser);
const theme = useTheme();

useEffect(() => {
setJobs(initialJobs);
@@ -55,10 +57,28 @@

if (!user) {
return (
<Box>
<Typography variant="h6">
<Box
sx={{
display: "flex",
justifyContent: "center",
alignItems: "center",
height: "100%",
borderRadius: "8px",
border:
theme.palette.mode === "light" ? "solid white" : "solid #4b5057",
boxShadow: "0 4px 8px rgba(0, 0, 0, 0.1)",
}}
>
<h4
style={{
color: "#fff",
padding: "20px",
borderRadius: "8px",
background: "rgba(0, 0, 0, 0.6)",
}}
>
Please login to view your cron jobs
</Typography>
</h4>
</Box>
);
}
35  src/components/pages/job/csv/id/get-server-side-props.ts  Normal file
@@ -0,0 +1,35 @@
import { GetServerSideProps } from "next";
import { parseCookies } from "nookies";

export const getServerSideProps: GetServerSideProps = async (context) => {
const { req, params } = context;
const id = params?.id;

const cookies = parseCookies({ req });
const token = cookies.token;
let csv = null;

try {
const csvResponse = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/api/job/${id}/convert-to-csv`,
{

method: "GET",
headers: {
"content-type": "application/json",
Authorization: `Bearer ${token}`,
},
}
);

csv = await csvResponse.json();
} catch (error) {
console.error("Error fetching job:", error);
}

return {
props: {
csv,
},
};
};

10  src/components/pages/job/csv/id/id.tsx  Normal file
@@ -0,0 +1,10 @@
import { CsvRow, CsvTable } from "@/components/common/csv-table/csv-table";

export type Csv = {
rows: CsvRow[];
headers: string[];
};

export const JobCsvId = ({ csv }: { csv: Csv }) => {
return <CsvTable csv={csv} />;
};

1  src/components/pages/job/csv/id/index.ts  Normal file
@@ -0,0 +1 @@
export * from "./id";
@@ -34,7 +34,7 @@ export const JobSubmitterInput = ({
size="small"
onClick={handleSubmit}
disabled={!(rows.length > 0) || loading}
className={`bg-gradient-to-r from-[#034efc] to-gray-500 text-white font-semibold rounded-md
className={`bg-[#034efc] text-white font-semibold rounded-md
transition-transform transform hover:scale-105 disabled:opacity-50`}
>
{loading ? <CircularProgress size={24} color="inherit" /> : "Submit"}
@@ -10,12 +10,14 @@ import { JobSubmitterInput } from "./job-submitter-input";
import { JobSubmitterOptions } from "./job-submitter-options";
import { ApiService } from "@/services";
import { useJobSubmitterProvider } from "./provider";
import { AdvancedJobOptions } from "@/components/common/advanced-job-options";

const initialJobOptions: RawJobOptions = {
multi_page_scrape: false,
custom_headers: null,
proxies: null,
collect_media: false,
custom_cookies: null,
};

export const JobSubmitter = () => {
@@ -38,12 +40,8 @@ export const JobSubmitter = () => {
const [loading, setLoading] = useState<boolean>(false);
const [jobOptions, setJobOptions] =
useState<RawJobOptions>(initialJobOptions);
const [customJSONSelected, setCustomJSONSelected] = useState<boolean>(false);
const [proxiesSelected, setProxiesSelected] = useState<boolean>(false);

const handleSelectProxies = () => {
setProxiesSelected(!proxiesSelected);
};
console.log(jobOptions);

const handleSubmit = async () => {
if (!validateURL(submittedURL)) {
@@ -57,12 +55,13 @@ export const JobSubmitter = () => {
setLoading(true);

let customHeaders;
let customCookies;

try {
customHeaders = jobOptions.custom_headers
? JSON.parse(jobOptions.custom_headers)
: null;
} catch (error) {
customHeaders = jobOptions.custom_headers || null;
customCookies = jobOptions.custom_cookies || null;
} catch (error: any) {
console.error(error);
setSnackbarMessage("Invalid JSON in custom headers.");
setSnackbarOpen(true);
setSnackbarSeverity("error");
@@ -76,6 +75,7 @@ export const JobSubmitter = () => {
user,
jobOptions,
customHeaders,
customCookies,
siteMap
)
.then(async (response) => {
@@ -102,16 +102,9 @@ export const JobSubmitter = () => {
.finally(() => setLoading(false));
};

// Parse the job options from the query string
useEffect(() => {
if (job_options) {
parseJobOptions(
job_options as string,
setCustomJSONSelected,
setProxiesSelected,
setJobOptions,
setSiteMap
);
parseJobOptions(job_options as string, setJobOptions, setSiteMap);
}
}, [job_options]);

@@ -123,13 +116,9 @@ export const JobSubmitter = () => {
handleSubmit={handleSubmit}
loading={loading}
/>
<JobSubmitterOptions
<AdvancedJobOptions
jobOptions={jobOptions}
setJobOptions={setJobOptions}
customJSONSelected={customJSONSelected}
setCustomJSONSelected={setCustomJSONSelected}
handleSelectProxies={handleSelectProxies}
proxiesSelected={proxiesSelected}
/>
</div>
);
@@ -42,12 +42,12 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
params.append("username", email);
params.append("password", password);
const response = await axios.post(`/api/token`, params);
const isSecure = window.location.protocol === "https:";

Cookies.set("token", response.data.access_token, {
expires: 7,
path: "/",
domain: "localhost",
secure: false,
secure: isSecure,
sameSite: "Lax",
});
@@ -4,10 +4,8 @@ import { RawJobOptions, SiteMap } from "@/types";

export const parseJobOptions = (
job_options: string,
setCustomJSONSelected: Dispatch<SetStateAction<boolean>>,
setProxiesSelected: Dispatch<SetStateAction<boolean>>,
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>,
setSiteMap: Dispatch<SetStateAction<any>>
setSiteMap: Dispatch<SetStateAction<SiteMap | null>>
) => {
if (job_options) {
const jsonOptions = JSON.parse(job_options as string);
@@ -16,20 +14,23 @@
custom_headers: null,
proxies: null,
collect_media: false,
custom_cookies: null,
};

if (
jsonOptions.custom_headers &&
Object.keys(jsonOptions.custom_headers).length
) {
setCustomJSONSelected(true);
newJobOptions.custom_headers = JSON.stringify(jsonOptions.custom_headers);
newJobOptions.custom_headers = jsonOptions.custom_headers;
}

if (jsonOptions.custom_cookies && jsonOptions.custom_cookies.length > 0) {
newJobOptions.custom_cookies = jsonOptions.custom_cookies;
}

newJobOptions.multi_page_scrape = jsonOptions.multi_page_scrape;

if (jsonOptions.proxies) {
setProxiesSelected(true);
if (jsonOptions.proxies.length > 0) {
newJobOptions.proxies = jsonOptions.proxies.join(",");
}
37  src/lib/helpers/parse-json-to-entries.ts  Normal file
@@ -0,0 +1,37 @@
export const parseJsonToEntries = (json: string): [string, string][] | null => {
try {
const parsed = JSON.parse(json);

if (Array.isArray(parsed)) {
if (
parsed.length > 0 &&
Array.isArray(parsed[0]) &&
parsed[0].length === 2 &&
typeof parsed[0][0] === "string"
) {
// Already array of [key, val] tuples
// Just ensure values are strings
return parsed.map(([k, v]) => [k, String(v)]);
}

// Array of objects
const allEntries: [string, string][] = [];
for (const item of parsed) {
if (typeof item === "object" && item !== null) {
allEntries.push(
// @ts-ignore
...Object.entries(item).map(([k, v]) => [k, String(v)])
);
} else {
return null;
}
}
return allEntries.length > 0 ? allEntries : null;
} else if (typeof parsed === "object" && parsed !== null) {
return Object.entries(parsed).map(([k, v]) => [k, String(v)]);
}
return null;
} catch {
return null;
}
};
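To make the helper's contract concrete, a few hedged examples of what parseJsonToEntries returns for different inputs; the literal values are made up, but the behavior follows the implementation above.

import { parseJsonToEntries } from "@/lib/helpers/parse-json-to-entries";

// A plain object yields one [key, value] entry per property, with values stringified.
parseJsonToEntries('{"User-Agent": "CustomAgent", "Accept": "*/*"}');
// => [["User-Agent", "CustomAgent"], ["Accept", "*/*"]]

// An array of objects is flattened into a single entry list.
parseJsonToEntries('[{"name": "value"}, {"name2": "value2"}]');
// => [["name", "value"], ["name2", "value2"]]

// Invalid JSON, empty arrays, or arrays of non-objects yield null.
parseJsonToEntries("not json"); // => null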
2  src/pages/job/csv/[id].tsx  Normal file
@@ -0,0 +1,2 @@
export { JobCsvId as default } from "@/components/pages/job/csv/id";
export { getServerSideProps } from "@/components/pages/job/csv/id/get-server-side-props";
@@ -1,38 +0,0 @@
import { LogContainer } from "../components/logs/log-container";

interface logs {
logs: string;
}

export async function getStaticProps() {
try {
const response = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/initial_logs`
);
const logJson: logs = await response.json();
const initialLogs = logJson.logs;

return {
props: {
initialLogs,
},
};
} catch (error) {
console.error("Error fetching logs:", error);
return {
props: {
initialLogs: "Failed to fetch logs.",
},
};
}
}

interface LogProps {
initialLogs: string;
}

const Logs = ({ initialLogs }: LogProps) => {
  return <LogContainer initialLogs={initialLogs} />;
};

export default Logs;
@@ -92,11 +92,10 @@ const Statistics: React.FC<StatProps> = ({ averageElement, averageJob }) => {
try {
const response = await fetch("/api/get-average-jobs-per-day", {
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${token}`,
},
}
);
"Content-Type": "application/json",
Authorization: `Bearer ${token}`,
},
});
const data = await response.json();
setJobsData(data);
} catch (error) {
@@ -251,10 +250,16 @@ const Statistics: React.FC<StatProps> = ({ averageElement, averageJob }) => {
) : (
<Box
bgcolor="background.default"
minHeight="100vh"
display="flex"
height="100%"
justifyContent="center"
alignItems="center"
sx={{
borderRadius: "8px",
border:
theme.palette.mode === "light" ? "solid white" : "solid #4b5057",
boxShadow: "0 4px 8px rgba(0, 0, 0, 0.1)",
}}
>
<h4
style={{
@@ -262,7 +267,6 @@ const Statistics: React.FC<StatProps> = ({ averageElement, averageJob }) => {
padding: "20px",
borderRadius: "8px",
background: "rgba(0, 0, 0, 0.6)",
boxShadow: "0 4px 8px rgba(0, 0, 0, 0.2)",
}}
>
Statistics for jobs not viewable unless logged in.
@@ -6,6 +6,7 @@ export const submitJob = async (
user: any,
jobOptions: any,
customHeaders: any,
customCookies: any,
siteMap: SiteMap | null
) => {
return await fetch(`/api/submit-scrape-job`, {
@@ -23,6 +24,7 @@
custom_headers: customHeaders || {},
proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
site_map: siteMap,
custom_cookies: customCookies || [],
},
},
}),
@@ -5,6 +5,8 @@
:root {
--delete-red: #ef4444;
--delete-red-hover: #ff6969;
--primary-blue: #007bff;
--primary-gray: #f8f9fa;
}

#__next {
@@ -20,3 +22,22 @@ body {
.MuiPopover-paper {
padding: 0 !important;
}

::-webkit-scrollbar {
width: 8px;
height: 8px;
}

::-webkit-scrollbar-track {
background-color: rgba(0, 0, 0, 0.05);
border-radius: 8px;
}

::-webkit-scrollbar-thumb {
background-color: rgba(0, 0, 0, 0.2);
border-radius: 8px;
}

::-webkit-scrollbar-thumb:hover {
background-color: rgba(0, 0, 0, 0.3);
}
@@ -70,6 +70,16 @@ const commonThemeOptions = {
},
},
},
MuiCheckbox: {
styleOverrides: {
colorPrimary: {
color: "#1976d2",
"&.Mui-checked": {
color: "#034efc",
},
},
},
},
MuiPaper: {
styleOverrides: {
root: {
@@ -85,6 +95,7 @@ const lightTheme = createTheme({
mode: "light",
primary: {
main: "#1976d2",
contrastText: "#000000",
},
secondary: {
main: "#dc004e",
@@ -139,6 +150,7 @@ const darkTheme = createTheme({
mode: "dark",
primary: {
main: "#90caf9",
contrastText: "#fff",
},
secondary: {
main: "#f48fb1",
@@ -24,6 +24,7 @@ export type RawJobOptions = {
custom_headers: string | null;
proxies: string | null;
collect_media: boolean;
custom_cookies: string | null;
};

export type ActionOption = "click" | "input";
@@ -3,7 +3,7 @@ nodaemon=true

[program:api]
command=pdm run python -m uvicorn api.backend.app:app --reload --host 0.0.0.0 --port 8000
directory=/project
directory=/project/app
autostart=true
autorestart=true
stdout_logfile=/dev/stdout
@@ -13,7 +13,7 @@ stderr_logfile_maxbytes=0

[program:worker]
command=pdm run python -m api.backend.worker.job_worker
directory=/project
directory=/project/app
autostart=true
autorestart=true
stdout_logfile=/dev/stdout