mirror of https://github.com/jaypyles/Scraperr.git
synced 2025-11-27 11:33:25 +00:00

Compare commits

7 Commits
| SHA1 |
|---|
| 6b33723cac |
| 5c89e4d7d2 |
| ed0828a585 |
| 1b8c8c779a |
| 267cc73657 |
| 92ff16d9c3 |
| 8b2e5dc9c3 |
.github/actions/push-to-helm/action.yaml (vendored, new file)

@@ -0,0 +1,50 @@
name: Publish Helm Chart
description: Publish a Helm chart to a target repository

inputs:
  app-repo-token:
    required: true
    description: "The token for the target repository"

runs:
  using: 'composite'
  steps:
    - name: Checkout app repo
      uses: actions/checkout@v4

    - name: Set up Helm
      uses: azure/setup-helm@v3

    - name: Package Helm chart
      run: |
        mkdir -p packaged
        helm package helm -d packaged
      shell: bash

    - name: Clone target Helm repo
      run: |
        git clone https://github.com/jaypyles/helm.git target-repo
        cd target-repo
        git config user.name "github-actions"
        git config user.email "github-actions@github.com"
        git fetch origin gh-pages # Fetch gh-pages explicitly
        git checkout gh-pages # Checkout gh-pages branch
        git pull origin gh-pages # Pull latest changes from gh-pages
      shell: bash

    - name: Copy package and update index
      run: |
        APP_NAME="scraperr"
        mkdir -p target-repo/charts/$APP_NAME
        cp packaged/*.tgz target-repo/charts/$APP_NAME/
        cd target-repo/charts/$APP_NAME
        helm repo index . --url https://jaypyles.github.io/helm/charts/$APP_NAME
      shell: bash

    - name: Commit and push to target repo
      run: |
        cd target-repo
        git add charts/
        git commit -m "Update $APP_NAME chart $(date +'%Y-%m-%d %H:%M:%S')" || echo "No changes"
        git push https://x-access-token:${{ inputs.app-repo-token }}@github.com/jaypyles/helm.git gh-pages
      shell: bash
.github/workflows/docker-image.yml (vendored)

@@ -8,12 +8,18 @@ on:
 
 jobs:
   build:
-    if: ${{ github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.head_branch == 'master' }}
+    if: ${{ github.event.workflow_run.conclusion == 'success' && github.ref == 'refs/heads/master' }}
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
         uses: actions/checkout@v4
 
+      - name: Get version from helm chart
+        run: |
+          VERSION=$(grep "version:" ./helm/Chart.yaml | cut -d: -f2 | tr -d ' ')
+          echo "VERSION=$VERSION" >> $GITHUB_ENV
+          echo "Version is $VERSION"
+
       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
@@ -29,7 +35,9 @@ jobs:
           context: .
           file: ./docker/frontend/Dockerfile
           push: true
-          tags: ${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:latest
+          tags: |
+            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:latest
+            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:${{ env.VERSION }}
 
       - name: Build and push api
         uses: docker/build-push-action@v5
@@ -37,12 +45,27 @@ jobs:
           context: .
           file: ./docker/api/Dockerfile
           push: true
-          tags: ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest
+          tags: |
+            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest
+            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:${{ env.VERSION }}
+
+  push-helm-chart:
+    runs-on: ubuntu-latest
+    needs:
+      - build
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Push Helm Chart
+        uses: ./.github/actions/push-to-helm
+        with:
+          app-repo-token: ${{ secrets.GPAT_TOKEN }}
+
   success-message:
     runs-on: ubuntu-latest
     needs:
       - build
+      - push-helm-chart
     steps:
       - name: Send Discord Message
         uses: jaypyles/discord-webhook-action@v1.0.0
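The new "Get version from helm chart" step shells out with grep/cut/tr. A rough Python equivalent of what it extracts (an illustration, not repo code; assumes the Chart.yaml added later in this diff, and is slightly stricter than grep since it matches only at line start):

```python
# Rough equivalent of: grep "version:" ./helm/Chart.yaml | cut -d: -f2 | tr -d ' '
def chart_version(path: str = "helm/Chart.yaml") -> str:
    with open(path) as f:
        for line in f:
            if line.startswith("version:"):
                return line.split(":", 1)[1].strip()
    raise ValueError("no chart version found")

print(chart_version())  # e.g. "1.0.14"
```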
.prettierignore (new file)

@@ -0,0 +1,2 @@
*.yaml
*.yml
README.md

@@ -34,10 +34,16 @@ Scraperr enables you to extract data from websites with precision using XPath se
 
 ## 🚀 Getting Started
 
+### Docker
+
 ```bash
 make up
 ```
+
+### Helm
+
+> Refer to the docs for helm deployment: https://scraperr-docs.pages.dev/guides/helm-deployment
 
 ## ⚖️ Legal and Ethical Guidelines
 
 When using Scraperr, please remember to:
@@ -48,6 +54,12 @@ When using Scraperr, please remember to:
 
 > **Disclaimer**: Scraperr is intended for use only on websites that explicitly permit scraping. The creator accepts no responsibility for misuse of this tool.
+
+## 💬 Join the Community
+
+Get support, report bugs, and chat with other users and contributors.
+
+👉 [Join the Scraperr Discord](https://discord.gg/89q7scsGEK)
 
 ## 📄 License
 
 This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
@@ -13,3 +13,4 @@ class JobOptions(BaseModel):
     proxies: list[str] = []
     site_map: Optional[SiteMap] = None
     collect_media: bool = False
+    custom_cookies: list[dict[str, Any]] = []
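For context, the new field accepts a list of cookie dicts and defaults to empty. A minimal sketch of constructing the model (the import path is inferred from the hunk context, not confirmed by this diff; the surrounding fields shown all have defaults):

```python
# Sketch only: hypothetical module path inferred from the diff.
from api.backend.job.models.job_options import JobOptions

options = JobOptions(
    custom_cookies=[{"name": "session", "value": "abc123"}],
)
print(options.custom_cookies)  # [{'name': 'session', 'value': 'abc123'}]
```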
api/backend/job/scraping/add_custom.py (new file)

@@ -0,0 +1,48 @@
from typing import Any, Optional
from urllib.parse import urlparse

from playwright.async_api import Page, BrowserContext

import logging

LOG = logging.getLogger(__name__)


async def add_custom_cookies(
    custom_cookies: list[dict[str, Any]],
    url: str,
    context: BrowserContext,
) -> None:
    parsed_url = urlparse(url)
    domain = parsed_url.netloc

    for cookie in custom_cookies:
        cookie_dict = {
            "name": cookie.get("name", "default_name"),
            "value": cookie.get("value", "default_value"),
            "domain": domain,
            "path": "/",
        }

        LOG.info(f"Adding cookie: {cookie_dict}")
        await context.add_cookies([cookie_dict])  # type: ignore


async def add_custom_headers(
    custom_headers: dict[str, Any],
    page: Page,
) -> None:
    await page.set_extra_http_headers(custom_headers)


async def add_custom_items(
    url: str,
    page: Page,
    cookies: Optional[list[dict[str, Any]]] = None,
    headers: Optional[dict[str, Any]] = None,
) -> None:
    if cookies:
        await add_custom_cookies(cookies, url, page.context)

    if headers:
        await add_custom_headers(headers, page)
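A minimal usage sketch (not from the repo) showing how `add_custom_items` is meant to be called on a fresh Playwright page before navigation; cookies are registered on the browser context scoped to the target URL's domain, headers on the page:

```python
import asyncio

from playwright.async_api import async_playwright

from api.backend.job.scraping.add_custom import add_custom_items


async def main() -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()

        # Cookies land on page.context for the URL's domain; headers apply
        # to every subsequent request made by this page.
        await add_custom_items(
            url="http://example.com",
            page=page,
            cookies=[{"name": "session", "value": "abc123"}],
            headers={"User-Agent": "scraperr-test"},
        )

        await page.goto("http://example.com")
        await browser.close()


asyncio.run(main())
```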
api/backend/job/utils/stream_md_from_job_results.py (new file)

@@ -0,0 +1,24 @@
from typing import Any

from api.backend.utils import clean_text


def stream_md_from_job_results(jobs: list[dict[str, Any]]):
    md = "# Job Results Summary\n\n"
    for i, job in enumerate(jobs, start=1):
        md += f"## Job #{i}\n"
        yield f"- **Job URL:** {job.get('url', 'N/A')}\n"
        yield f"- **Timestamp:** {job.get('time_created', 'N/A')}\n"
        yield f"- **ID:** {job.get('id', 'N/A')}\n"
        yield "### Extracted Results:\n"

        for res in job.get("result", []):
            for url, elements in res.items():
                yield f"\n#### URL: {url}\n"
                for element_name, values in elements.items():
                    for value in values:
                        text = clean_text(value.get("text", "")).strip()
                        if text:
                            yield f"- **Element:** `{element_name}`\n"
                            yield f"  - **Text:** {text}\n"
        yield "\n---\n"
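One quirk worth noting: the `md` accumulator (including the "# Job Results Summary" header) is built up but never yielded, so only the yielded lines reach the client. A small consumption sketch (job shape inferred from the fields the generator reads; not repo test code):

```python
jobs = [
    {
        "id": "job-1",
        "url": "http://example.com",
        "time_created": "2025-01-01T00:00:00Z",
        "result": [
            {"http://example.com": {"title": [{"text": "Example Domain"}]}}
        ],
    }
]

# Prints "- **Job URL:** http://example.com" and so on, one chunk at a time.
for chunk in stream_md_from_job_results(jobs):
    print(chunk, end="")
```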
@@ -1,5 +1,5 @@
 # STL
-from typing import Any, Optional, Union
+from typing import Any, Literal, Optional, Union
 from datetime import datetime
 
 # LOCAL
@@ -27,6 +27,7 @@ class RetrieveScrapeJobs(pydantic.BaseModel):
 
 class DownloadJob(pydantic.BaseModel):
     ids: list[str]
+    job_format: Literal["csv", "md"]
 
 
 class DeleteScrapeJobs(pydantic.BaseModel):
@@ -40,6 +40,7 @@ from api.backend.job.cron_scheduling.cron_scheduling import (
 )
 
 from api.backend.job.utils.clean_job_format import clean_job_format
+from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results
 
 LOG = logging.getLogger(__name__)
 
@@ -106,10 +107,19 @@ async def download(download_job: DownloadJob):
         )
         results = query(job_query, tuple(download_job.ids))
 
+        if download_job.job_format == "csv":
             csv_buffer = StringIO()
             csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
 
-            headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]
+            headers = [
+                "id",
+                "url",
+                "element_name",
+                "xpath",
+                "text",
+                "user",
+                "time_created",
+            ]
             csv_writer.writerow(headers)
 
             for result in results:
@@ -141,6 +151,15 @@ async def download(download_job: DownloadJob):
             response.headers["Content-Disposition"] = "attachment; filename=export.csv"
             return response
+
+        elif download_job.job_format == "md":
+            response = StreamingResponse(
+                stream_md_from_job_results(results),
+                media_type="text/markdown",
+            )
+
+            response.headers["Content-Disposition"] = "attachment; filename=export.md"
+            return response
+
     except Exception as e:
         LOG.error(f"Exception occurred: {e}")
         traceback.print_exc()
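The `md` branch returns a `StreamingResponse`, which drains the generator lazily instead of buffering the whole document the way the CSV branch buffers `StringIO`. A self-contained sketch of the pattern (assumed endpoint shape, not the repo's router):

```python
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()


def fake_results():
    # Stands in for stream_md_from_job_results(results).
    yield "# Job Results Summary\n\n"
    yield "- **Job URL:** http://example.com\n"


@app.post("/download-md")
async def download_md():
    response = StreamingResponse(fake_results(), media_type="text/markdown")
    response.headers["Content-Disposition"] = "attachment; filename=export.md"
    return response
```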
@@ -12,6 +12,8 @@ from api.backend.models import Element, CapturedElement
 from api.backend.job.scraping.scraping_utils import scrape_content
 from api.backend.job.site_mapping.site_mapping import handle_site_mapping
 
+from api.backend.job.scraping.add_custom import add_custom_items
+
 LOG = logging.getLogger(__name__)
 
 
@@ -44,11 +46,13 @@ async def make_site_request(
     proxies: Optional[list[str]] = None,
     site_map: Optional[dict[str, Any]] = None,
     collect_media: bool = False,
+    custom_cookies: Optional[list[dict[str, Any]]] = None,
 ):
     if url in visited_urls:
         return
 
     proxy = None
 
     if proxies:
         proxy = random.choice(proxies)
         LOG.info(f"Using proxy: {proxy}")
@@ -56,8 +60,8 @@ async def make_site_request(
     async with AsyncCamoufox(headless=True, proxy=proxy) as browser:
         page: Page = await browser.new_page()
 
-        if headers:
-            await page.set_extra_http_headers(headers)
+        # Add cookies and headers
+        await add_custom_items(url, page, custom_cookies, headers)
 
         LOG.info(f"Visiting URL: {url}")
 
@@ -113,6 +117,7 @@ async def make_site_request(
                 proxies=proxies,
                 site_map=site_map,
                 collect_media=collect_media,
+                custom_cookies=custom_cookies,
             )
 
 
@@ -152,6 +157,7 @@ async def scrape(
     proxies: Optional[list[str]] = None,
     site_map: Optional[dict[str, Any]] = None,
     collect_media: bool = False,
+    custom_cookies: Optional[list[dict[str, Any]]] = None,
 ):
     visited_urls: set[str] = set()
     pages: set[tuple[str, str]] = set()
@@ -166,6 +172,7 @@ async def scrape(
         proxies=proxies,
         site_map=site_map,
         collect_media=collect_media,
+        custom_cookies=custom_cookies,
     )
 
     elements: list[dict[str, dict[str, list[CapturedElement]]]] = []
@@ -21,7 +21,7 @@ async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
     mock_randint.return_value = mocked_random_int
 
     # Create a DownloadJob instance
-    download_job = DownloadJob(ids=[mocked_job["id"]])
+    download_job = DownloadJob(ids=[mocked_job["id"]], job_format="csv")
 
     # Make a POST request to the /download endpoint
     response = client.post("/download", json=download_job.model_dump())
@@ -1,25 +1,53 @@
 import pytest
 import logging
-from playwright.async_api import async_playwright, Error
+from typing import Dict
+
+from playwright.async_api import async_playwright, Cookie, Route
+
+from api.backend.job.scraping.add_custom import add_custom_items
 
 logging.basicConfig(level=logging.DEBUG)
 LOG = logging.getLogger(__name__)
 
 
 @pytest.mark.asyncio
-async def test_proxy():
-    proxy = "127.0.0.1:8080"
+async def test_add_custom_items():
+    test_cookies = [{"name": "big", "value": "cookie"}]
+    test_headers = {"User-Agent": "test-agent", "Accept": "application/json"}
 
     async with async_playwright() as p:
-        browser = await p.firefox.launch(
-            headless=True, proxy={"server": f"http://{proxy}"}
-        )
+        browser = await p.chromium.launch(headless=True)
         context = await browser.new_context()
         page = await context.new_page()
 
-        with pytest.raises(Error) as excinfo:
+        # Set up request interception
+        captured_headers: Dict[str, str] = {}
+
+        async def handle_route(route: Route) -> None:
+            nonlocal captured_headers
+            captured_headers = route.request.headers
+            await route.continue_()
+
+        await page.route("**/*", handle_route)
+
+        await add_custom_items(
+            url="http://example.com",
+            page=page,
+            cookies=test_cookies,
+            headers=test_headers,
+        )
+
+        # Navigate to example.com
         await page.goto("http://example.com")
 
-        assert "NS_ERROR_PROXY_CONNECTION_REFUSED" in str(excinfo.value)
+        # Verify cookies were added
+        cookies: list[Cookie] = await page.context.cookies()
+        test_cookie = next((c for c in cookies if c.get("name") == "big"), None)
+
+        assert test_cookie is not None
+        assert test_cookie.get("value") == "cookie"
+        assert test_cookie.get("path") == "/"  # Default path should be set
+        assert test_cookie.get("sameSite") == "Lax"  # Default sameSite should be set
+
+        # Verify headers were added
+        assert captured_headers.get("user-agent") == "test-agent"
+
         await browser.close()
@@ -1,4 +1,5 @@
 import os
+import json
 
 from api.backend.job import get_queued_job, update_job
 from api.backend.scraping import scrape
@@ -34,14 +35,25 @@ async def process_job():
     LOG.info(f"Beginning processing job: {job}.")
     try:
         _ = await update_job([job["id"]], field="status", value="Scraping")
+
+        proxies = job["job_options"]["proxies"]
+
+        if proxies and isinstance(proxies[0], str) and proxies[0].startswith("{"):
+            try:
+                proxies = [json.loads(p) for p in proxies]
+            except json.JSONDecodeError:
+                LOG.error(f"Failed to parse proxy JSON: {proxies}")
+                proxies = []
+
         scraped = await scrape(
             job["url"],
             [Element(**j) for j in job["elements"]],
             job["job_options"]["custom_headers"],
             job["job_options"]["multi_page_scrape"],
-            job["job_options"]["proxies"],
+            proxies,
             job["job_options"]["site_map"],
             job["job_options"]["collect_media"],
+            job["job_options"]["custom_cookies"],
         )
         LOG.info(
             f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
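The proxy block above normalizes legacy string proxies against the new JSON-object format before handing them to `scrape`. A standalone demo of the same heuristic (not repo code):

```python
import json


def normalize_proxies(proxies: list[str]) -> list:
    # Entries that look like JSON objects are decoded into dicts
    # (e.g. Playwright-style {"server": ...}); plain strings pass through.
    if proxies and isinstance(proxies[0], str) and proxies[0].startswith("{"):
        try:
            return [json.loads(p) for p in proxies]
        except json.JSONDecodeError:
            return []
    return proxies


print(normalize_proxies(['{"server": "proxy.example.com:8080"}']))
# -> [{'server': 'proxy.example.com:8080'}]
print(normalize_proxies(["127.0.0.1:8080"]))
# -> ['127.0.0.1:8080']
```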
@@ -2,7 +2,7 @@ services:
   scraperr:
     depends_on:
       - scraperr_api
-    image: jpyles0524/scraperr:latest
+    image: jpyles0524/scraperr:1.0.13
     build:
       context: .
       dockerfile: docker/frontend/Dockerfile
@@ -30,4 +30,7 @@ EXPOSE 8000
 
 WORKDIR /project/app
 
+RUN mkdir -p /project/app/data
+RUN touch /project/app/data/database.db
+
 CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ]
helm/.helmignore (new file)

@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
helm/Chart.yaml (new file)

@@ -0,0 +1,24 @@
apiVersion: v2
name: scraperr
description: A Helm chart for Kubernetes

# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.0.14

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.16.0"
helm/templates/deployment.yaml (new file)

@@ -0,0 +1,56 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: scraperr
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      app: scraperr
  template:
    metadata:
      labels:
        app: scraperr
    spec:
      containers:
        - name: scraperr
          {{ if .Values.scraperr.image.repository }}
          image: "{{ .Values.scraperr.image.repository }}:{{ .Values.scraperr.image.tag }}"
          {{ else }}
          image: "{{ .Chart.Name }}:{{ .Chart.Version }}"
          {{ end }}
          imagePullPolicy: {{ .Values.scraperr.image.pullPolicy }}
          command: {{ .Values.scraperr.containerCommand | toJson }}
          ports:
            - containerPort: {{ .Values.scraperr.containerPort }}
          env: {{ toYaml .Values.scraperr.env | nindent 12 }}

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: scraperr-api
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      app: scraperr-api
  template:
    metadata:
      labels:
        app: scraperr-api
    spec:
      containers:
        - name: scraperr-api
          {{ if .Values.scraperrApi.image.repository }}
          image: "{{ .Values.scraperrApi.image.repository }}:{{ .Values.scraperrApi.image.tag }}"
          {{ else }}
          image: "{{ .Chart.Name }}:{{ .Chart.Version }}"
          {{ end }}
          imagePullPolicy: {{ .Values.scraperrApi.image.pullPolicy }}
          ports:
            - containerPort: {{ .Values.scraperrApi.containerPort }}
          env: {{ toYaml .Values.scraperrApi.env | nindent 12 }}
          volumeMounts: {{ toYaml .Values.scraperrApi.volumeMounts | nindent 12 }}
      volumes: {{ toYaml .Values.scraperrApi.volumes | nindent 12 }}
helm/templates/service.yaml (new file)

@@ -0,0 +1,37 @@
---
apiVersion: v1
kind: Service
metadata:
  name: scraperr
spec:
  type: {{ .Values.scraperr.serviceType }}
  selector:
    app: scraperr
  ports:
    {{- range .Values.scraperr.ports }}
    - port: {{ .port }}
      targetPort: {{ .targetPort }}
      {{- if .nodePort }}
      nodePort: {{ .nodePort }}
      {{- end }}
      protocol: {{ .protocol | default "TCP" }}
    {{- end }}

---
apiVersion: v1
kind: Service
metadata:
  name: scraperr-api
spec:
  type: {{ .Values.scraperrApi.serviceType }}
  selector:
    app: scraperr-api
  ports:
    {{- range .Values.scraperrApi.ports }}
    - port: {{ .port }}
      targetPort: {{ .targetPort }}
      {{- if .nodePort }}
      nodePort: {{ .nodePort }}
      {{- end }}
      protocol: {{ .protocol | default "TCP" }}
    {{- end }}
helm/values.yaml (new file)

@@ -0,0 +1,47 @@
scraperr:
  image:
    repository: jpyles0524/scraperr
    tag: latest
    pullPolicy: IfNotPresent
  containerCommand: ["npm", "run", "start"]
  containerPort: 3000
  serviceType: NodePort
  ports:
    - port: 80
      targetPort: 3000
      nodePort: 32300
      protocol: TCP
  env:
    - name: NEXT_PUBLIC_API_URL
      value: "http://scraperr-api:8000"
    - name: SERVER_URL
      value: "http://scraperr-api:8000"

scraperrApi:
  image:
    repository: jpyles0524/scraperr_api
    tag: latest
    pullPolicy: IfNotPresent
  containerPort: 8000
  serviceType: ClusterIP
  ports:
    - port: 8000
      targetPort: 8000
      protocol: TCP
  env:
    - name: LOG_LEVEL
      value: "INFO"
  volumeMounts:
    - name: data
      mountPath: /project/app/data
    - name: media
      mountPath: /project/app/media
  volumes:
    - name: data
      hostPath:
        path: /data/scraperr/data
        type: DirectoryOrCreate
    - name: media
      hostPath:
        path: /data/scraperr/media

replicaCount: 1
(new file, 45 lines)

@@ -0,0 +1,45 @@
import { Box, Link, Typography } from "@mui/material";
import { SetStateAction, Dispatch, useState } from "react";
import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog";
import { RawJobOptions } from "@/types";

export type AdvancedJobOptionsProps = {
  jobOptions: RawJobOptions;
  setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
};

export const AdvancedJobOptions = ({
  jobOptions,
  setJobOptions,
}: AdvancedJobOptionsProps) => {
  const [open, setOpen] = useState(false);
  return (
    <Box sx={{ mb: 2 }}>
      <Link
        component="button"
        variant="body2"
        onClick={() => setOpen(true)}
        sx={{
          textDecoration: "none",
          color: "primary.main",
          "&:hover": {
            color: "primary.dark",
            textDecoration: "underline",
          },
          paddingLeft: 1,
          display: "inline-flex",
          alignItems: "center",
          gap: 0.5,
        }}
      >
        <Typography variant="body2">Advanced Job Options</Typography>
      </Link>
      <AdvancedJobOptionsDialog
        open={open}
        onClose={() => setOpen(false)}
        jobOptions={jobOptions}
        setJobOptions={setJobOptions}
      />
    </Box>
  );
};
(new file, 269 lines)

@@ -0,0 +1,269 @@
import {
  Accordion,
  AccordionDetails,
  AccordionSummary,
  Box,
  Checkbox,
  Dialog,
  DialogContent,
  DialogTitle,
  Divider,
  FormControl,
  FormControlLabel,
  FormGroup,
  IconButton,
  TextField,
  Tooltip,
  Typography,
  useTheme,
} from "@mui/material";
import {
  ExpandMore as ExpandMoreIcon,
  InfoOutlined,
  Code as CodeIcon,
  Settings,
} from "@mui/icons-material";
import { Dispatch, SetStateAction } from "react";
import { RawJobOptions } from "@/types";
import { ExpandedTableInput } from "../../expanded-table-input";

export type AdvancedJobOptionsDialogProps = {
  open: boolean;
  onClose: () => void;
  jobOptions: RawJobOptions;
  setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
};

export const AdvancedJobOptionsDialog = ({
  open,
  onClose,
  jobOptions,
  setJobOptions,
}: AdvancedJobOptionsDialogProps) => {
  const theme = useTheme();
  const handleMultiPageScrapeChange = () => {
    setJobOptions((prevJobOptions) => ({
      ...prevJobOptions,
      multi_page_scrape: !prevJobOptions.multi_page_scrape,
    }));
  };

  const handleProxiesChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    setJobOptions((prevJobOptions) => ({
      ...prevJobOptions,
      proxies: e.target.value,
    }));
  };

  const handleCollectMediaChange = () => {
    setJobOptions((prevJobOptions) => ({
      ...prevJobOptions,
      collect_media: !prevJobOptions.collect_media,
    }));
  };

  return (
    <Dialog
      open={open}
      onClose={onClose}
      maxWidth="md"
      fullWidth
      PaperProps={{
        sx: {
          borderRadius: 2,
          boxShadow: "0 8px 32px rgba(0, 0, 0, 0.1)",
        },
      }}
    >
      <DialogTitle
        sx={{
          borderBottom: `1px solid ${theme.palette.divider}`,
          backgroundColor: theme.palette.background.default,
          color: theme.palette.primary.contrastText,
          borderRadius: 2,
          display: "flex",
          alignItems: "center",
          justifyContent: "space-between",
          padding: "1rem 2rem",
          marginRight: 2,
          marginLeft: 2,
        }}
      >
        <Typography variant="h6" component="div">
          Advanced Job Options
        </Typography>
        <Settings
          sx={{
            color: theme.palette.primary.contrastText,
          }}
        />
      </DialogTitle>

      <DialogContent
        sx={{ padding: 3, overflowY: "auto", marginTop: 2, height: "60rem" }}
      >
        <FormControl fullWidth>
          <Box sx={{ mb: 3 }}>
            <Typography
              variant="subtitle1"
              sx={{
                mb: 1,
                fontWeight: "bold",
                color: theme.palette.text.primary,
              }}
            >
              Collection Options
            </Typography>
            <Divider sx={{ mb: 2, backgroundColor: theme.palette.divider }} />

            <FormGroup row sx={{ gap: 4, mb: 1 }}>
              <FormControlLabel
                control={
                  <Checkbox
                    checked={jobOptions.multi_page_scrape}
                    onChange={handleMultiPageScrapeChange}
                  />
                }
                label={
                  <Box sx={{ display: "flex", alignItems: "center" }}>
                    <Typography>Multi Page Scrape</Typography>
                    <Tooltip title="Enable crawling through multiple pages">
                      <IconButton size="small">
                        <InfoOutlined fontSize="small" />
                      </IconButton>
                    </Tooltip>
                  </Box>
                }
              />
              <FormControlLabel
                control={
                  <Checkbox
                    checked={jobOptions.collect_media}
                    onChange={handleCollectMediaChange}
                  />
                }
                label={
                  <Box sx={{ display: "flex", alignItems: "center" }}>
                    <Typography>Collect Media</Typography>
                    <Tooltip title="Download images and other media">
                      <IconButton size="small">
                        <InfoOutlined fontSize="small" />
                      </IconButton>
                    </Tooltip>
                  </Box>
                }
              />
            </FormGroup>
          </Box>

          <Box sx={{ mb: 3 }}>
            <Typography
              variant="subtitle1"
              sx={{
                mb: 1,
                fontWeight: "bold",
                color: theme.palette.text.primary,
              }}
            >
              Custom Options
            </Typography>
            <Divider sx={{ mb: 2, backgroundColor: theme.palette.divider }} />

            {/* Proxies Section */}
            <Accordion
              defaultExpanded
              elevation={0}
              sx={{
                mb: 2,
                border: `1px solid ${theme.palette.divider}`,
                "&:before": { display: "none" },
                borderRadius: 1,
                overflow: "hidden",
                padding: 1,
              }}
            >
              <AccordionSummary
                expandIcon={<ExpandMoreIcon />}
                sx={{
                  backgroundColor: theme.palette.background.paper,
                  borderBottom: `1px solid ${theme.palette.divider}`,
                  "&.Mui-expanded": {
                    borderBottom: `1px solid ${theme.palette.divider}`,
                  },
                }}
              >
                <Box sx={{ display: "flex", alignItems: "center" }}>
                  <div
                    style={{
                      display: "flex",
                      alignItems: "center",
                      gap: "0.5rem",
                    }}
                  >
                    <Typography
                      sx={{
                        fontWeight: 500,
                        color: theme.palette.text.primary,
                      }}
                    >
                      Proxies
                    </Typography>

                    <Tooltip title="Comma separated list of proxies that should follow Playwright proxy format">
                      <InfoOutlined fontSize="small" />
                    </Tooltip>
                  </div>
                </Box>
              </AccordionSummary>
              <AccordionDetails
                sx={{ p: 2, backgroundColor: theme.palette.background.default }}
              >
                <TextField
                  placeholder='Proxies ([{"server": "proxy.example.com:8080", "username": "username", "password": "password"}])'
                  fullWidth
                  variant="outlined"
                  size="small"
                  value={jobOptions.proxies}
                  onChange={handleProxiesChange}
                  InputProps={{
                    startAdornment: (
                      <CodeIcon
                        sx={{ color: theme.palette.text.secondary, mr: 1 }}
                      />
                    ),
                  }}
                />
              </AccordionDetails>
            </Accordion>

            {/* Custom Headers Section */}
            <ExpandedTableInput
              label="Custom Headers"
              placeholder='{"User-Agent": "CustomAgent", "Accept": "*/*"}'
              urlParam="custom_headers"
              onChange={(value) => {
                setJobOptions((prevJobOptions) => ({
                  ...prevJobOptions,
                  custom_headers: value,
                }));
              }}
            />

            {/* Custom Cookies Section */}
            <ExpandedTableInput
              label="Custom Cookies"
              placeholder='[{"name": "value", "name2": "value2"}]'
              urlParam="custom_cookies"
              onChange={(value) => {
                setJobOptions((prevJobOptions) => ({
                  ...prevJobOptions,
                  custom_cookies: value,
                }));
              }}
            />
          </Box>
        </FormControl>
      </DialogContent>
    </Dialog>
  );
};
(new file, 1 line)

@@ -0,0 +1 @@
export * from "./advanced-job-options-dialog";

src/components/common/advanced-job-options/index.ts (new file)

@@ -0,0 +1 @@
export * from "./advanced-job-options";
(new file, 204 lines)

@@ -0,0 +1,204 @@
import {
  Accordion,
  AccordionSummary,
  TableCell,
  TableRow,
  Paper,
  TableBody,
  useTheme,
  TextField,
  Box,
  Typography,
  AccordionDetails,
  TableHead,
  TableContainer,
  Table,
} from "@mui/material";
import { useEffect, useState } from "react";
import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
import { parseJsonToEntries } from "@/lib/helpers/parse-json-to-entries";

export type ExpandedTableInputProps = {
  label: string;
  onChange: (value: any) => void;
  placeholder: string;
  urlParam: string;
};

export const ExpandedTableInput = ({
  label,
  onChange,
  placeholder,
  urlParam,
}: ExpandedTableInputProps) => {
  const theme = useTheme();
  const [value, setValue] = useState("");
  const [parsedHeaders, setParsedHeaders] = useState<[string, string][] | null>(
    null
  );

  const [jsonError, setJsonError] = useState<string | null>(null);

  const urlParams = new URLSearchParams(window.location.search);

  const validateAndParse = (val: string) => {
    if (val.trim() === "") {
      setParsedHeaders(null);
      setJsonError(null);
      return null;
    }

    try {
      const parsed = JSON.parse(val);
      const entries = parseJsonToEntries(val);

      if (entries === null) {
        setParsedHeaders(null);
        setJsonError("Invalid JSON object");
        return null;
      } else {
        setParsedHeaders(entries);
        setJsonError(null);
        return parsed;
      }
    } catch (e) {
      setParsedHeaders(null);
      setJsonError("Invalid JSON format");
      return null;
    }
  };

  const handleChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const val = e.target.value;
    setValue(val);
    const parsed = validateAndParse(val);
    onChange(parsed);
  };

  useEffect(() => {
    const jobOptions = urlParams.get("job_options");

    if (!jobOptions) {
      setParsedHeaders(null);
      setJsonError(null);
      return;
    }

    const jobOptionsObject = JSON.parse(jobOptions || "{}");
    let val = jobOptionsObject[urlParam];

    if (val.length === 0 || Object.keys(val).length === 0) {
      setParsedHeaders(null);
      setJsonError(null);
      return;
    }

    if (typeof val === "string") {
      try {
        val = JSON.parse(val);
      } catch {}
    }

    const finalVal =
      typeof val === "string" ? val : val != null ? JSON.stringify(val) : "";

    setValue(finalVal);
    const parsed = validateAndParse(finalVal);
    onChange(parsed);
  }, [urlParam]);

  return (
    <Accordion
      defaultExpanded
      elevation={0}
      sx={{
        mb: 2,
        border: `1px solid ${theme.palette.divider}`,
        "&:before": { display: "none" },
        borderRadius: 1,
        overflow: "hidden",
        padding: 1,
      }}
    >
      <AccordionSummary
        expandIcon={<ExpandMoreIcon />}
        sx={{
          backgroundColor: theme.palette.background.paper,
          borderBottom: `1px solid ${theme.palette.divider}`,
          "&.Mui-expanded": {
            borderBottom: `1px solid ${theme.palette.divider}`,
          },
        }}
      >
        <Box sx={{ display: "flex", alignItems: "center" }}>
          <Typography
            sx={{ fontWeight: 500, color: theme.palette.text.primary }}
          >
            {label}
          </Typography>
        </Box>
      </AccordionSummary>
      <AccordionDetails
        sx={{ p: 2, backgroundColor: theme.palette.background.default }}
      >
        <TextField
          placeholder={placeholder}
          value={value}
          onChange={handleChange}
          fullWidth
          variant="outlined"
          size="small"
          error={jsonError !== null}
          helperText={jsonError ?? ""}
        />

        {parsedHeaders && parsedHeaders.length > 0 && (
          <Paper
            variant="outlined"
            sx={{
              marginTop: 1,
              border: `1px solid ${theme.palette.divider}`,
              borderRadius: 1,
              overflow: "hidden",
              padding: 0,
            }}
          >
            <TableContainer sx={{ maxHeight: 200 }}>
              <Table size="small" stickyHeader>
                <TableHead>
                  <TableRow
                    sx={{
                      backgroundColor: theme.palette.background.paper,
                    }}
                  >
                    <TableCell sx={{ fontWeight: "bold" }}>Header</TableCell>
                    <TableCell sx={{ fontWeight: "bold" }}>Value</TableCell>
                  </TableRow>
                </TableHead>
                <TableBody>
                  {parsedHeaders.map(([key, val]) => (
                    <TableRow
                      key={key}
                      hover
                      sx={{
                        "&:nth-of-type(odd)": {
                          backgroundColor:
                            theme.palette.mode === "light"
                              ? "rgba(0, 0, 0, 0.02)"
                              : "rgba(255, 255, 255, 0.02)",
                        },
                      }}
                    >
                      <TableCell sx={{ fontWeight: 500 }}>{key}</TableCell>
                      <TableCell>{val}</TableCell>
                    </TableRow>
                  ))}
                </TableBody>
              </Table>
            </TableContainer>
          </Paper>
        )}
      </AccordionDetails>
    </Accordion>
  );
};
src/components/common/expanded-table-input/index.ts (new file)

@@ -0,0 +1 @@
export * from "./expanded-table-input";

src/components/common/job-download-dialog/index.ts (new file)

@@ -0,0 +1 @@
export * from "./job-download-dialog";
(new file, 95 lines)

@@ -0,0 +1,95 @@
import {
  Dialog,
  DialogTitle,
  DialogContent,
  DialogActions,
  Button,
  FormControl,
  RadioGroup,
  FormControlLabel,
  Radio,
  FormLabel,
  Typography,
  Box,
} from "@mui/material";
import { useState } from "react";

export type JobDownloadDialogProps = {
  open: boolean;
  onClose: () => void;
  ids: string[];
};

export const JobDownloadDialog = ({
  open,
  onClose,
  ids,
}: JobDownloadDialogProps) => {
  const [jobFormat, setJobFormat] = useState<string>("csv");
  const handleDownload = async () => {
    const response = await fetch("/api/download", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ data: { ids: ids, job_format: jobFormat } }),
    });

    if (response.ok) {
      const blob = await response.blob();
      const url = window.URL.createObjectURL(blob);
      const a = document.createElement("a");
      a.style.display = "none";
      a.href = url;
      a.download = `job_${ids[0]}.${jobFormat}`;
      document.body.appendChild(a);
      a.click();
      window.URL.revokeObjectURL(url);
      document.body.removeChild(a);
    } else {
      console.error("Failed to download the file.");
    }
  };

  return (
    <Dialog open={open} onClose={onClose}>
      <DialogTitle>Download Job</DialogTitle>
      <DialogContent>
        <FormControl>
          <Typography variant="body1">
            You are about to download {ids.length} job(s). Please select the
            format that you would like to download them in.
          </Typography>
          <br />
          <Box
            sx={{
              display: "flex",
              flexDirection: "column",
              backgroundColor: "background.paper",
              padding: 2,
              border: "1px solid",
            }}
          >
            <FormLabel>Format</FormLabel>
            <hr style={{ width: "100%", margin: "10px 0" }} />
            <RadioGroup
              aria-labelledby="job-download-format-radio-buttons"
              name="job-download-format-radio-buttons"
              value={jobFormat}
              onChange={(e) => setJobFormat(e.target.value)}
            >
              <FormControlLabel value="csv" control={<Radio />} label="CSV" />
              <FormControlLabel
                value="md"
                control={<Radio />}
                label="Markdown"
              />
            </RadioGroup>
          </Box>
          <br />
          <Button onClick={handleDownload} size="small">
            Download
          </Button>
        </FormControl>
      </DialogContent>
    </Dialog>
  );
};
@@ -20,6 +20,7 @@ import { Favorites, JobQueue } from ".";
|
|||||||
import { Job } from "../../types";
|
import { Job } from "../../types";
|
||||||
import Cookies from "js-cookie";
|
import Cookies from "js-cookie";
|
||||||
import { useSearchParams } from "next/navigation";
|
import { useSearchParams } from "next/navigation";
|
||||||
|
import { JobDownloadDialog } from "../common/job-download-dialog";
|
||||||
|
|
||||||
interface JobTableProps {
|
interface JobTableProps {
|
||||||
jobs: Job[];
|
jobs: Job[];
|
||||||
@@ -47,31 +48,15 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
|
|||||||
const [searchQuery, setSearchQuery] = useState<string>(search || "");
|
const [searchQuery, setSearchQuery] = useState<string>(search || "");
|
||||||
const [searchMode, setSearchMode] = useState<string>(type || "url");
|
const [searchMode, setSearchMode] = useState<string>(type || "url");
|
||||||
const [favoriteView, setFavoriteView] = useState<boolean>(false);
|
const [favoriteView, setFavoriteView] = useState<boolean>(false);
|
||||||
|
const [jobDownloadDialogOpen, setJobDownloadDialogOpen] =
|
||||||
|
useState<boolean>(false);
|
||||||
|
|
||||||
const token = Cookies.get("token");
|
const token = Cookies.get("token");
|
||||||
const router = useRouter();
|
const router = useRouter();
|
||||||
|
|
||||||
const handleDownload = async (ids: string[]) => {
|
const handleDownload = (ids: string[]) => {
|
||||||
const response = await fetch("/api/download", {
|
setSelectedJobs(new Set(ids));
|
||||||
method: "POST",
|
setJobDownloadDialogOpen(true);
|
||||||
headers: { "Content-Type": "application/json" },
|
|
||||||
body: JSON.stringify({ data: { ids: ids } }),
|
|
||||||
});
|
|
||||||
|
|
||||||
if (response.ok) {
|
|
||||||
const blob = await response.blob();
|
|
||||||
const url = window.URL.createObjectURL(blob);
|
|
||||||
const a = document.createElement("a");
|
|
||||||
a.style.display = "none";
|
|
||||||
a.href = url;
|
|
||||||
a.download = `job_${ids[0]}.csv`;
|
|
||||||
document.body.appendChild(a);
|
|
||||||
a.click();
|
|
||||||
window.URL.revokeObjectURL(url);
|
|
||||||
document.body.removeChild(a);
|
|
||||||
} else {
|
|
||||||
console.error("Failed to download the file.");
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const handleNavigate = (elements: Object[], url: string, options: any) => {
|
const handleNavigate = (elements: Object[], url: string, options: any) => {
|
||||||
@@ -259,17 +244,22 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
|
|||||||
onSelectJob={handleSelectJob}
|
onSelectJob={handleSelectJob}
|
||||||
onFavorite={favoriteJob}
|
onFavorite={favoriteJob}
|
||||||
onJobClick={handleJobClick}
|
onJobClick={handleJobClick}
|
||||||
></JobQueue>
|
/>
|
||||||
) : (
|
) : (
|
||||||
<Favorites
|
<Favorites
|
||||||
stateProps={{ selectedJobs, filteredJobs }}
|
stateProps={{ selectedJobs, filteredJobs }}
|
||||||
onNavigate={handleNavigate}
|
onNavigate={handleNavigate}
|
||||||
onSelectJob={handleSelectJob}
|
onSelectJob={handleSelectJob}
|
||||||
onFavorite={favoriteJob}
|
onFavorite={favoriteJob}
|
||||||
></Favorites>
|
/>
|
||||||
)}
|
)}
|
||||||
</Box>
|
</Box>
|
||||||
</Box>
|
</Box>
|
||||||
|
<JobDownloadDialog
|
||||||
|
open={jobDownloadDialogOpen}
|
||||||
|
onClose={() => setJobDownloadDialogOpen(false)}
|
||||||
|
ids={Array.from(selectedJobs)}
|
||||||
|
/>
|
||||||
</Box>
|
</Box>
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -10,12 +10,14 @@ import { JobSubmitterInput } from "./job-submitter-input";
|
|||||||
import { JobSubmitterOptions } from "./job-submitter-options";
|
import { JobSubmitterOptions } from "./job-submitter-options";
|
||||||
import { ApiService } from "@/services";
|
import { ApiService } from "@/services";
|
||||||
import { useJobSubmitterProvider } from "./provider";
|
import { useJobSubmitterProvider } from "./provider";
|
||||||
|
import { AdvancedJobOptions } from "@/components/common/advanced-job-options";
|
||||||
|
|
||||||
const initialJobOptions: RawJobOptions = {
|
const initialJobOptions: RawJobOptions = {
|
||||||
multi_page_scrape: false,
|
multi_page_scrape: false,
|
||||||
custom_headers: null,
|
custom_headers: null,
|
||||||
proxies: null,
|
proxies: null,
|
||||||
collect_media: false,
|
collect_media: false,
|
||||||
|
custom_cookies: null,
|
||||||
};
|
};
|
||||||
|
|
||||||
export const JobSubmitter = () => {
|
export const JobSubmitter = () => {
|
||||||
@@ -38,12 +40,8 @@ export const JobSubmitter = () => {
|
|||||||
const [loading, setLoading] = useState<boolean>(false);
|
const [loading, setLoading] = useState<boolean>(false);
|
||||||
const [jobOptions, setJobOptions] =
|
const [jobOptions, setJobOptions] =
|
||||||
useState<RawJobOptions>(initialJobOptions);
|
useState<RawJobOptions>(initialJobOptions);
|
||||||
const [customJSONSelected, setCustomJSONSelected] = useState<boolean>(false);
|
|
||||||
const [proxiesSelected, setProxiesSelected] = useState<boolean>(false);
|
|
||||||
|
|
||||||
const handleSelectProxies = () => {
|
console.log(jobOptions);
|
||||||
setProxiesSelected(!proxiesSelected);
|
|
||||||
};
|
|
||||||
|
|
||||||
const handleSubmit = async () => {
|
const handleSubmit = async () => {
|
||||||
if (!validateURL(submittedURL)) {
|
if (!validateURL(submittedURL)) {
|
||||||
@@ -57,12 +55,13 @@ export const JobSubmitter = () => {
|
|||||||
setLoading(true);
|
setLoading(true);
|
||||||
|
|
||||||
let customHeaders;
|
let customHeaders;
|
||||||
|
let customCookies;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
customHeaders = jobOptions.custom_headers
|
customHeaders = jobOptions.custom_headers || null;
|
||||||
? JSON.parse(jobOptions.custom_headers)
|
customCookies = jobOptions.custom_cookies || null;
|
||||||
: null;
|
} catch (error: any) {
|
||||||
} catch (error) {
|
console.error(error);
|
||||||
setSnackbarMessage("Invalid JSON in custom headers.");
|
setSnackbarMessage("Invalid JSON in custom headers.");
|
||||||
setSnackbarOpen(true);
|
setSnackbarOpen(true);
|
||||||
setSnackbarSeverity("error");
|
setSnackbarSeverity("error");
|
||||||
@@ -76,6 +75,7 @@ export const JobSubmitter = () => {
|
|||||||
user,
|
user,
|
||||||
jobOptions,
|
jobOptions,
|
||||||
customHeaders,
|
customHeaders,
|
||||||
|
customCookies,
|
||||||
siteMap
|
siteMap
|
||||||
)
|
)
|
||||||
.then(async (response) => {
|
.then(async (response) => {
|
||||||
@@ -102,16 +102,9 @@ export const JobSubmitter = () => {
|
|||||||
.finally(() => setLoading(false));
|
.finally(() => setLoading(false));
|
||||||
};
|
};
|
||||||
|
|
||||||
// Parse the job options from the query string
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
if (job_options) {
|
if (job_options) {
|
||||||
parseJobOptions(
|
parseJobOptions(job_options as string, setJobOptions, setSiteMap);
|
||||||
job_options as string,
|
|
||||||
setCustomJSONSelected,
|
|
||||||
setProxiesSelected,
|
|
||||||
setJobOptions,
|
|
||||||
setSiteMap
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
}, [job_options]);
|
}, [job_options]);
|
||||||
|
|
||||||
@@ -123,13 +116,9 @@ export const JobSubmitter = () => {
|
|||||||
handleSubmit={handleSubmit}
|
handleSubmit={handleSubmit}
|
||||||
loading={loading}
|
loading={loading}
|
||||||
/>
|
/>
|
||||||
<JobSubmitterOptions
|
<AdvancedJobOptions
|
||||||
jobOptions={jobOptions}
|
jobOptions={jobOptions}
|
||||||
setJobOptions={setJobOptions}
|
setJobOptions={setJobOptions}
|
||||||
customJSONSelected={customJSONSelected}
|
|
||||||
setCustomJSONSelected={setCustomJSONSelected}
|
|
||||||
handleSelectProxies={handleSelectProxies}
|
|
||||||
proxiesSelected={proxiesSelected}
|
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -42,12 +42,12 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
|
|||||||
params.append("username", email);
|
params.append("username", email);
|
||||||
params.append("password", password);
|
params.append("password", password);
|
||||||
const response = await axios.post(`/api/token`, params);
|
const response = await axios.post(`/api/token`, params);
|
||||||
|
const isSecure = window.location.protocol === "https:";
|
||||||
|
|
||||||
Cookies.set("token", response.data.access_token, {
|
Cookies.set("token", response.data.access_token, {
|
||||||
expires: 7,
|
expires: 7,
|
||||||
path: "/",
|
path: "/",
|
||||||
domain: "localhost",
|
secure: isSecure,
|
||||||
secure: false,
|
|
||||||
sameSite: "Lax",
|
sameSite: "Lax",
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -4,10 +4,8 @@ import { RawJobOptions, SiteMap } from "@/types";
|
|||||||
|
|
||||||
export const parseJobOptions = (
|
export const parseJobOptions = (
|
||||||
job_options: string,
|
job_options: string,
|
||||||
setCustomJSONSelected: Dispatch<SetStateAction<boolean>>,
|
|
||||||
setProxiesSelected: Dispatch<SetStateAction<boolean>>,
|
|
||||||
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>,
|
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>,
|
||||||
setSiteMap: Dispatch<SetStateAction<any>>
|
setSiteMap: Dispatch<SetStateAction<SiteMap | null>>
|
||||||
) => {
|
) => {
|
||||||
if (job_options) {
|
if (job_options) {
|
||||||
const jsonOptions = JSON.parse(job_options as string);
|
const jsonOptions = JSON.parse(job_options as string);
|
||||||
@@ -16,20 +14,23 @@ export const parseJobOptions = (
|
|||||||
custom_headers: null,
|
custom_headers: null,
|
||||||
proxies: null,
|
proxies: null,
|
||||||
collect_media: false,
|
collect_media: false,
|
||||||
|
custom_cookies: null,
|
||||||
};
|
};
|
||||||
|
|
||||||
if (
|
if (
|
||||||
jsonOptions.custom_headers &&
|
jsonOptions.custom_headers &&
|
||||||
Object.keys(jsonOptions.custom_headers).length
|
Object.keys(jsonOptions.custom_headers).length
|
||||||
) {
|
) {
|
||||||
setCustomJSONSelected(true);
|
newJobOptions.custom_headers = jsonOptions.custom_headers;
|
||||||
newJobOptions.custom_headers = JSON.stringify(jsonOptions.custom_headers);
|
}
|
||||||
|
|
||||||
|
if (jsonOptions.custom_cookies && jsonOptions.custom_cookies.length > 0) {
|
||||||
|
newJobOptions.custom_cookies = jsonOptions.custom_cookies;
|
||||||
}
|
}
|
||||||
|
|
||||||
newJobOptions.multi_page_scrape = jsonOptions.multi_page_scrape;
|
newJobOptions.multi_page_scrape = jsonOptions.multi_page_scrape;
|
||||||
|
|
||||||
if (jsonOptions.proxies.length > 0) {
|
if (jsonOptions.proxies.length > 0) {
|
||||||
setProxiesSelected(true);
|
|
||||||
newJobOptions.proxies = jsonOptions.proxies.join(",");
|
newJobOptions.proxies = jsonOptions.proxies.join(",");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
src/lib/helpers/parse-json-to-entries.ts (new file)

@@ -0,0 +1,37 @@
export const parseJsonToEntries = (json: string): [string, string][] | null => {
  try {
    const parsed = JSON.parse(json);

    if (Array.isArray(parsed)) {
      if (
        parsed.length > 0 &&
        Array.isArray(parsed[0]) &&
        parsed[0].length === 2 &&
        typeof parsed[0][0] === "string"
      ) {
        // Already array of [key, val] tuples
        // Just ensure values are strings
        return parsed.map(([k, v]) => [k, String(v)]);
      }

      // Array of objects
      const allEntries: [string, string][] = [];
      for (const item of parsed) {
        if (typeof item === "object" && item !== null) {
          allEntries.push(
            // @ts-ignore
            ...Object.entries(item).map(([k, v]) => [k, String(v)])
          );
        } else {
          return null;
        }
      }
      return allEntries.length > 0 ? allEntries : null;
    } else if (typeof parsed === "object" && parsed !== null) {
      return Object.entries(parsed).map(([k, v]) => [k, String(v)]);
    }
    return null;
  } catch {
    return null;
  }
};
@@ -6,6 +6,7 @@ export const submitJob = async (
   user: any,
   jobOptions: any,
   customHeaders: any,
+  customCookies: any,
   siteMap: SiteMap | null
 ) => {
   return await fetch(`/api/submit-scrape-job`, {
@@ -23,6 +24,7 @@ export const submitJob = async (
         custom_headers: customHeaders || {},
         proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
         site_map: siteMap,
+        custom_cookies: customCookies || [],
       },
     },
   }),
@@ -70,6 +70,16 @@ const commonThemeOptions = {
       },
     },
   },
+  MuiCheckbox: {
+    styleOverrides: {
+      colorPrimary: {
+        color: "#1976d2",
+        "&.Mui-checked": {
+          color: "#034efc",
+        },
+      },
+    },
+  },
   MuiPaper: {
     styleOverrides: {
       root: {
@@ -85,6 +95,7 @@ const lightTheme = createTheme({
     mode: "light",
     primary: {
       main: "#1976d2",
+      contrastText: "#000000",
     },
     secondary: {
       main: "#dc004e",
@@ -139,6 +150,7 @@ const darkTheme = createTheme({
     mode: "dark",
     primary: {
       main: "#90caf9",
+      contrastText: "#fff",
     },
     secondary: {
       main: "#f48fb1",
@@ -24,6 +24,7 @@ export type RawJobOptions = {
   custom_headers: string | null;
   proxies: string | null;
   collect_media: boolean;
+  custom_cookies: string | null;
 };
 
 export type ActionOption = "click" | "input";