13 Commits

Author SHA1 Message Date
Jayden Pyles
6639e8b48f chore: update chart version [skip ci] 2025-05-17 16:33:18 -05:00
Jayden Pyles
263e46ba4d feat: add media viewer + other fixes (#79)
* feat: add media viewer + other fixes

* chore: remove logging [skip ci]

* chore: remove logging [skip ci]

* feat: add unit test for media

* feat: add unit test for media

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* chore: update docs [skip ci]
2025-05-17 16:31:34 -05:00
Jayden Pyles
f815a58efc chore: update docker version [skip ci] 2025-05-16 22:04:46 -05:00
Jayden Pyles
50ec5df657 chore: update chart version [skip ci] 2025-05-16 21:39:04 -05:00
Jayden Pyles
28de0f362c feat: add recording viewer and vnc (#78)
* feat: add recording viewer and vnc

* feat: add recording viewer and vnc

* feat: add recording viewer and vnc

* feat: add recording viewer and vnc

* chore: update gitignore [skip ci]

* chore: update dev compose [skip ci]

* fix: only run manually
2025-05-16 21:37:09 -05:00
Jayden Pyles
6b33723cac feat: update version
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
Unit Tests / cypress-tests (push) Has been cancelled
Unit Tests / success-message (push) Has been cancelled
2025-05-16 14:15:53 -05:00
Jayden Pyles
5c89e4d7d2 feat: allow custom cookies (#77)
* feat: working new advanced job options

* feat: working new advanced job options

* feat: add tests for adding custom cookies/headers
2025-05-16 14:13:58 -05:00
Jayden Pyles
ed0828a585 fix: deployment
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
Unit Tests / cypress-tests (push) Has been cancelled
Unit Tests / success-message (push) Has been cancelled
2025-05-13 21:03:21 -05:00
Jayden Pyles
1b8c8c779a Feature: Allow Multiple Download Options (#75)
* feat: allow downloading in MD format

* fix: unit tests

* fix: deployments [skip ci]

* fix: deployment
2025-05-13 18:23:59 -05:00
Jayden Pyles
267cc73657 docs: update docs [skip ci] 2025-05-13 13:11:52 -05:00
Jayden Pyles
92ff16d9c3 docs: update docs [skip ci] 2025-05-12 21:37:37 -05:00
Jayden Pyles
8b2e5dc9c3 Feat/add helm chart (#69)
* chore: start on helm chart

* chore: start on helm chart

* chore: start on helm chart

* chore: start on helm chart

* chore: start on helm chart

* chore: start on helm chart

* chore: start on helm chart

* chore: start on helm chart
2025-05-12 21:19:17 -05:00
Jayden Pyles
7f1bc295ac Feat/add data reader (#68)
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
Unit Tests / cypress-tests (push) Has been cancelled
Unit Tests / success-message (push) Has been cancelled
* feat: working new data view

* feat: working new data view

* fix: remove unused deps

* fix: typing

* chore: cleanup code
2025-05-12 17:58:45 -05:00
94 changed files with 8881 additions and 11667 deletions

4
.dockerignore Normal file
View File

@@ -0,0 +1,4 @@
node_modules
npm-debug.log
Dockerfile
.dockerignore

View File

@@ -0,0 +1,50 @@
# Composite action: packages the chart found in ./helm and publishes the
# resulting archive to the gh-pages branch of the jaypyles/helm repository.
name: Publish Helm Chart
description: Publish a Helm chart to a target repository

inputs:
  app-repo-token:
    required: true
    description: "The token for the target repository"

runs:
  using: 'composite'
  steps:
    - name: Checkout app repo
      uses: actions/checkout@v4

    - name: Set up Helm
      uses: azure/setup-helm@v3

    - name: Package Helm chart
      run: |
        mkdir -p packaged
        helm package helm -d packaged
      shell: bash

    - name: Clone target Helm repo
      run: |
        git clone https://github.com/jaypyles/helm.git target-repo
        cd target-repo
        git config user.name "github-actions"
        git config user.email "github-actions@github.com"
        git fetch origin gh-pages # Fetch gh-pages explicitly
        git checkout gh-pages # Checkout gh-pages branch
        git pull origin gh-pages # Pull latest changes from gh-pages
      shell: bash

    - name: Copy package and update index
      # Every `run` step starts a fresh shell, so the chart name is supplied
      # through `env` in each step that needs it rather than as a shell variable.
      env:
        APP_NAME: scraperr
      run: |
        mkdir -p "target-repo/charts/$APP_NAME"
        cp packaged/*.tgz "target-repo/charts/$APP_NAME/"
        cd "target-repo/charts/$APP_NAME"
        helm repo index . --url "https://jaypyles.github.io/helm/charts/$APP_NAME"
      shell: bash

    - name: Commit and push to target repo
      env:
        APP_NAME: scraperr
        # Passed through the environment so the token is never interpolated
        # into the script text itself.
        APP_REPO_TOKEN: ${{ inputs.app-repo-token }}
      run: |
        cd target-repo
        git add charts/
        git commit -m "Update $APP_NAME chart $(date +'%Y-%m-%d %H:%M:%S')" || echo "No changes"
        git push "https://x-access-token:${APP_REPO_TOKEN}@github.com/jaypyles/helm.git" gh-pages
      shell: bash

View File

@@ -15,11 +15,11 @@ runs:
- name: Setup Docker project
shell: bash
run: make build up-dev
run: make build-ci up-ci
- name: Install dependencies
shell: bash
run: npm install
run: yarn install
- name: Wait for frontend to be ready
shell: bash

View File

@@ -1,19 +1,20 @@
name: Docker Image
on:
workflow_run:
workflows: ["Unit Tests"]
types:
- completed
workflow_dispatch:
jobs:
build:
if: ${{ github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.head_branch == 'master' }}
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Get version from helm chart
run: |
VERSION=$(grep "version:" ./helm/Chart.yaml | cut -d: -f2 | tr -d ' ')
echo "VERSION=$VERSION" >> $GITHUB_ENV
echo "Version is $VERSION"
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
@@ -29,7 +30,9 @@ jobs:
context: .
file: ./docker/frontend/Dockerfile
push: true
tags: ${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:latest
tags: |
${{ secrets.DOCKERHUB_USERNAME }}/scraperr:latest
${{ secrets.DOCKERHUB_USERNAME }}/scraperr:${{ env.VERSION }}
- name: Build and push api
uses: docker/build-push-action@v5
@@ -37,12 +40,27 @@ jobs:
context: .
file: ./docker/api/Dockerfile
push: true
tags: ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest
tags: |
${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest
${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:${{ env.VERSION }}
push-helm-chart:
runs-on: ubuntu-latest
needs:
- build
steps:
- uses: actions/checkout@v4
- name: Push Helm Chart
uses: ./.github/actions/push-to-helm
with:
app-repo-token: ${{ secrets.GPAT_TOKEN }}
success-message:
runs-on: ubuntu-latest
needs:
- build
- push-helm-chart
steps:
- name: Send Discord Message
uses: jaypyles/discord-webhook-action@v1.0.0

16
.gitignore vendored
View File

@@ -188,4 +188,18 @@ postgres_data
.vscode
ollama
data
media
media/images
media/videos
media/audio
media/pdfs
media/spreadsheets
media/presentations
media/documents
media/recordings
media/download_summary.txt
cypress/screenshots
cypress/videos
docker-compose.dev.local.yml

2
.prettierignore Normal file
View File

@@ -0,0 +1,2 @@
*.yaml
*.yml

View File

@@ -1,6 +1,6 @@
.DEFAULT_GOAL := help
COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.yml
COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.local.yml
COMPOSE_PROD = docker compose -f docker-compose.yml
.PHONY: help deps build pull up up-dev down setup deploy
@@ -17,6 +17,7 @@ help:
@echo " make down - Stop and remove containers, networks, images, and volumes"
@echo " make setup - Setup server with dependencies and clone repo"
@echo " make deploy - Deploy site onto server"
@echo " make cypress-start - Start Cypress"
@echo ""
logs:
@@ -51,3 +52,12 @@ setup:
deploy:
ansible-playbook -i ./ansible/inventory.yaml ./ansible/deploy_site.yaml -v
build-ci:
docker compose -f docker-compose.yml -f docker-compose.dev.yml build
up-ci:
docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d --force-recreate
cypress-start:
DISPLAY=:0 npx cypress open

View File

@@ -13,7 +13,7 @@
## 📋 Overview
Scraperr enables you to extract data from websites with precision using XPath selectors. This self-hosted application provides a clean interface to manage scraping jobs, view results, and export data.
Scrape websites without writing a single line of code.
> 📚 **[Check out the docs](https://scraperr-docs.pages.dev)** for a comprehensive quickstart guide and detailed information.
@@ -29,15 +29,21 @@ Scraperr enables you to extract data from websites with precision using XPath se
- **Custom Headers**: Add JSON headers to your scraping requests
- **Media Downloads**: Automatically download images, videos, and other media
- **Results Visualization**: View scraped data in a structured table format
- **Data Export**: Export your results in various formats
- **Data Export**: Export your results in markdown and csv formats
- **Notification Channels**: Send completion notifications through various channels
## 🚀 Getting Started
### Docker
```bash
make up
```
### Helm
> Refer to the docs for helm deployment: https://scraperr-docs.pages.dev/guides/helm-deployment
## ⚖️ Legal and Ethical Guidelines
When using Scraperr, please remember to:
@@ -48,6 +54,12 @@ When using Scraperr, please remember to:
> **Disclaimer**: Scraperr is intended for use only on websites that explicitly permit scraping. The creator accepts no responsibility for misuse of this tool.
## 💬 Join the Community
Get support, report bugs, and chat with other users and contributors.
👉 [Join the Scraperr Discord](https://discord.gg/89q7scsGEK)
## 📄 License
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.

View File

@@ -43,6 +43,14 @@ async def llama_chat(chat_messages: list[Message]) -> AsyncGenerator[str, None]:
async def openai_chat(
chat_messages: Iterable[ChatCompletionMessageParam],
) -> AsyncGenerator[str, None]:
if openai_client and not open_ai_model:
LOG.error("OpenAI model is not set")
yield "An error occurred while processing your request."
if not openai_client:
LOG.error("OpenAI client is not set")
yield "An error occurred while processing your request."
if openai_client and open_ai_model:
try:
response = openai_client.chat.completions.create(

View File

@@ -2,6 +2,7 @@
import os
import logging
import apscheduler # type: ignore
from contextlib import asynccontextmanager
# PDM
import apscheduler.schedulers
@@ -33,7 +34,30 @@ logging.basicConfig(
LOG = logging.getLogger(__name__)
app = FastAPI(title="api", root_path="/api")
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan handler passed to ``FastAPI(lifespan=...)``.

    Everything before the ``yield`` runs at startup: the database is
    initialised, then the cron scheduler is populated and started.
    Everything after the ``yield`` runs at shutdown and stops the scheduler.

    Args:
        app: The FastAPI application instance (not used by the body).
    """
    # Startup
    LOG.info("Starting application...")

    init_database()

    LOG.info("Starting cron scheduler...")
    # NOTE(review): presumably registers the stored cron jobs on the scheduler
    # before it is started - confirm against start_cron_scheduler.
    start_cron_scheduler(scheduler)
    scheduler.start()
    LOG.info("Cron scheduler started successfully")

    yield

    # Shutdown
    LOG.info("Shutting down application...")
    LOG.info("Stopping cron scheduler...")
    scheduler.shutdown(wait=False)  # Set wait=False to not block shutdown
    LOG.info("Cron scheduler stopped")
    LOG.info("Application shutdown complete")
app = FastAPI(title="api", root_path="/api", lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
@@ -43,28 +67,12 @@ app.add_middleware(
allow_headers=["*"],
)
app.include_router(auth_router)
app.include_router(ai_router)
app.include_router(job_router)
app.include_router(stats_router)
@app.on_event("startup")
async def startup_event():
start_cron_scheduler(scheduler)
scheduler.start()
if os.getenv("ENV") != "test":
init_database()
LOG.info("Starting up...")
@app.on_event("shutdown")
def shutdown_scheduler():
scheduler.shutdown(wait=False) # Set wait=False to not block shutdown
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
exc_str = f"{exc}".replace("\n", " ").replace(" ", " ")

View File

@@ -66,4 +66,8 @@ async def read_users_me(current_user: User = Depends(get_current_user)):
@auth_router.get("/auth/check")
async def check_auth():
return {"registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True"}
return {
"registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True",
"recordings_enabled": os.environ.get("RECORDINGS_ENABLED", "true").lower()
== "true",
}

View File

@@ -1 +1,16 @@
from pathlib import Path
import os
DATABASE_PATH = "data/database.db"
RECORDINGS_DIR = Path("media/recordings")
RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
MEDIA_DIR = Path("media")
MEDIA_TYPES = [
"audio",
"documents",
"images",
"pdfs",
"presentations",
"spreadsheets",
"videos",
]

View File

@@ -13,3 +13,4 @@ class JobOptions(BaseModel):
proxies: list[str] = []
site_map: Optional[SiteMap] = None
collect_media: bool = False
custom_cookies: list[dict[str, Any]] = []

View File

@@ -0,0 +1,48 @@
from typing import Any, Optional
from urllib.parse import urlparse
from playwright.async_api import Page, BrowserContext
import logging
LOG = logging.getLogger(__name__)
async def add_custom_cookies(
    custom_cookies: list[dict[str, Any]],
    url: str,
    context: BrowserContext,
) -> None:
    """Add user-supplied cookies to a browser context, scoped to the URL's host.

    Args:
        custom_cookies: Cookie dicts; only the ``name`` and ``value`` keys are
            read, with ``"default_name"`` / ``"default_value"`` as fallbacks.
        url: The URL about to be visited; its host becomes the cookie domain.
        context: The browser context that receives the cookies.
    """
    parsed_url = urlparse(url)
    # A cookie domain is a bare host. `netloc` would also carry the port and
    # any `user:pass@` prefix (e.g. "localhost:8000"), so prefer `hostname`
    # and only fall back to `netloc` when no host could be parsed.
    domain = parsed_url.hostname or parsed_url.netloc

    for cookie in custom_cookies:
        cookie_dict = {
            "name": cookie.get("name", "default_name"),
            "value": cookie.get("value", "default_value"),
            "domain": domain,
            "path": "/",
        }

        # Cookie values are frequently session secrets: log the name only.
        LOG.info(f"Adding cookie: {cookie_dict['name']} (domain: {domain})")
        await context.add_cookies([cookie_dict])  # type: ignore
async def add_custom_headers(custom_headers: dict[str, Any], page: Page) -> None:
    """Set ``custom_headers`` as the extra HTTP headers of ``page``.

    Args:
        custom_headers: Header name to value mapping, passed through unchanged.
        page: The page whose extra HTTP headers are set.
    """
    await page.set_extra_http_headers(custom_headers)
async def add_custom_items(
    url: str,
    page: Page,
    cookies: Optional[list[dict[str, Any]]] = None,
    headers: Optional[dict[str, Any]] = None,
) -> None:
    """Apply optional custom cookies and headers to ``page``.

    Cookies are handled first, then headers; either step is skipped when its
    argument is ``None`` or empty.

    Args:
        url: URL handed to the cookie helper together with the cookies.
        page: Page receiving the headers; its context receives the cookies.
        cookies: Optional list of cookie dicts.
        headers: Optional header name to value mapping.
    """
    if cookies:
        browser_context = page.context
        await add_custom_cookies(
            custom_cookies=cookies, url=url, context=browser_context
        )

    if headers:
        await add_custom_headers(custom_headers=headers, page=page)

View File

@@ -1,6 +1,7 @@
import os
from pathlib import Path
from urllib.parse import urlparse
import re
from urllib.parse import urljoin, urlparse
from typing import Dict, List
import aiohttp
@@ -9,12 +10,12 @@ from playwright.async_api import Page
from api.backend.utils import LOG
async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
async def collect_media(id: str, page: Page) -> dict[str, list[dict[str, str]]]:
media_types = {
"images": "img",
"videos": "video",
"audio": "audio",
"pdfs": 'a[href$=".pdf"]',
"pdfs": 'a[href$=".pdf"], a[href*=".pdf#page="]',
"documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]',
"presentations": 'a[href$=".ppt"], a[href$=".pptx"]',
"spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]',
@@ -48,6 +49,11 @@ async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
root_domain = f"{root_url.scheme}://{root_url.netloc}"
url = f"{root_domain}{url}"
if url and re.match(r"^[\w\-]+/", url):
root_url = urlparse(page.url)
root_domain = f"{root_url.scheme}://{root_url.netloc}"
url = urljoin(root_domain + "/", url)
if url and url.startswith(("http://", "https://")):
try:
parsed = urlparse(url)
@@ -67,15 +73,20 @@ async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
}.get(media_type, "")
filename += ext
file_path = media_dir / filename
if not os.path.exists(media_dir / id):
os.makedirs(media_dir / id, exist_ok=True)
file_path = media_dir / id / f"{filename}"
async with session.get(url) as response:
response.raise_for_status()
with open(file_path, "wb") as f:
while True:
chunk = await response.content.read(8192)
if not chunk:
break
f.write(chunk)
urls.append({"url": url, "local_path": str(file_path)})

View File

@@ -8,7 +8,7 @@ from api.backend.job.scraping.collect_media import collect_media as collect_medi
async def scrape_content(
page: Page, pages: Set[Tuple[str, str]], collect_media: bool
id: str, page: Page, pages: Set[Tuple[str, str]], collect_media: bool
) -> str:
last_height = await page.evaluate("document.body.scrollHeight")
@@ -27,6 +27,6 @@ async def scrape_content(
if collect_media:
LOG.info("Collecting media")
await collect_media_utils(page)
await collect_media_utils(id, page)
return html

View File

@@ -0,0 +1,36 @@
from typing import Any
from api.backend.utils import clean_text
def clean_job_format(jobs: list[dict[str, Any]]) -> dict[str, Any]:
    """
    Flatten a list of jobs into a table-like dictionary.

    One row is produced for every captured value whose cleaned text is
    non-empty. A job with a missing or null ``result`` contributes no rows
    instead of raising.

    Args:
        jobs: Job dicts. ``result`` is read as a list of
            ``{url: {element_name: [value, ...]}}`` mappings, where each value
            may carry ``text`` and ``xpath`` keys.

    Returns:
        ``{"headers": [...], "rows": [...]}`` where every row is a dict keyed
        by the header names.
    """
    headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]

    cleaned_rows: list[dict[str, Any]] = []

    for job in jobs:
        for res in job.get("result") or []:
            for url, elements in res.items():
                for element_name, values in elements.items():
                    for value in values:
                        text = clean_text(value.get("text", "")).strip()
                        if not text:
                            # Values that clean down to nothing are dropped
                            continue

                        cleaned_rows.append(
                            {
                                "id": job.get("id", ""),
                                "url": url,
                                "element_name": element_name,
                                "xpath": value.get("xpath", ""),
                                "text": text,
                                "user": job.get("user", ""),
                                "time_created": job.get("time_created", ""),
                            }
                        )

    return {
        "headers": headers,
        "rows": cleaned_rows,
    }

View File

@@ -0,0 +1,24 @@
from typing import Any
from api.backend.utils import clean_text
def stream_md_from_job_results(jobs: list[dict[str, Any]]):
    """Yield a Markdown report of the given jobs, chunk by chunk.

    The stream starts with a document title. Each job then gets a numbered
    heading, its URL / timestamp / id, and one bullet per captured value whose
    cleaned text is non-empty, followed by a horizontal rule.

    Args:
        jobs: Job dicts. ``result`` is read as a list of
            ``{url: {element_name: [value, ...]}}`` mappings.

    Yields:
        Markdown fragments, in document order.
    """
    # The title and the per-job headings must be yielded: collecting them in a
    # local string would leave them out of the streamed document.
    yield "# Job Results Summary\n\n"

    for i, job in enumerate(jobs, start=1):
        yield f"## Job #{i}\n"
        yield f"- **Job URL:** {job.get('url', 'N/A')}\n"
        yield f"- **Timestamp:** {job.get('time_created', 'N/A')}\n"
        yield f"- **ID:** {job.get('id', 'N/A')}\n"
        yield "### Extracted Results:\n"

        for res in job.get("result", []):
            for url, elements in res.items():
                yield f"\n#### URL: {url}\n"
                for element_name, values in elements.items():
                    for value in values:
                        text = clean_text(value.get("text", "")).strip()
                        if text:
                            yield f"- **Element:** `{element_name}`\n"
                            # Two spaces of indent so the bullet nests under
                            # the element bullet above.
                            yield f"  - **Text:** {text}\n"

        yield "\n---\n"

View File

@@ -1,5 +1,5 @@
# STL
from typing import Any, Optional, Union
from typing import Any, Literal, Optional, Union
from datetime import datetime
# LOCAL
@@ -27,6 +27,7 @@ class RetrieveScrapeJobs(pydantic.BaseModel):
class DownloadJob(pydantic.BaseModel):
ids: list[str]
job_format: Literal["csv", "md"]
class DeleteScrapeJobs(pydantic.BaseModel):

View File

@@ -10,7 +10,7 @@ import random
# PDM
from fastapi import Depends, APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse, StreamingResponse
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from api.backend.scheduler import scheduler
from apscheduler.triggers.cron import CronTrigger # type: ignore
@@ -39,6 +39,11 @@ from api.backend.job.cron_scheduling.cron_scheduling import (
insert_job_from_cron_job,
)
from api.backend.job.utils.clean_job_format import clean_job_format
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results
from api.backend.constants import MEDIA_DIR, MEDIA_TYPES, RECORDINGS_DIR
LOG = logging.getLogger(__name__)
job_router = APIRouter()
@@ -104,41 +109,72 @@ async def download(download_job: DownloadJob):
)
results = query(job_query, tuple(download_job.ids))
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
if download_job.job_format == "csv":
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]
csv_writer.writerow(headers)
headers = [
"id",
"url",
"element_name",
"xpath",
"text",
"user",
"time_created",
]
csv_writer.writerow(headers)
for result in results:
for res in result["result"]:
for url, elements in res.items():
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", "")).strip()
if text:
csv_writer.writerow(
[
result.get("id", "")
+ "-"
+ str(random.randint(0, 1000000)),
url,
element_name,
value.get("xpath", ""),
text,
result.get("user", ""),
result.get("time_created", ""),
]
)
for result in results:
for res in result["result"]:
for url, elements in res.items():
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", "")).strip()
if text:
csv_writer.writerow(
[
result.get("id", "")
+ "-"
+ str(random.randint(0, 1000000)),
url,
element_name,
value.get("xpath", ""),
text,
result.get("user", ""),
result.get("time_created", ""),
]
)
_ = csv_buffer.seek(0)
response = StreamingResponse(
csv_buffer,
media_type="text/csv",
)
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response
_ = csv_buffer.seek(0)
response = StreamingResponse(
csv_buffer,
media_type="text/csv",
)
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response
elif download_job.job_format == "md":
response = StreamingResponse(
stream_md_from_job_results(results),
media_type="text/markdown",
)
response.headers["Content-Disposition"] = "attachment; filename=export.md"
return response
except Exception as e:
LOG.error(f"Exception occurred: {e}")
traceback.print_exc()
return {"error": str(e)}
@job_router.get("/job/{id}/convert-to-csv")
async def convert_to_csv(id: str):
try:
job_query = f"SELECT * FROM jobs WHERE id = ?"
results = query(job_query, (id,))
return JSONResponse(content=clean_job_format(results))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
traceback.print_exc()
@@ -197,3 +233,41 @@ async def delete_cron_job_request(request: DeleteCronJob):
async def get_cron_jobs_request(user: User = Depends(get_current_user)):
cron_jobs = get_cron_jobs(user.email)
return JSONResponse(content=jsonable_encoder(cron_jobs))
@job_router.get("/recordings/{id}")
async def get_recording(id: str):
    """Serve the ``<id>.mp4`` file from the recordings directory.

    Returns a 404 JSON error when no such file exists.
    """
    recording_path = RECORDINGS_DIR / f"{id}.mp4"

    if recording_path.exists():
        response_headers = {"Content-Type": "video/mp4", "Accept-Ranges": "bytes"}
        return FileResponse(recording_path, headers=response_headers)

    return JSONResponse(content={"error": "Recording not found."}, status_code=404)
@job_router.get("/get-media")
async def get_media(id: str):
    """List the media file names stored for a job, grouped by media type.

    Args:
        id: Job id, used as the directory name below each media type folder.

    Returns:
        ``{"files": {media_type: [file_name, ...]}}``; a 400 JSON error when
        ``id`` does not name a directory inside the media type folder; a 500
        JSON error on any unexpected exception.
    """
    try:
        media_root = MEDIA_DIR.resolve()
        files: dict[str, list[str]] = {}

        for media_type in MEDIA_TYPES:
            type_root = (media_root / media_type).resolve()
            job_dir = (type_root / id).resolve()

            # `id` comes straight from the query string: values such as ".."
            # or an absolute path must not escape the media type folder.
            if type_root not in job_dir.parents:
                return JSONResponse(
                    content={"error": "Invalid job id."}, status_code=400
                )

            files[media_type] = [entry.name for entry in job_dir.glob("*")]

        return JSONResponse(content={"files": files})
    except Exception as e:
        LOG.error(f"Exception occurred: {e}")
        traceback.print_exc()
        return JSONResponse(content={"error": str(e)}, status_code=500)
@job_router.get("/media")
async def get_media_file(id: str, type: str, file: str):
    """Serve one media file, addressed by media type, job id and file name.

    Args:
        id: Job id (directory below the media type folder).
        type: Media type folder name.
        file: File name inside the job directory.

    Returns:
        The file as a ``FileResponse``, or a 404 JSON error when the path does
        not point to a regular file inside the media directory.
    """
    media_root = MEDIA_DIR.resolve()
    path = (media_root / type / id / file).resolve()

    # All three parameters are untrusted query values. Resolving first and then
    # requiring the result to live under the media root blocks "../" and
    # absolute-path traversal; `is_file` keeps directories away from
    # FileResponse.
    if media_root not in path.parents or not path.is_file():
        return JSONResponse(content={"error": "Media file not found."}, status_code=404)

    return FileResponse(path)

View File

@@ -12,6 +12,10 @@ from api.backend.models import Element, CapturedElement
from api.backend.job.scraping.scraping_utils import scrape_content
from api.backend.job.site_mapping.site_mapping import handle_site_mapping
from api.backend.job.scraping.add_custom import add_custom_items
from api.backend.constants import RECORDINGS_ENABLED
LOG = logging.getLogger(__name__)
@@ -35,6 +39,7 @@ def sxpath(context: etree._Element, xpath: str):
async def make_site_request(
id: str,
url: str,
headers: Optional[dict[str, Any]],
multi_page_scrape: bool = False,
@@ -44,33 +49,36 @@ async def make_site_request(
proxies: Optional[list[str]] = None,
site_map: Optional[dict[str, Any]] = None,
collect_media: bool = False,
custom_cookies: Optional[list[dict[str, Any]]] = None,
):
if url in visited_urls:
return
proxy = None
if proxies:
proxy = random.choice(proxies)
LOG.info(f"Using proxy: {proxy}")
async with AsyncCamoufox(headless=True, proxy=proxy) as browser:
async with AsyncCamoufox(headless=not RECORDINGS_ENABLED, proxy=proxy) as browser:
page: Page = await browser.new_page()
await page.set_viewport_size({"width": 1920, "height": 1080})
if headers:
await page.set_extra_http_headers(headers)
# Add cookies and headers
await add_custom_items(url, page, custom_cookies, headers)
LOG.info(f"Visiting URL: {url}")
try:
await page.goto(url, timeout=60000)
await page.wait_for_load_state("networkidle", timeout=10000)
await page.wait_for_load_state("networkidle")
final_url = page.url
visited_urls.add(url)
visited_urls.add(final_url)
html_content = await scrape_content(page, pages, collect_media)
html_content = await scrape_content(id, page, pages, collect_media)
html_content = await page.content()
pages.add((html_content, final_url))
@@ -104,6 +112,7 @@ async def make_site_request(
if link not in visited_urls and is_same_domain(link, original_url):
await make_site_request(
id,
link,
headers=headers,
multi_page_scrape=multi_page_scrape,
@@ -113,6 +122,7 @@ async def make_site_request(
proxies=proxies,
site_map=site_map,
collect_media=collect_media,
custom_cookies=custom_cookies,
)
@@ -127,11 +137,20 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
for e in el: # type: ignore
text = (
"\t".join(str(t) for t in e.itertext())
" ".join(str(t) for t in e.itertext())
if isinstance(e, etree._Element)
else str(e) # type: ignore
)
text = text.strip()
text = text.replace("\n", " ")
text = text.replace("\t", " ")
text = text.replace("\r", " ")
text = text.replace("\f", " ")
text = text.replace("\v", " ")
text = text.replace("\b", " ")
text = text.replace("\a", " ")
captured_element = CapturedElement(
xpath=elem.xpath, text=text, name=elem.name
)
@@ -145,6 +164,7 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
async def scrape(
id: str,
url: str,
xpaths: list[Element],
headers: Optional[dict[str, Any]] = None,
@@ -152,11 +172,13 @@ async def scrape(
proxies: Optional[list[str]] = None,
site_map: Optional[dict[str, Any]] = None,
collect_media: bool = False,
custom_cookies: Optional[list[dict[str, Any]]] = None,
):
visited_urls: set[str] = set()
pages: set[tuple[str, str]] = set()
await make_site_request(
id,
url,
headers=headers,
multi_page_scrape=multi_page_scrape,
@@ -166,6 +188,7 @@ async def scrape(
proxies=proxies,
site_map=site_map,
collect_media=collect_media,
custom_cookies=custom_cookies,
)
elements: list[dict[str, dict[str, list[CapturedElement]]]] = []

View File

@@ -21,7 +21,7 @@ async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
mock_randint.return_value = mocked_random_int
# Create a DownloadJob instance
download_job = DownloadJob(ids=[mocked_job["id"]])
download_job = DownloadJob(ids=[mocked_job["id"]], job_format="csv")
# Make a POST request to the /download endpoint
response = client.post("/download", json=download_job.model_dump())

View File

@@ -1,25 +1,53 @@
import pytest
import logging
from playwright.async_api import async_playwright, Error
from typing import Dict
from playwright.async_api import async_playwright, Cookie, Route
from api.backend.job.scraping.add_custom import add_custom_items
logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger(__name__)
@pytest.mark.asyncio
async def test_proxy():
proxy = "127.0.0.1:8080"
async def test_add_custom_items():
test_cookies = [{"name": "big", "value": "cookie"}]
test_headers = {"User-Agent": "test-agent", "Accept": "application/json"}
async with async_playwright() as p:
browser = await p.firefox.launch(
headless=True, proxy={"server": f"http://{proxy}"}
)
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
with pytest.raises(Error) as excinfo:
await page.goto("http://example.com")
# Set up request interception
captured_headers: Dict[str, str] = {}
assert "NS_ERROR_PROXY_CONNECTION_REFUSED" in str(excinfo.value)
async def handle_route(route: Route) -> None:
nonlocal captured_headers
captured_headers = route.request.headers
await route.continue_()
await page.route("**/*", handle_route)
await add_custom_items(
url="http://example.com",
page=page,
cookies=test_cookies,
headers=test_headers,
)
# Navigate to example.com
await page.goto("http://example.com")
# Verify cookies were added
cookies: list[Cookie] = await page.context.cookies()
test_cookie = next((c for c in cookies if c.get("name") == "big"), None)
assert test_cookie is not None
assert test_cookie.get("value") == "cookie"
assert test_cookie.get("path") == "/" # Default path should be set
assert test_cookie.get("sameSite") == "Lax" # Default sameSite should be set
# Verify headers were added
assert captured_headers.get("user-agent") == "test-agent"
await browser.close()

View File

@@ -1,9 +1,12 @@
import os
import json
from pathlib import Path
from api.backend.job import get_queued_job, update_job
from api.backend.scraping import scrape
from api.backend.models import Element
from fastapi.encoders import jsonable_encoder
import subprocess
import asyncio
import traceback
@@ -25,23 +28,63 @@ SMTP_USER = os.getenv("SMTP_USER", "")
SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
USE_TLS = os.getenv("USE_TLS", "false").lower() == "true"
RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
RECORDINGS_DIR = Path("/project/app/media/recordings")
async def process_job():
job = await get_queued_job()
ffmpeg_proc = None
status = "Queued"
if job:
LOG.info(f"Beginning processing job: {job}.")
try:
output_path = RECORDINGS_DIR / f"{job['id']}.mp4"
if RECORDINGS_ENABLED:
ffmpeg_proc = subprocess.Popen(
[
"ffmpeg",
"-y",
"-video_size",
"1280x1024",
"-framerate",
"15",
"-f",
"x11grab",
"-i",
":99",
"-codec:v",
"libx264",
"-preset",
"ultrafast",
output_path,
]
)
_ = await update_job([job["id"]], field="status", value="Scraping")
proxies = job["job_options"]["proxies"]
if proxies and isinstance(proxies[0], str) and proxies[0].startswith("{"):
try:
proxies = [json.loads(p) for p in proxies]
except json.JSONDecodeError:
LOG.error(f"Failed to parse proxy JSON: {proxies}")
proxies = []
scraped = await scrape(
job["id"],
job["url"],
[Element(**j) for j in job["elements"]],
job["job_options"]["custom_headers"],
job["job_options"]["multi_page_scrape"],
job["job_options"]["proxies"],
proxies,
job["job_options"]["site_map"],
job["job_options"]["collect_media"],
job["job_options"]["custom_cookies"],
)
LOG.info(
f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
@@ -75,12 +118,18 @@ async def process_job():
},
)
if ffmpeg_proc:
ffmpeg_proc.terminate()
ffmpeg_proc.wait()
async def main():
LOG.info("Starting job worker...")
init_database()
RECORDINGS_DIR.mkdir(parents=True, exist_ok=True)
while True:
await process_job()
await asyncio.sleep(5)

View File

@@ -30,5 +30,59 @@ describe.only("Job", () => {
"exist"
);
cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
cy.get("tbody tr")
.first()
.within(() => {
cy.get('input[type="checkbox"]').click();
});
cy.get("[data-testid='DeleteIcon']").click();
cy.contains("div", "https://example.com", { timeout: 10000 }).should(
"not.exist"
);
});
it("should create a job with advanced options (media)", () => {
cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
cy.visit("/");
cy.get("button").contains("Advanced Job Options").click();
cy.get('[data-cy="collect-media-checkbox"]').click();
cy.get("body").type("{esc}");
cy.get('[data-cy="url-input"]').type("https://books.toscrape.com");
cy.get('[data-cy="name-field"]').type("example");
cy.get('[data-cy="xpath-field"]').type("//body");
cy.get('[data-cy="add-button"]').click();
cy.get("button").contains("Submit").click();
cy.get("li").contains("Jobs").click();
cy.contains("div", "https://books.toscrape.com", { timeout: 10000 }).should(
"exist"
);
cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
cy.get("li").contains("Media").click();
cy.get("div[id='select-job']").click();
cy.get("li[role='option']").click();
cy.get("[data-testid='media-grid']", { timeout: 10000 }).should("exist");
cy.get("li").contains("Jobs").click();
cy.get("tbody tr")
.first()
.within(() => {
cy.get('input[type="checkbox"]').click();
});
cy.get("[data-testid='DeleteIcon']").click();
});
});

View File

@@ -1,6 +1,9 @@
version: "3"
services:
scraperr:
build:
context: .
dockerfile: docker/frontend/Dockerfile
command: ["npm", "run", "dev"]
volumes:
- "$PWD/src:/app/src"
@@ -10,7 +13,12 @@ services:
- "$PWD/package-lock.json:/app/package-lock.json"
- "$PWD/tsconfig.json:/app/tsconfig.json"
scraperr_api:
build:
context: .
dockerfile: docker/api/Dockerfile
environment:
- LOG_LEVEL=INFO
volumes:
- "$PWD/api:/project/app/api"
ports:
- "5900:5900"

View File

@@ -1,11 +1,6 @@
services:
scraperr:
depends_on:
- scraperr_api
image: jpyles0524/scraperr:latest
build:
context: .
dockerfile: docker/frontend/Dockerfile
container_name: scraperr
command: ["npm", "run", "start"]
environment:
@@ -18,9 +13,6 @@ services:
scraperr_api:
init: True
image: jpyles0524/scraperr_api:latest
build:
context: .
dockerfile: docker/api/Dockerfile
environment:
- LOG_LEVEL=INFO
container_name: scraperr_api

View File

@@ -3,7 +3,7 @@ FROM python:3.10.12-slim as pybuilder
RUN apt-get update && \
apt-get install -y curl && \
apt-get install -y uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 && \
apt-get install -y x11vnc xvfb uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 ffmpeg && \
curl -LsSf https://astral.sh/uv/install.sh | sh && \
apt-get remove -y curl && \
apt-get autoremove -y && \
@@ -14,7 +14,8 @@ RUN pdm config python.use_venv false
WORKDIR /project/app
COPY pyproject.toml pdm.lock /project/app/
RUN pdm install
RUN pdm install -v --frozen-lockfile
RUN pdm run playwright install --with-deps
@@ -30,4 +31,12 @@ EXPOSE 8000
WORKDIR /project/app
RUN mkdir -p /project/app/media
RUN mkdir -p /project/app/data
RUN touch /project/app/data/database.db
EXPOSE 5900
COPY start.sh /project/app/start.sh
CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ]

View File

@@ -1,10 +1,14 @@
# Build next dependencies
FROM node:23.1
FROM node:23.1-slim
WORKDIR /app
COPY package*.json ./
RUN npm install
# Copy package files first to leverage Docker cache
COPY package.json yarn.lock ./
# Install dependencies in a separate layer
RUN yarn install --frozen-lockfile
# Copy the rest of the application
COPY tsconfig.json /app/tsconfig.json
COPY tailwind.config.js /app/tailwind.config.js
COPY next.config.mjs /app/next.config.mjs
@@ -13,6 +17,7 @@ COPY postcss.config.js /app/postcss.config.js
COPY public /app/public
COPY src /app/src
RUN npm run build
# Build the application
RUN yarn build
EXPOSE 3000

Binary file not shown.

Before

Width:  |  Height:  |  Size: 47 KiB

After

Width:  |  Height:  |  Size: 48 KiB

23
helm/.helmignore Normal file
View File

@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/

24
helm/Chart.yaml Normal file
View File

@@ -0,0 +1,24 @@
apiVersion: v2
name: scraperr
description: A Helm chart for Kubernetes
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.0.16
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.16.0"

View File

@@ -0,0 +1,56 @@
---
# Deployment for the scraperr container (started with the configured
# containerCommand). All tunables come from the `scraperr` section of the
# chart values; the replica count comes from the top-level `replicaCount`.
apiVersion: apps/v1
kind: Deployment
metadata:
name: scraperr
spec:
replicas: {{ .Values.replicaCount }}
selector:
matchLabels:
app: scraperr
template:
metadata:
labels:
app: scraperr
spec:
containers:
- name: scraperr
# Use the repository and tag from values when a repository is set; otherwise
# fall back to an image named after the chart, tagged with the chart version.
{{ if .Values.scraperr.image.repository }}
image: "{{ .Values.scraperr.image.repository }}:{{ .Values.scraperr.image.tag }}"
{{ else }}
image: "{{ .Chart.Name }}:{{ .Chart.Version }}"
{{ end }}
imagePullPolicy: {{ .Values.scraperr.image.pullPolicy }}
# The command list is rendered as a JSON array, which is valid YAML flow syntax.
command: {{ .Values.scraperr.containerCommand | toJson }}
ports:
- containerPort: {{ .Values.scraperr.containerPort }}
# The env list is rendered on new lines, indented 12 spaces.
env: {{ toYaml .Values.scraperr.env | nindent 12 }}
---
# Deployment for the scraperr-api container. Tunables come from the
# `scraperrApi` section of the chart values; it shares the top-level
# `replicaCount` with the scraperr Deployment above.
apiVersion: apps/v1
kind: Deployment
metadata:
name: scraperr-api
spec:
replicas: {{ .Values.replicaCount }}
selector:
matchLabels:
app: scraperr-api
template:
metadata:
labels:
app: scraperr-api
spec:
containers:
- name: scraperr-api
# NOTE(review): the fallback below is the same chart-name image used by the
# scraperr Deployment; confirm that is intended for the API container.
{{ if .Values.scraperrApi.image.repository }}
image: "{{ .Values.scraperrApi.image.repository }}:{{ .Values.scraperrApi.image.tag }}"
{{ else }}
image: "{{ .Chart.Name }}:{{ .Chart.Version }}"
{{ end }}
imagePullPolicy: {{ .Values.scraperrApi.image.pullPolicy }}
ports:
- containerPort: {{ .Values.scraperrApi.containerPort }}
env: {{ toYaml .Values.scraperrApi.env | nindent 12 }}
# Mounts and the volumes backing them are both taken verbatim from values.
volumeMounts: {{ toYaml .Values.scraperrApi.volumeMounts | nindent 12 }}
volumes: {{ toYaml .Values.scraperrApi.volumes | nindent 12 }}

View File

@@ -0,0 +1,37 @@
---
# Service in front of the pods labelled app=scraperr. The service type and
# port list come from the `scraperr` section of the chart values.
apiVersion: v1
kind: Service
metadata:
name: scraperr
spec:
type: {{ .Values.scraperr.serviceType }}
selector:
app: scraperr
ports:
# One port entry is emitted per item in the values list. nodePort is written
# only when the item defines it, and protocol falls back to TCP.
{{- range .Values.scraperr.ports }}
- port: {{ .port }}
targetPort: {{ .targetPort }}
{{- if .nodePort }}
nodePort: {{ .nodePort }}
{{- end }}
protocol: {{ .protocol | default "TCP" }}
{{- end }}
---
# Service in front of the pods labelled app=scraperr-api. The service type and
# port list come from the `scraperrApi` section of the chart values.
apiVersion: v1
kind: Service
metadata:
name: scraperr-api
spec:
type: {{ .Values.scraperrApi.serviceType }}
selector:
app: scraperr-api
ports:
# Same per-port rendering rules as the scraperr Service above in this file.
{{- range .Values.scraperrApi.ports }}
- port: {{ .port }}
targetPort: {{ .targetPort }}
{{- if .nodePort }}
nodePort: {{ .nodePort }}
{{- end }}
protocol: {{ .protocol | default "TCP" }}
{{- end }}

47
helm/values.yaml Normal file
View File

@@ -0,0 +1,47 @@
# Values for the scraperr container and its Service.
scraperr:
  image:
    repository: jpyles0524/scraperr
    # NOTE(review): with a mutable tag such as `latest`, IfNotPresent keeps
    # using whatever image a node has already cached; confirm this is wanted.
    tag: latest
    pullPolicy: IfNotPresent
  # Command the container is started with.
  containerCommand: ["npm", "run", "start"]
  containerPort: 3000
  serviceType: NodePort
  ports:
    - port: 80
      targetPort: 3000
      nodePort: 32300
      protocol: TCP
  env:
    # Both URLs point at the scraperr-api Service on port 8000.
    - name: NEXT_PUBLIC_API_URL
      value: "http://scraperr-api:8000"
    - name: SERVER_URL
      value: "http://scraperr-api:8000"

# Values for the scraperr-api container and its Service.
scraperrApi:
  image:
    repository: jpyles0524/scraperr_api
    tag: latest
    pullPolicy: IfNotPresent
  containerPort: 8000
  serviceType: ClusterIP
  ports:
    - port: 8000
      targetPort: 8000
      protocol: TCP
  env:
    - name: LOG_LEVEL
      value: "INFO"
  volumeMounts:
    - name: data
      mountPath: /project/app/data
    - name: media
      mountPath: /project/app/media
  volumes:
    - name: data
      hostPath:
        path: /data/scraperr/data
        type: DirectoryOrCreate
    - name: media
      hostPath:
        path: /data/scraperr/media
        # Declared explicitly so the media directory is handled the same way
        # as the data directory above (created on the node when missing).
        type: DirectoryOrCreate

# Replica count applied to both Deployments.
replicaCount: 1

11371
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -12,9 +12,11 @@
"@minchat/react-chat-ui": "^0.16.2",
"@mui/icons-material": "^5.15.3",
"@mui/material": "^5.16.0",
"@reduxjs/toolkit": "^2.8.2",
"@testing-library/jest-dom": "^5.16.5",
"@testing-library/react": "^13.4.0",
"@testing-library/user-event": "^13.5.0",
"@types/react": "^18.3.21",
"axios": "^1.7.2",
"bootstrap": "^5.3.0",
"chart.js": "^4.4.3",
@@ -30,16 +32,18 @@
"react-dom": "^18.3.1",
"react-markdown": "^9.0.0",
"react-modal-image": "^2.6.0",
"react-redux": "^9.2.0",
"react-router": "^6.14.1",
"react-router-dom": "^6.14.1",
"react-spinners": "^0.14.1",
"redux-persist": "^6.0.0",
"typescript": "^4.9.5",
"web-vitals": "^2.1.4"
},
"scripts": {
"dev": "next dev",
"build": "next build",
"start": "next start",
"dev": "yarn next dev",
"build": "yarn next build",
"start": "yarn next start",
"serve": "serve -s ./dist",
"cy:open": "cypress open",
"cy:run": "cypress run"

View File

@@ -1,17 +1,23 @@
import React, { useState, useEffect, Dispatch, useRef } from "react";
import React, { useState, Dispatch, useEffect } from "react";
import { Job } from "../../types";
import { fetchJobs } from "../../lib";
import Box from "@mui/material/Box";
import InputLabel from "@mui/material/InputLabel";
import FormControl from "@mui/material/FormControl";
import Select from "@mui/material/Select";
import Popover from "@mui/material/Popover";
import { Typography, MenuItem, useTheme } from "@mui/material";
import {
Typography,
MenuItem,
useTheme,
ClickAwayListener,
} from "@mui/material";
import { SxProps } from "@mui/material";
interface Props {
sxProps: SxProps;
setSelectedJob: Dispatch<React.SetStateAction<Job | null>>;
sxProps?: SxProps;
setSelectedJob:
| Dispatch<React.SetStateAction<Job | null>>
| ((job: Job) => void);
selectedJob: Job | null;
setJobs: Dispatch<React.SetStateAction<Job[]>>;
jobs: Job[];
@@ -43,6 +49,12 @@ export const JobSelector = ({
const open = Boolean(anchorEl);
useEffect(() => {
if (!open) {
setAnchorEl(null);
}
}, [open]);
return (
<Box sx={sxProps}>
<FormControl fullWidth>
@@ -55,9 +67,11 @@ export const JobSelector = ({
value={selectedJob?.id || ""}
label="Job"
onChange={(e) => {
setSelectedJob(
jobs.find((job) => job.id === e.target.value) || null
);
const job = jobs.find((job) => job.id === e.target.value);
if (job) {
setSelectedJob(job);
}
}}
>
{jobs.map((job) => (
@@ -77,57 +91,63 @@ export const JobSelector = ({
</>
) : null}
</FormControl>
<Popover
id="mouse-over-popover"
sx={{
pointerEvents: "none",
padding: 0,
}}
open={open}
anchorEl={anchorEl}
anchorOrigin={{
vertical: "bottom",
horizontal: "left",
}}
transformOrigin={{
vertical: "top",
horizontal: "left",
}}
onClose={handlePopoverClose}
>
{popoverJob && (
<Box
{open && (
<ClickAwayListener onClickAway={handlePopoverClose}>
<Popover
id="mouse-over-popover"
sx={{
border:
theme.palette.mode === "light"
? "2px solid black"
: "2px solid white",
pointerEvents: "none",
padding: 0,
}}
open={open}
anchorEl={anchorEl}
anchorOrigin={{
vertical: "bottom",
horizontal: "left",
}}
transformOrigin={{
vertical: "top",
horizontal: "left",
}}
onClose={handlePopoverClose}
>
<Typography
variant="body1"
sx={{ paddingLeft: 1, paddingRight: 1 }}
>
{popoverJob.url}
</Typography>
<div className="flex flex-row w-full justify-end mb-1">
<Typography
variant="body2"
{popoverJob && (
<Box
sx={{
paddingLeft: 1,
paddingRight: 1,
color: theme.palette.mode === "dark" ? "#d3d7e6" : "#5b5d63",
fontStyle: "italic",
border:
theme.palette.mode === "light"
? "2px solid black"
: "2px solid white",
}}
>
{popoverJob.time_created
? new Date(popoverJob.time_created).toLocaleString()
: "Unknown"}
</Typography>
</div>
</Box>
)}
</Popover>
<Typography
variant="body1"
sx={{ paddingLeft: 1, paddingRight: 1 }}
>
{popoverJob.url}
</Typography>
<div className="flex flex-row w-full justify-end mb-1">
<Typography
variant="body2"
sx={{
paddingLeft: 1,
paddingRight: 1,
color:
theme.palette.mode === "dark" ? "#d3d7e6" : "#5b5d63",
fontStyle: "italic",
}}
>
{popoverJob.time_created
? new Date(popoverJob.time_created).toLocaleString()
: "Unknown"}
</Typography>
</div>
</Box>
)}
</Popover>
</ClickAwayListener>
)}
</Box>
);
};

View File

@@ -0,0 +1,45 @@
import { Box, Link, Typography } from "@mui/material";
import { SetStateAction, Dispatch, useState } from "react";
import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog";
import { RawJobOptions } from "@/types";
export type AdvancedJobOptionsProps = {
jobOptions: RawJobOptions;
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
};
/**
 * Link-styled "Advanced Job Options" trigger together with the dialog it
 * opens. The open/closed flag is local state; the job options and their
 * setter are passed straight through to the dialog.
 */
export const AdvancedJobOptions = ({
  jobOptions,
  setJobOptions,
}: AdvancedJobOptionsProps) => {
  const [open, setOpen] = useState(false);

  const showDialog = () => setOpen(true);
  const hideDialog = () => setOpen(false);

  return (
    <Box sx={{ mb: 2 }}>
      <Link
        component="button"
        variant="body2"
        onClick={showDialog}
        sx={{
          textDecoration: "none",
          color: "primary.main",
          "&:hover": {
            color: "primary.dark",
            textDecoration: "underline",
          },
          paddingLeft: 1,
          display: "inline-flex",
          alignItems: "center",
          gap: 0.5,
        }}
      >
        <Typography variant="body2">Advanced Job Options</Typography>
      </Link>

      <AdvancedJobOptionsDialog
        open={open}
        onClose={hideDialog}
        jobOptions={jobOptions}
        setJobOptions={setJobOptions}
      />
    </Box>
  );
};

View File

@@ -0,0 +1,270 @@
import {
Accordion,
AccordionDetails,
AccordionSummary,
Box,
Checkbox,
Dialog,
DialogContent,
DialogTitle,
Divider,
FormControl,
FormControlLabel,
FormGroup,
IconButton,
TextField,
Tooltip,
Typography,
useTheme,
} from "@mui/material";
import {
ExpandMore as ExpandMoreIcon,
InfoOutlined,
Code as CodeIcon,
Settings,
} from "@mui/icons-material";
import { Dispatch, SetStateAction } from "react";
import { RawJobOptions } from "@/types";
import { ExpandedTableInput } from "../../expanded-table-input";
export type AdvancedJobOptionsDialogProps = {
open: boolean;
onClose: () => void;
jobOptions: RawJobOptions;
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
};
/**
 * Dialog for editing the optional parts of a scrape job: the multi-page and
 * collect-media flags, the proxies string, and the custom headers/cookies
 * (the latter two via ExpandedTableInput). Every edit is written into the
 * parent's job options through `setJobOptions`.
 */
export const AdvancedJobOptionsDialog = ({
  open,
  onClose,
  jobOptions,
  setJobOptions,
}: AdvancedJobOptionsDialogProps) => {
  const theme = useTheme();
  const dividerBorder = `1px solid ${theme.palette.divider}`;

  // Each checkbox handler flips its flag based on the previous state.
  const handleMultiPageScrapeChange = () =>
    setJobOptions((current) => ({
      ...current,
      multi_page_scrape: !current.multi_page_scrape,
    }));

  const handleCollectMediaChange = () =>
    setJobOptions((current) => ({
      ...current,
      collect_media: !current.collect_media,
    }));

  const handleProxiesChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const proxies = e.target.value;
    setJobOptions((current) => ({ ...current, proxies }));
  };

  // Bold section title followed by a divider.
  const renderSectionHeading = (title: string) => (
    <>
      <Typography
        variant="subtitle1"
        sx={{
          mb: 1,
          fontWeight: "bold",
          color: theme.palette.text.primary,
        }}
      >
        {title}
      </Typography>
      <Divider sx={{ mb: 2, backgroundColor: theme.palette.divider }} />
    </>
  );

  // Checkbox label text with an info icon that shows `hint` in a tooltip.
  const renderCheckboxLabel = (text: string, hint: string) => (
    <Box sx={{ display: "flex", alignItems: "center" }}>
      <Typography>{text}</Typography>
      <Tooltip title={hint}>
        <IconButton size="small">
          <InfoOutlined fontSize="small" />
        </IconButton>
      </Tooltip>
    </Box>
  );

  return (
    <Dialog
      open={open}
      onClose={onClose}
      maxWidth="md"
      fullWidth
      PaperProps={{
        sx: {
          borderRadius: 2,
          boxShadow: "0 8px 32px rgba(0, 0, 0, 0.1)",
        },
      }}
    >
      <DialogTitle
        sx={{
          borderBottom: dividerBorder,
          backgroundColor: theme.palette.background.default,
          color: theme.palette.primary.contrastText,
          borderRadius: 2,
          display: "flex",
          alignItems: "center",
          justifyContent: "space-between",
          padding: "1rem 2rem",
          marginRight: 2,
          marginLeft: 2,
        }}
      >
        <Typography variant="h6" component="div">
          Advanced Job Options
        </Typography>
        <Settings
          sx={{
            color: theme.palette.primary.contrastText,
          }}
        />
      </DialogTitle>

      <DialogContent
        sx={{ padding: 3, overflowY: "auto", marginTop: 2, height: "60rem" }}
      >
        <FormControl fullWidth>
          <Box sx={{ mb: 3 }}>
            {renderSectionHeading("Collection Options")}

            <FormGroup row sx={{ gap: 4, mb: 1 }}>
              <FormControlLabel
                control={
                  <Checkbox
                    checked={jobOptions.multi_page_scrape}
                    onChange={handleMultiPageScrapeChange}
                  />
                }
                label={renderCheckboxLabel(
                  "Multi Page Scrape",
                  "Enable crawling through multiple pages"
                )}
              />

              <FormControlLabel
                control={
                  <Checkbox
                    checked={jobOptions.collect_media}
                    onChange={handleCollectMediaChange}
                    data-cy="collect-media-checkbox"
                  />
                }
                label={renderCheckboxLabel(
                  "Collect Media",
                  "Download images and other media"
                )}
              />
            </FormGroup>
          </Box>

          <Box sx={{ mb: 3 }}>
            {renderSectionHeading("Custom Options")}

            {/* Proxies: free-text field inside an accordion */}
            <Accordion
              defaultExpanded
              elevation={0}
              sx={{
                mb: 2,
                border: dividerBorder,
                "&:before": { display: "none" },
                borderRadius: 1,
                overflow: "hidden",
                padding: 1,
              }}
            >
              <AccordionSummary
                expandIcon={<ExpandMoreIcon />}
                sx={{
                  backgroundColor: theme.palette.background.paper,
                  borderBottom: dividerBorder,
                  "&.Mui-expanded": {
                    borderBottom: dividerBorder,
                  },
                }}
              >
                <Box sx={{ display: "flex", alignItems: "center" }}>
                  <div
                    style={{
                      display: "flex",
                      alignItems: "center",
                      gap: "0.5rem",
                    }}
                  >
                    <Typography
                      sx={{
                        fontWeight: 500,
                        color: theme.palette.text.primary,
                      }}
                    >
                      Proxies
                    </Typography>
                    <Tooltip title="Comma separated list of proxies that should follow Playwright proxy format">
                      <InfoOutlined fontSize="small" />
                    </Tooltip>
                  </div>
                </Box>
              </AccordionSummary>

              <AccordionDetails
                sx={{ p: 2, backgroundColor: theme.palette.background.default }}
              >
                <TextField
                  placeholder='Proxies ([{"server": "proxy.example.com:8080", "username": "username", "password": "password"}])'
                  fullWidth
                  variant="outlined"
                  size="small"
                  value={jobOptions.proxies}
                  onChange={handleProxiesChange}
                  InputProps={{
                    startAdornment: (
                      <CodeIcon
                        sx={{ color: theme.palette.text.secondary, mr: 1 }}
                      />
                    ),
                  }}
                />
              </AccordionDetails>
            </Accordion>

            {/* Custom headers: JSON input with a parsed preview table */}
            <ExpandedTableInput
              label="Custom Headers"
              placeholder='{"User-Agent": "CustomAgent", "Accept": "*/*"}'
              urlParam="custom_headers"
              onChange={(value) => {
                setJobOptions((current) => ({
                  ...current,
                  custom_headers: value,
                }));
              }}
            />

            {/* Custom cookies: JSON input with a parsed preview table */}
            <ExpandedTableInput
              label="Custom Cookies"
              placeholder='[{"name": "value", "name2": "value2"}]'
              urlParam="custom_cookies"
              onChange={(value) => {
                setJobOptions((current) => ({
                  ...current,
                  custom_cookies: value,
                }));
              }}
            />
          </Box>
        </FormControl>
      </DialogContent>
    </Dialog>
  );
};

View File

@@ -0,0 +1 @@
export * from "./advanced-job-options-dialog";

View File

@@ -0,0 +1 @@
export * from "./advanced-job-options";

View File

@@ -0,0 +1,166 @@
import React, { useState } from "react";
import {
Table,
TableBody,
TableCell,
TableContainer,
TableHead,
TableRow,
Paper,
Box,
Typography,
useTheme,
alpha,
} from "@mui/material";
export type CsvRow = {
[key: string]: string;
};
export type CsvTableProps = {
csv: {
rows: CsvRow[];
headers: string[];
};
className?: string;
};
/**
 * Renders parsed CSV data as a sticky-header table. Clicking a row toggles an
 * extra row beneath it that shows the row's `text` field with whitespace
 * collapsed. When there are no rows, a dashed "No data available" panel is
 * shown instead.
 */
export const CsvTable: React.FC<CsvTableProps> = ({ csv, className }) => {
  const [expandedRow, setExpandedRow] = useState<number | null>(null);
  const theme = useTheme();
  const { rows, headers } = csv;

  // Clicking the already-expanded row collapses it again.
  const handleRowClick = (rowIndex: number) => {
    setExpandedRow((current) => (current === rowIndex ? null : rowIndex));
  };

  // Turn newlines/tabs and runs of whitespace into single spaces, then trim.
  const flattenWhitespace = (text: string) =>
    text
      .replace(/[\n\t\r]+/g, " ")
      .replace(/\s+/g, " ")
      .trim();

  const primaryTint = (opacity: number) =>
    alpha(theme.palette.primary.main, opacity);

  const content =
    rows.length > 0 ? (
      <TableContainer
        sx={{
          flex: 1,
          overflow: "auto",
          borderRadius: theme.shape.borderRadius,
          boxShadow: theme.shadows[1],
        }}
      >
        <Table stickyHeader size="small" aria-label="csv data table">
          <TableHead>
            <TableRow>
              {headers.map((header, idx) => (
                <TableCell
                  key={idx}
                  sx={{
                    fontWeight: "bold",
                    cursor: "pointer",
                    whiteSpace: "nowrap",
                    backgroundColor: theme.palette.background.paper,
                    color: theme.palette.text.primary,
                    "&:hover": {
                      backgroundColor: primaryTint(0.1),
                    },
                    p: { xs: 1, sm: 2 },
                  }}
                >
                  {header}
                </TableCell>
              ))}
            </TableRow>
          </TableHead>
          <TableBody>
            {rows.map((row, rowIndex) => (
              <React.Fragment key={rowIndex}>
                <TableRow
                  onClick={() => handleRowClick(rowIndex)}
                  sx={{
                    "&:nth-of-type(odd)": {
                      backgroundColor: primaryTint(0.02),
                    },
                    "&:hover": {
                      backgroundColor: primaryTint(0.04),
                    },
                    cursor: "pointer",
                  }}
                >
                  {/* Cells follow the row object's own value order. */}
                  {Object.values(row).map((col, colIndex) => (
                    <TableCell
                      key={colIndex}
                      sx={{
                        whiteSpace: "nowrap",
                        maxWidth: { xs: "150px", sm: "200px", md: "200px" },
                        overflow: "hidden",
                        textOverflow: "ellipsis",
                        p: { xs: 1, sm: 2 },
                      }}
                    >
                      {col}
                    </TableCell>
                  ))}
                </TableRow>

                {expandedRow === rowIndex && (
                  <TableRow>
                    <TableCell colSpan={headers.length} sx={{ padding: 2 }}>
                      <Paper
                        sx={{
                          padding: 2,
                          backgroundColor: alpha(
                            theme.palette.background.paper,
                            0.5
                          ),
                        }}
                      >
                        <Typography variant="body2" color="text.secondary">
                          {row.text
                            ? flattenWhitespace(row.text)
                            : "No text available"}
                        </Typography>
                      </Paper>
                    </TableCell>
                  </TableRow>
                )}
              </React.Fragment>
            ))}
          </TableBody>
        </Table>
      </TableContainer>
    ) : (
      <Paper
        sx={{
          p: 4,
          display: "flex",
          justifyContent: "center",
          alignItems: "center",
          height: "100%",
          borderRadius: theme.shape.borderRadius,
          backgroundColor: alpha(theme.palette.background.paper, 0.5),
          border: `1px dashed ${theme.palette.divider}`,
        }}
      >
        <Typography color="text.secondary">No data available</Typography>
      </Paper>
    );

  return (
    <Box
      sx={{
        height: "100%",
        display: "flex",
        flexDirection: "column",
        overflow: "hidden",
        width: "100%",
      }}
      className={className}
    >
      {content}
    </Box>
  );
};

View File

@@ -0,0 +1 @@
export * from "./csv-table";

View File

@@ -0,0 +1,204 @@
import {
Accordion,
AccordionSummary,
TableCell,
TableRow,
Paper,
TableBody,
useTheme,
TextField,
Box,
Typography,
AccordionDetails,
TableHead,
TableContainer,
Table,
} from "@mui/material";
import { useEffect, useState } from "react";
import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
import { parseJsonToEntries } from "@/lib/helpers/parse-json-to-entries";
export type ExpandedTableInputProps = {
label: string;
onChange: (value: any) => void;
placeholder: string;
urlParam: string;
};
/**
 * Accordion holding a JSON text field plus a preview table of the parsed
 * key/value entries.
 *
 * - `label`: accordion title.
 * - `placeholder`: placeholder text for the field.
 * - `urlParam`: key looked up inside the `job_options` URL query parameter to
 *   pre-fill the field on mount.
 * - `onChange`: receives the parsed JSON value, or `null` when the input is
 *   empty or invalid.
 */
export const ExpandedTableInput = ({
  label,
  onChange,
  placeholder,
  urlParam,
}: ExpandedTableInputProps) => {
  const theme = useTheme();
  const [value, setValue] = useState("");
  const [parsedHeaders, setParsedHeaders] = useState<[string, string][] | null>(
    null
  );
  const [jsonError, setJsonError] = useState<string | null>(null);

  // Clear both the preview table and the error message.
  const resetParsedState = () => {
    setParsedHeaders(null);
    setJsonError(null);
  };

  // Parse `val`, update the preview/error state, and return the parsed JSON
  // value (null for empty or invalid input).
  const validateAndParse = (val: string) => {
    if (val.trim() === "") {
      resetParsedState();
      return null;
    }

    try {
      const parsed = JSON.parse(val);
      const entries = parseJsonToEntries(val);

      if (entries === null) {
        setParsedHeaders(null);
        setJsonError("Invalid JSON object");
        return null;
      }

      setParsedHeaders(entries);
      setJsonError(null);
      return parsed;
    } catch {
      setParsedHeaders(null);
      setJsonError("Invalid JSON format");
      return null;
    }
  };

  const handleChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const val = e.target.value;
    setValue(val);
    const parsed = validateAndParse(val);
    onChange(parsed);
  };

  // Pre-fill the field from the `job_options` query parameter.
  useEffect(() => {
    // Read the query string inside the effect so `window` is only touched in
    // the browser, never while rendering.
    const urlParams = new URLSearchParams(window.location.search);
    const jobOptions = urlParams.get("job_options");

    if (!jobOptions) {
      resetParsedState();
      return;
    }

    // A malformed `job_options` parameter is treated like a missing one
    // instead of throwing out of the effect.
    let jobOptionsObject: any;
    try {
      jobOptionsObject = JSON.parse(jobOptions);
    } catch {
      resetParsedState();
      return;
    }

    let val = jobOptionsObject?.[urlParam];

    // The key may be absent from job_options; check for null/undefined before
    // reading `.length` or calling Object.keys on it.
    if (val == null || val.length === 0 || Object.keys(val).length === 0) {
      resetParsedState();
      return;
    }

    if (typeof val === "string") {
      try {
        val = JSON.parse(val);
      } catch {
        // Not JSON: keep the raw string and let validateAndParse report it.
      }
    }

    const finalVal =
      typeof val === "string" ? val : val != null ? JSON.stringify(val) : "";

    setValue(finalVal);
    const parsed = validateAndParse(finalVal);
    onChange(parsed);
  }, [urlParam]);

  return (
    <Accordion
      defaultExpanded
      elevation={0}
      sx={{
        mb: 2,
        border: `1px solid ${theme.palette.divider}`,
        "&:before": { display: "none" },
        borderRadius: 1,
        overflow: "hidden",
        padding: 1,
      }}
    >
      <AccordionSummary
        expandIcon={<ExpandMoreIcon />}
        sx={{
          backgroundColor: theme.palette.background.paper,
          borderBottom: `1px solid ${theme.palette.divider}`,
          "&.Mui-expanded": {
            borderBottom: `1px solid ${theme.palette.divider}`,
          },
        }}
      >
        <Box sx={{ display: "flex", alignItems: "center" }}>
          <Typography
            sx={{ fontWeight: 500, color: theme.palette.text.primary }}
          >
            {label}
          </Typography>
        </Box>
      </AccordionSummary>
      <AccordionDetails
        sx={{ p: 2, backgroundColor: theme.palette.background.default }}
      >
        <TextField
          placeholder={placeholder}
          value={value}
          onChange={handleChange}
          fullWidth
          variant="outlined"
          size="small"
          error={jsonError !== null}
          helperText={jsonError ?? ""}
        />

        {/* Preview table, shown only when at least one entry was parsed. */}
        {parsedHeaders && parsedHeaders.length > 0 && (
          <Paper
            variant="outlined"
            sx={{
              marginTop: 1,
              border: `1px solid ${theme.palette.divider}`,
              borderRadius: 1,
              overflow: "hidden",
              padding: 0,
            }}
          >
            <TableContainer sx={{ maxHeight: 200 }}>
              <Table size="small" stickyHeader>
                <TableHead>
                  <TableRow
                    sx={{
                      backgroundColor: theme.palette.background.paper,
                    }}
                  >
                    <TableCell sx={{ fontWeight: "bold" }}>Header</TableCell>
                    <TableCell sx={{ fontWeight: "bold" }}>Value</TableCell>
                  </TableRow>
                </TableHead>
                <TableBody>
                  {parsedHeaders.map(([key, val]) => (
                    <TableRow
                      key={key}
                      hover
                      sx={{
                        "&:nth-of-type(odd)": {
                          backgroundColor:
                            theme.palette.mode === "light"
                              ? "rgba(0, 0, 0, 0.02)"
                              : "rgba(255, 255, 255, 0.02)",
                        },
                      }}
                    >
                      <TableCell sx={{ fontWeight: 500 }}>{key}</TableCell>
                      <TableCell>{val}</TableCell>
                    </TableRow>
                  ))}
                </TableBody>
              </Table>
            </TableContainer>
          </Paper>
        )}
      </AccordionDetails>
    </Accordion>
  );
};

View File

@@ -0,0 +1 @@
export * from "./expanded-table-input";

View File

@@ -0,0 +1 @@
export * from "./job-download-dialog";

View File

@@ -0,0 +1,95 @@
import {
Dialog,
DialogTitle,
DialogContent,
DialogActions,
Button,
FormControl,
RadioGroup,
FormControlLabel,
Radio,
FormLabel,
Typography,
Box,
} from "@mui/material";
import { useState } from "react";
export type JobDownloadDialogProps = {
open: boolean;
onClose: () => void;
ids: string[];
};
/**
 * Dialog that lets the user pick a format (CSV or Markdown) and download the
 * jobs identified by `ids`.
 *
 * - `open` / `onClose`: passed to the underlying MUI Dialog.
 * - `ids`: job ids sent to the download endpoint; the first id is used in the
 *   downloaded file's name.
 */
export const JobDownloadDialog = ({
  open,
  onClose,
  ids,
}: JobDownloadDialogProps) => {
  const [jobFormat, setJobFormat] = useState<string>("csv");

  // POST the selection to /api/download and hand the response body to the
  // browser as a file download through a temporary hidden <a> element.
  const handleDownload = async () => {
    try {
      const response = await fetch("/api/download", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ data: { ids: ids, job_format: jobFormat } }),
      });

      if (!response.ok) {
        console.error("Failed to download the file.");
        return;
      }

      const blob = await response.blob();
      const url = window.URL.createObjectURL(blob);
      const a = document.createElement("a");
      a.style.display = "none";
      a.href = url;
      a.download = `job_${ids[0]}.${jobFormat}`;
      document.body.appendChild(a);
      a.click();
      window.URL.revokeObjectURL(url);
      document.body.removeChild(a);
    } catch (error) {
      // A rejected fetch (e.g. network failure) is logged rather than left as
      // an unhandled promise rejection from the click handler.
      console.error("Failed to download the file.", error);
    }
  };

  return (
    <Dialog open={open} onClose={onClose}>
      <DialogTitle>Download Job</DialogTitle>
      <DialogContent>
        <FormControl>
          <Typography variant="body1">
            You are about to download {ids.length} job(s). Please select the
            format that you would like to download them in.
          </Typography>
          <br />
          <Box
            sx={{
              display: "flex",
              flexDirection: "column",
              backgroundColor: "background.paper",
              padding: 2,
              border: "1px solid",
            }}
          >
            {/* The id gives the RadioGroup's aria-labelledby a real target. */}
            <FormLabel id="job-download-format-radio-buttons">Format</FormLabel>
            <hr style={{ width: "100%", margin: "10px 0" }} />
            <RadioGroup
              aria-labelledby="job-download-format-radio-buttons"
              name="job-download-format-radio-buttons"
              value={jobFormat}
              onChange={(e) => setJobFormat(e.target.value)}
            >
              <FormControlLabel value="csv" control={<Radio />} label="CSV" />
              <FormControlLabel
                value="md"
                control={<Radio />}
                label="Markdown"
              />
            </RadioGroup>
          </Box>
          <br />
          <Button onClick={handleDownload} size="small">
            Download
          </Button>
        </FormControl>
      </DialogContent>
    </Dialog>
  );
};

View File

@@ -0,0 +1,40 @@
import { Box, Typography } from "@mui/material";
interface AudioViewerProps {
mediaUrl: string;
selectedMedia: string;
onError: () => void;
}
/**
 * Centered audio player for one media file, with the file name as a heading.
 *
 * - `mediaUrl`: URL given to the player's <source>.
 * - `selectedMedia`: text shown above the player.
 * - `onError`: called when the player or its source reports an error.
 */
export const AudioViewer = ({
  mediaUrl,
  selectedMedia,
  onError,
}: AudioViewerProps) => {
  return (
    <Box
      sx={{
        display: "flex",
        justifyContent: "center",
        alignItems: "center",
        flexDirection: "column",
        height: "100%",
        gap: 2,
      }}
    >
      <Typography variant="h6">{selectedMedia}</Typography>
      {/*
        Keyed by URL: changing a <source> inside an existing media element
        does not reload it, so a new URL must mount a fresh <audio>.
      */}
      <audio
        key={mediaUrl}
        controls
        onError={onError}
        style={{
          width: "80%",
          maxWidth: "500px",
        }}
      >
        {/*
          With <source> children, load failures fire `error` on the <source>
          element and do not bubble, so the handler is attached here as well.
        */}
        <source src={mediaUrl} type="audio/mpeg" onError={onError} />
        Your browser does not support the audio element.
      </audio>
    </Box>
  );
};

View File

@@ -0,0 +1 @@
export * from "./audio-viewer";

View File

@@ -0,0 +1,36 @@
import { Box, useTheme } from "@mui/material";
/**
 * Shows a single image centered in the available space, scaled down to fit
 * without cropping. `selectedMedia` is used as the image's alt text.
 */
export const ImageViewer = ({
  mediaUrl,
  selectedMedia,
}: {
  mediaUrl: string;
  selectedMedia: string;
}) => {
  const { shadows } = useTheme();

  const image = (
    <img
      src={mediaUrl}
      alt={selectedMedia}
      style={{
        maxHeight: "100%",
        maxWidth: "100%",
        objectFit: "contain",
        borderRadius: "4px",
        boxShadow: shadows[4],
      }}
    />
  );

  return (
    <Box
      sx={{
        display: "flex",
        justifyContent: "center",
        alignItems: "center",
        height: "100%",
        width: "100%",
        overflow: "hidden",
        position: "relative",
      }}
    >
      {image}
    </Box>
  );
};

View File

@@ -0,0 +1 @@
export * from "./image-viewer";

View File

@@ -0,0 +1 @@
export * from "./media-viewer";

View File

@@ -0,0 +1,75 @@
import { Box, Typography } from "@mui/material";
import { ImageViewer } from "./image";
import { VideoViewer } from "./video";
import { AudioViewer } from "./audio";
import { PDFViewer } from "./pdf-viewer";
interface MediaViewerProps {
selectedMedia: string;
activeTab: string;
getMediaUrl: (fileName: string) => string;
onError: (error: string) => void;
}
/**
 * Chooses the viewer for the selected file based on the active tab:
 * images, videos, audio and pdfs each have a dedicated viewer, and any other
 * tab shows a "download to view" message. With no file selected, a prompt is
 * shown and `getMediaUrl` is not called.
 */
export const MediaViewer = ({
  selectedMedia,
  activeTab,
  getMediaUrl,
  onError,
}: MediaViewerProps) => {
  // Shared layout for the two message-only states.
  const centeredSx = {
    display: "flex",
    justifyContent: "center",
    alignItems: "center",
    height: "100%",
  };

  if (!selectedMedia) {
    return (
      <Box sx={centeredSx}>
        <Typography variant="body1" color="textSecondary">
          Select a file to view
        </Typography>
      </Box>
    );
  }

  const mediaUrl = getMediaUrl(selectedMedia);

  if (activeTab === "images") {
    return <ImageViewer mediaUrl={mediaUrl} selectedMedia={selectedMedia} />;
  }

  if (activeTab === "videos") {
    return (
      <VideoViewer
        mediaUrl={mediaUrl}
        onError={() => onError("Error loading video")}
      />
    );
  }

  if (activeTab === "audio") {
    return (
      <AudioViewer
        mediaUrl={mediaUrl}
        selectedMedia={selectedMedia}
        onError={() => onError("Error loading audio")}
      />
    );
  }

  if (activeTab === "pdfs") {
    return <PDFViewer mediaUrl={mediaUrl} selectedMedia={selectedMedia} />;
  }

  // Any other tab has no inline viewer.
  return (
    <Box sx={centeredSx}>
      <Typography variant="body1">
        {selectedMedia} - Download this file to view it
      </Typography>
    </Box>
  );
};

View File

@@ -0,0 +1 @@
export * from "./pdf-viewer";

View File

@@ -0,0 +1,33 @@
import { Box, useTheme } from "@mui/material";
interface PDFViewerProps {
mediaUrl: string;
selectedMedia: string;
}
/**
 * Embeds a PDF in a full-size iframe. `#view=fitH` is appended to the URL and
 * `selectedMedia` is used as the iframe title.
 */
export const PDFViewer = ({ mediaUrl, selectedMedia }: PDFViewerProps) => {
  const { shadows } = useTheme();
  const frameSrc = `${mediaUrl}#view=fitH`;

  return (
    <Box
      sx={{
        width: "100%",
        height: "100%",
        overflow: "hidden",
        borderRadius: 1,
      }}
    >
      <iframe
        src={frameSrc}
        title={selectedMedia}
        style={{
          width: "100%",
          height: "100%",
          border: "none",
          borderRadius: "4px",
          boxShadow: shadows[4],
        }}
      />
    </Box>
  );
};

View File

@@ -0,0 +1 @@
export * from "./tile-grid-view";

View File

@@ -0,0 +1,114 @@
import { MediaFiles } from "@/components/pages/media/id/id";
import {
Card,
CardActionArea,
CardMedia,
CardContent,
Typography,
Box,
Grid,
useTheme,
} from "@mui/material";
interface TileGridViewProps {
mediaFiles: MediaFiles;
activeTab: string;
selectedMedia: string;
handleMediaSelect: (fileName: string) => void;
getMediaUrl: (fileName: string) => string;
}
/**
 * Grid of clickable tiles, one per file in the active tab.
 *
 * - `mediaFiles`: file names grouped by tab key.
 * - `activeTab`: key whose files are listed; "images" tiles render a
 *   thumbnail, other tabs show the upper-cased file extension.
 * - `selectedMedia`: file name whose tile gets the highlighted border.
 * - `handleMediaSelect`: called with the file name when a tile is clicked.
 * - `getMediaUrl`: maps a file name to the URL used for thumbnails.
 */
export const TileGridView = ({
  mediaFiles,
  activeTab,
  selectedMedia,
  handleMediaSelect,
  getMediaUrl,
}: TileGridViewProps) => {
  const theme = useTheme();

  // A tab with no entry in mediaFiles renders an empty grid instead of
  // throwing on `undefined.map`.
  const files: string[] = mediaFiles[activeTab] ?? [];

  return (
    <Grid container spacing={2} sx={{ p: 2 }} data-testid="media-grid">
      {files.map((fileName: string) => (
        <Grid item xs={6} sm={4} md={3} lg={2} key={fileName}>
          <Card
            sx={{
              height: "100%",
              display: "flex",
              flexDirection: "column",
              borderColor:
                selectedMedia === fileName
                  ? theme.palette.primary.main
                  : "transparent",
              borderWidth: 2,
              borderStyle: "solid",
              transition: "all 0.2s",
              "&:hover": {
                transform: "translateY(-4px)",
                boxShadow: theme.shadows[6],
              },
            }}
          >
            <CardActionArea onClick={() => handleMediaSelect(fileName)}>
              {/* pt: "75%" reserves a 4:3 area; content is positioned inside. */}
              <CardMedia
                component="div"
                sx={{
                  pt: "75%",
                  position: "relative",
                  backgroundColor:
                    theme.palette.mode === "light"
                      ? theme.palette.grey[100]
                      : theme.palette.grey[800],
                  display: "flex",
                  justifyContent: "center",
                  alignItems: "center",
                }}
              >
                {activeTab === "images" ? (
                  <Box
                    component="img"
                    src={getMediaUrl(fileName)}
                    alt={fileName}
                    sx={{
                      position: "absolute",
                      top: 0,
                      left: 0,
                      width: "100%",
                      height: "100%",
                      objectFit: "contain",
                      p: 1,
                    }}
                    onError={(e) => {
                      // NOTE(review): `src` reads back as an absolute URL, so
                      // this comparison with a relative path looks like it is
                      // always true; confirm whether a placeholder image was
                      // meant to be assigned here instead of "".
                      const target = e.target as HTMLImageElement;
                      if (target.src !== "/placeholder-image.png") {
                        target.src = "";
                      }
                    }}
                  />
                ) : (
                  <Typography
                    variant="body2"
                    color="textSecondary"
                    sx={{
                      position: "absolute",
                      top: "50%",
                      left: "50%",
                      transform: "translate(-50%, -50%)",
                    }}
                  >
                    {fileName.split(".").pop()?.toUpperCase() || "FILE"}
                  </Typography>
                )}
              </CardMedia>
              <CardContent sx={{ flexGrow: 1, p: 1 }}>
                <Typography variant="body2" noWrap title={fileName}>
                  {fileName}
                </Typography>
              </CardContent>
            </CardActionArea>
          </Card>
        </Grid>
      ))}
    </Grid>
  );
};

View File

@@ -0,0 +1 @@
export * from "./video-viewer";

View File

@@ -0,0 +1,39 @@
import { Box, useTheme } from "@mui/material";
/**
 * Centered video player for one media file.
 *
 * - `mediaUrl`: URL given to the player's <source>.
 * - `onError`: called when the player or its source reports an error.
 */
export const VideoViewer = ({
  mediaUrl,
  onError,
}: {
  mediaUrl: string;
  onError: () => void;
}) => {
  const theme = useTheme();

  return (
    <Box
      sx={{
        width: "100%",
        height: "100%",
        display: "flex",
        justifyContent: "center",
        alignItems: "center",
        overflow: "hidden",
        borderRadius: 1,
      }}
    >
      {/*
        Keyed by URL: changing a <source> inside an existing media element
        does not reload it, so a new URL must mount a fresh <video>.
      */}
      <video
        key={mediaUrl}
        className="h-full w-full object-contain"
        controls
        onError={onError}
        style={{
          maxHeight: "100%",
          maxWidth: "100%",
          borderRadius: "4px",
          boxShadow: theme.shadows[4],
        }}
      >
        {/*
          With <source> children, load failures fire `error` on the <source>
          element and do not bubble, so the handler is attached here as well.
        */}
        <source src={mediaUrl} type="video/mp4" onError={onError} />
        Your browser does not support the video tag.
      </video>
    </Box>
  );
};

View File

@@ -7,7 +7,7 @@ import TerminalIcon from "@mui/icons-material/Terminal";
import BarChart from "@mui/icons-material/BarChart";
import AutoAwesomeIcon from "@mui/icons-material/AutoAwesome";
import { List } from "@mui/material";
import { Schedule } from "@mui/icons-material";
import { Folder, Schedule, VideoFile } from "@mui/icons-material";
const items = [
{
@@ -35,6 +35,16 @@ const items = [
text: "Cron Jobs",
href: "/cron-jobs",
},
{
icon: <VideoFile />,
text: "Recordings",
href: "/recordings",
},
{
icon: <Folder />,
text: "Media",
href: "/media",
},
];
export const NavItems = () => {

View File

@@ -7,20 +7,15 @@ import {
TableHead,
TableRow,
Box,
Typography,
Accordion,
AccordionSummary,
AccordionDetails,
Checkbox,
Button,
Tooltip,
IconButton,
TableContainer,
} from "@mui/material";
import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
import StarIcon from "@mui/icons-material/Star";
import { Job } from "../../types";
import { AutoAwesome } from "@mui/icons-material";
import { AutoAwesome, Image, VideoCameraBack } from "@mui/icons-material";
import { useRouter } from "next/router";
interface stringMap {
@@ -38,6 +33,7 @@ interface Props {
onDownload: (job: string[]) => void;
onNavigate: (elements: Object[], url: string, options: any) => void;
onFavorite: (ids: string[], field: string, value: any) => void;
onJobClick: (job: Job) => void;
stateProps: stateProps;
}
@@ -48,6 +44,7 @@ export const JobQueue = ({
onDownload,
onNavigate,
onFavorite,
onJobClick,
}: Props) => {
const { selectedJobs, filteredJobs } = stateProps;
const router = useRouter();
@@ -57,7 +54,7 @@ export const JobQueue = ({
<Table sx={{ tableLayout: "fixed", width: "100%" }}>
<TableHead>
<TableRow>
<TableCell>Select</TableCell>
<TableCell sx={{ width: "280px" }}>Select</TableCell>
<TableCell>Id</TableCell>
<TableCell>Url</TableCell>
<TableCell>Elements</TableCell>
@@ -70,7 +67,7 @@ export const JobQueue = ({
<TableBody sx={{ overflow: "auto" }}>
{filteredJobs.map((row, index) => (
<TableRow key={index}>
<TableCell padding="checkbox">
<TableCell padding="checkbox" sx={{ width: "280px" }}>
<Checkbox
checked={selectedJobs.has(row.id)}
onChange={() => onSelectJob(row.id)}
@@ -104,9 +101,45 @@ export const JobQueue = ({
</IconButton>
</span>
</Tooltip>
<Tooltip title="View Recording">
<span>
<IconButton
onClick={() => {
router.push({
pathname: "/recordings",
query: {
id: row.id,
},
});
}}
>
<VideoCameraBack />
</IconButton>
</span>
</Tooltip>
{row.job_options.collect_media && (
<Tooltip title="View Media">
<span>
<IconButton
onClick={() => {
router.replace(`/media?id=${row.id}`);
}}
>
<Image />
</IconButton>
</span>
</Tooltip>
)}
</TableCell>
<TableCell sx={{ maxWidth: 100, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.id}</Box>
<Box
sx={{
maxHeight: 100,
overflow: "auto",
}}
>
{row.id}
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 200, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.url}</Box>
@@ -117,41 +150,24 @@ export const JobQueue = ({
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto", padding: 0 }}>
<Accordion sx={{ margin: 0, padding: 0.5 }}>
<AccordionSummary
expandIcon={<ExpandMoreIcon />}
aria-controls="panel1a-content"
id="panel1a-header"
<Box
sx={{
maxHeight: 100,
overflow: "auto",
display: "flex",
alignItems: "center",
justifyContent: "center",
}}
>
<Button
sx={{
minHeight: 0,
"&.Mui-expanded": { minHeight: 0 },
fontSize: "0.875rem",
}}
onClick={() => onJobClick(row)}
>
<Box
sx={{
maxHeight: 150,
overflow: "auto",
width: "100%",
}}
>
<Typography sx={{ fontSize: "0.875rem" }}>
Show Result
</Typography>
</Box>
</AccordionSummary>
<AccordionDetails sx={{ padding: 1 }}>
<Box sx={{ maxHeight: 200, overflow: "auto" }}>
<Typography
sx={{
fontSize: "0.875rem",
whiteSpace: "pre-wrap",
}}
>
{JSON.stringify(row.result, null, 2)}
</Typography>
</Box>
</AccordionDetails>
</Accordion>
Show Result
</Button>
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
@@ -159,7 +175,7 @@ export const JobQueue = ({
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 50, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
<Box sx={{ maxWidth: 100, maxHeight: 100, overflow: "auto" }}>
<Box
className="rounded-md p-2 text-center"
sx={{ bgcolor: colors[row.status] }}

View File

@@ -20,6 +20,7 @@ import { Favorites, JobQueue } from ".";
import { Job } from "../../types";
import Cookies from "js-cookie";
import { useSearchParams } from "next/navigation";
import { JobDownloadDialog } from "../common/job-download-dialog";
interface JobTableProps {
jobs: Job[];
@@ -47,31 +48,15 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
const [searchQuery, setSearchQuery] = useState<string>(search || "");
const [searchMode, setSearchMode] = useState<string>(type || "url");
const [favoriteView, setFavoriteView] = useState<boolean>(false);
const [jobDownloadDialogOpen, setJobDownloadDialogOpen] =
useState<boolean>(false);
const token = Cookies.get("token");
const router = useRouter();
const handleDownload = async (ids: string[]) => {
const response = await fetch("/api/download", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ data: { ids: ids } }),
});
if (response.ok) {
const blob = await response.blob();
const url = window.URL.createObjectURL(blob);
const a = document.createElement("a");
a.style.display = "none";
a.href = url;
a.download = `job_${ids[0]}.csv`;
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
} else {
console.error("Failed to download the file.");
}
const handleDownload = (ids: string[]) => {
setSelectedJobs(new Set(ids));
setJobDownloadDialogOpen(true);
};
const handleNavigate = (elements: Object[], url: string, options: any) => {
@@ -156,24 +141,8 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
});
};
const scrollbarStyles = {
"&::-webkit-scrollbar": {
width: "8px",
height: "8px",
},
"&::-webkit-scrollbar-track": {
backgroundColor: "rgba(0,0,0,0.05)",
borderRadius: "8px",
},
"&::-webkit-scrollbar-thumb": {
backgroundColor: "rgba(0,0,0,0.2)",
borderRadius: "8px",
"&:hover": {
backgroundColor: "rgba(0,0,0,0.3)",
},
},
scrollbarWidth: "thin",
scrollbarColor: "rgba(0,0,0,0.2) rgba(0,0,0,0.05)",
const handleJobClick = (job: Job) => {
router.push(`/job/csv/${job.id}`);
};
return (
@@ -190,7 +159,6 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
maxWidth="100%"
bgcolor="background.default"
overflow="auto"
sx={scrollbarStyles}
>
<Box
className="flex flex-row justify-between p-2 w-full"
@@ -275,17 +243,23 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
onNavigate={handleNavigate}
onSelectJob={handleSelectJob}
onFavorite={favoriteJob}
></JobQueue>
onJobClick={handleJobClick}
/>
) : (
<Favorites
stateProps={{ selectedJobs, filteredJobs }}
onNavigate={handleNavigate}
onSelectJob={handleSelectJob}
onFavorite={favoriteJob}
></Favorites>
/>
)}
</Box>
</Box>
<JobDownloadDialog
open={jobDownloadDialogOpen}
onClose={() => setJobDownloadDialogOpen(false)}
ids={Array.from(selectedJobs)}
/>
</Box>
);
};

View File

@@ -11,7 +11,7 @@ import {
import { JobSelector } from "../../ai";
import { Job, Message } from "../../../types";
import { useSearchParams } from "next/navigation";
import { checkAI, fetchJob, fetchJobs, updateJob } from "../../../lib";
import { fetchJob, fetchJobs, updateJob, checkAI } from "../../../lib";
import SendIcon from "@mui/icons-material/Send";
import EditNoteIcon from "@mui/icons-material/EditNote";

View File

@@ -0,0 +1,35 @@
import { GetServerSideProps } from "next";
import { parseCookies } from "nookies";
/**
 * Server-side loader for the job CSV page.
 *
 * Reads the `token` cookie from the incoming request and asks the backend to
 * convert the job identified by the `id` route param to CSV.
 *
 * @returns `props.csv` holding the parsed JSON body on success, or `null` when
 *          the request fails or the backend answers with a non-2xx status.
 */
export const getServerSideProps: GetServerSideProps = async (context) => {
  const { req, params } = context;
  const id = params?.id;

  const cookies = parseCookies({ req });
  const token = cookies.token;

  let csv = null;

  try {
    const csvResponse = await fetch(
      `${process.env.NEXT_PUBLIC_API_URL}/api/job/${id}/convert-to-csv`,
      {
        method: "GET",
        headers: {
          "content-type": "application/json",
          Authorization: `Bearer ${token}`,
        },
      }
    );

    if (csvResponse.ok) {
      csv = await csvResponse.json();
    } else {
      // An error body is not CSV data; never hand it to the page as `csv`.
      console.error(
        `Error fetching job: request failed with status ${csvResponse.status}`
      );
    }
  } catch (error) {
    console.error("Error fetching job:", error);
  }

  return {
    props: {
      csv,
    },
  };
};

View File

@@ -0,0 +1,10 @@
import { CsvRow, CsvTable } from "@/components/common/csv-table/csv-table";
// Payload handed to `CsvTable` as its `csv` prop: a list of rows plus the
// list of header names.
// NOTE(review): presumably `headers` are the column names for `rows` — confirm
// against `CsvTable` / the convert-to-csv endpoint.
export type Csv = {
rows: CsvRow[];
headers: string[];
};
/** Page component that forwards its `csv` prop unchanged to `CsvTable`. */
export const JobCsvId = (props: { csv: Csv }) => (
  <CsvTable csv={props.csv} />
);

View File

@@ -0,0 +1 @@
export * from "./id";

View File

@@ -0,0 +1,392 @@
import { JobSelector } from "@/components/ai";
import { fetchJobs } from "@/lib";
import { Job } from "@/types";
import {
Box,
useTheme,
Typography,
CircularProgress,
Alert,
Paper,
Tabs,
Tab,
} from "@mui/material";
import { useRouter, useSearchParams } from "next/navigation";
import { useState, useEffect } from "react";
import { TileGridView } from "@/components/common/media-viewer/tile-grid-view";
import { MediaViewer } from "@/components/common/media-viewer";
// Lists of strings grouped by media category; this page stores the `files`
// field of the get-media response in this shape.
// The index signature allows lookup by an arbitrary tab key
// (e.g. `mediaFiles[activeTab]`).
// NOTE(review): the strings are presumably file names — confirm against the
// backend response.
export interface MediaFiles {
audio: string[];
documents: string[];
images: string[];
pdfs: string[];
presentations: string[];
spreadsheets: string[];
videos: string[];
[key: string]: string[];
}
export const MediaId = () => {
const searchParams = useSearchParams();
const theme = useTheme();
const router = useRouter();
const [error, setError] = useState<string | null>(null);
const [loading, setLoading] = useState(true);
const [jobs, setJobs] = useState<Job[]>([]);
const [selectedJob, setSelectedJob] = useState<Job | null>(null);
const [mediaFiles, setMediaFiles] = useState<MediaFiles | null>(null);
const [activeTab, setActiveTab] = useState<string>("images");
const [selectedMedia, setSelectedMedia] = useState<string | null>(null);
const currentId = searchParams.get("id");
const mediaType = searchParams.get("type") || "images";
const mediaName = searchParams.get("file");
// Navigates to the media page of the chosen job; a null selection is ignored.
const handleSelectJob = (job: Job | null) => {
  if (!job) {
    return;
  }

  router.push(`/media?id=${job.id}`);
};
// Switches the active media-type tab and mirrors the choice in the URL.
const handleTabChange = (_event: React.SyntheticEvent, newValue: string) => {
  setActiveTab(newValue);

  const tabUrl = `/media?id=${currentId}&type=${newValue}`;
  router.push(tabUrl);
};
// Marks a file as selected and records it in the URL's `file` parameter.
const handleMediaSelect = (fileName: string) => {
  setSelectedMedia(fileName);

  // File names may contain `&`, `#`, `+` or spaces; without encoding they
  // would truncate or corrupt the `file` query parameter read back from the URL.
  const encodedName = encodeURIComponent(fileName);
  router.push(`/media?id=${currentId}&type=${activeTab}&file=${encodedName}`);
};
// Fetch jobs on mount
useEffect(() => {
fetchJobs(setJobs);
}, []);
// Set selected job when currentId changes
useEffect(() => {
if (!currentId) {
setSelectedJob(null);
return;
}
const job = jobs.find((j) => j.id === currentId);
setSelectedJob(job || null);
}, [currentId, jobs]);
// Fetch media files when selected job changes
useEffect(() => {
if (!selectedJob?.id) {
setError("No job ID provided");
setLoading(false);
return;
}
const fetchMediaFiles = async () => {
setLoading(true);
setError(null);
try {
const res = await fetch(`/api/media/get-media?id=${selectedJob.id}`);
if (!res.ok) {
throw new Error(`Media not found (status: ${res.status})`);
}
const data = await res.json();
setMediaFiles(data.files);
const hasMediaType = mediaType && data.files[mediaType]?.length > 0;
if (hasMediaType && activeTab !== mediaType) {
setActiveTab(mediaType);
} else if (!hasMediaType && !activeTab) {
// Only set a default tab if activeTab is not set
const firstNonEmpty = Object.entries(data.files).find(
([_, files]) => Array.isArray(files) && files.length > 0
);
if (firstNonEmpty) {
setActiveTab(firstNonEmpty[0]);
}
}
} catch (err) {
setError(
err instanceof Error ? err.message : "Failed to fetch media files"
);
} finally {
setLoading(false);
}
};
fetchMediaFiles();
}, [selectedJob?.id]);
// Set selected media when mediaName changes
useEffect(() => {
if (mediaName && mediaName !== selectedMedia) {
setSelectedMedia(mediaName);
}
}, [mediaName, selectedMedia]);
// Get media file URL
// Builds the `/api/media` URL for a file of the current job and active tab.
// Returns an empty string while no job id or tab is available.
const getMediaUrl = (fileName: string) => {
  if (!currentId || !activeTab) return "";

  // Encode the file name so characters such as `&`, `#` or spaces do not
  // break the query string.
  const encodedName = encodeURIComponent(fileName);
  return `/api/media?id=${currentId}&type=${activeTab}&file=${encodedName}`;
};
// Renders the thumbnail grid for the active tab, or a placeholder message
// when that tab has no files.
const renderMediaThumbnails = () => {
  if (
    mediaFiles &&
    mediaFiles[activeTab] &&
    mediaFiles[activeTab].length !== 0
  ) {
    return (
      <TileGridView
        mediaFiles={mediaFiles}
        activeTab={activeTab}
        selectedMedia={selectedMedia || ""}
        handleMediaSelect={handleMediaSelect}
        getMediaUrl={getMediaUrl}
      />
    );
  }

  const emptyStateSx = {
    display: "flex",
    justifyContent: "center",
    alignItems: "center",
    height: "100%",
    p: 3,
  };

  return (
    <Box sx={emptyStateSx}>
      <Typography variant="body2" color="textSecondary">
        No {activeTab} files available
      </Typography>
    </Box>
  );
};
return (
<Box
sx={{
height: "100%",
width: "100%",
display: "flex",
flexDirection: "column",
position: "relative",
borderRadius: 2,
overflow: "hidden",
border: `1px solid ${theme.palette.divider}`,
backgroundColor: theme.palette.background.paper,
}}
>
<Box
sx={{
display: "flex",
justifyContent: "flex-end",
p: 1,
borderBottom: `1px solid ${theme.palette.divider}`,
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[50]
: theme.palette.grey[900],
zIndex: 10,
}}
>
<Box sx={{ width: "300px" }}>
<JobSelector
setSelectedJob={handleSelectJob}
selectedJob={selectedJob}
setJobs={setJobs}
jobs={jobs}
/>
</Box>
</Box>
{loading ? (
<Box
display="flex"
flexDirection="column"
alignItems="center"
justifyContent="center"
sx={{ flex: 1 }}
gap={2}
>
<CircularProgress />
<Typography variant="body2" color="textSecondary">
Loading media...
</Typography>
</Box>
) : error ? (
<Box
sx={{
flex: 1,
display: "flex",
justifyContent: "center",
alignItems: "center",
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[100]
: theme.palette.grey[900],
p: 2,
}}
>
<Paper
elevation={3}
sx={{
p: 3,
maxWidth: "500px",
width: "100%",
backgroundColor: theme.palette.background.paper,
borderRadius: 2,
}}
>
<Alert
severity="error"
variant="filled"
sx={{
mb: 2,
backgroundColor: theme.palette.error.main,
}}
>
{error}
</Alert>
<Typography variant="body2" color="textSecondary" sx={{ mt: 2 }}>
Please select a different job from the dropdown menu above or
check if media browsing is enabled.
</Typography>
</Paper>
</Box>
) : (
<>
<Box
sx={{
borderBottom: 1,
borderColor: "divider",
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[50]
: theme.palette.grey[900],
}}
>
<Tabs
value={activeTab}
onChange={handleTabChange}
variant="scrollable"
scrollButtons="auto"
aria-label="media type tabs"
>
{mediaFiles &&
Object.entries(mediaFiles).map(([type, files]) => (
<Tab
key={type}
value={type}
label={`${type.charAt(0).toUpperCase() + type.slice(1)} (${
files.length
})`}
disabled={!files.length}
/>
))}
</Tabs>
</Box>
<Box
sx={{
display: "flex",
flexDirection: "column",
flex: 1,
height: "calc(100% - 48px)",
overflow: "hidden",
}}
>
{selectedMedia && mediaType && mediaName ? (
<Box
sx={{
display: "flex",
flexDirection: "column",
height: "100%",
}}
>
<Box
sx={{
display: "flex",
justifyContent: "space-between",
alignItems: "center",
p: 1,
borderBottom: `1px solid ${theme.palette.divider}`,
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[50]
: theme.palette.grey[900],
}}
>
<Typography variant="subtitle1" noWrap>
{selectedMedia}
</Typography>
<Box>
<Typography
variant="body2"
sx={{
cursor: "pointer",
color: theme.palette.primary.main,
"&:hover": {
textDecoration: "underline",
},
}}
onClick={async () => {
setSelectedMedia(null);
await router.push(
`/media?id=${currentId}&type=${mediaType}`
);
}}
>
Back to Gallery
</Typography>
</Box>
</Box>
<Box
sx={{
flex: 1,
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[100]
: theme.palette.grey[900],
overflow: "hidden",
display: "flex",
justifyContent: "center",
alignItems: "center",
p: 2,
}}
>
<MediaViewer
selectedMedia={selectedMedia}
activeTab={activeTab}
getMediaUrl={getMediaUrl}
onError={() => setError("Error loading media")}
/>
</Box>
</Box>
) : (
<Box
sx={{
flex: 1,
overflow: "auto",
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[100]
: theme.palette.grey[900],
}}
>
{renderMediaThumbnails()}
</Box>
)}
</Box>
</>
)}
</Box>
);
};

View File

@@ -0,0 +1 @@
export { MediaId } from "./id";

View File

@@ -0,0 +1,204 @@
import { JobSelector } from "@/components/ai";
import { fetchJobs } from "@/lib";
import { useUserSettings } from "@/store/hooks";
import { Job } from "@/types";
import {
Box,
useTheme,
Typography,
CircularProgress,
Alert,
Paper,
} from "@mui/material";
import { useRouter, useSearchParams } from "next/navigation";
import { useState, useEffect } from "react";
export const RecordingId = () => {
const searchParams = useSearchParams();
const theme = useTheme();
const { userSettings } = useUserSettings();
const router = useRouter();
const [error, setError] = useState<string | null>(null);
const [videoUrl, setVideoUrl] = useState<string | null>(null);
const [loading, setLoading] = useState(true);
const [jobs, setJobs] = useState<Job[]>([]);
const [selectedJob, setSelectedJob] = useState<Job | null>(null);
const currentId = searchParams.get("id");
// Navigates to the recording of the chosen job; a null selection is ignored.
const handleSelectJob = (job: Job | null) => {
  if (!job) {
    return;
  }

  router.push(`/recordings?id=${job.id}`);
};
useEffect(() => {
fetchJobs(setJobs);
}, []);
// Checks (via a HEAD request) whether a recording exists for the current id
// and exposes its URL, or an error message, through component state.
useEffect(() => {
  if (!userSettings.recordingsEnabled) {
    setError("Recordings are disabled");
    setLoading(false);
    return;
  }

  if (!currentId) {
    setError("No recording ID provided");
    setLoading(false);
    return;
  }

  // Set by the cleanup below so a response for an outdated id (or one that
  // arrives after unmount) cannot overwrite state for the current one.
  let cancelled = false;

  setLoading(true);
  setError(null);

  const url = `/api/recordings/${currentId}`;
  fetch(url, { method: "HEAD" })
    .then((res) => {
      if (!res.ok) {
        throw new Error(`Video not found (status: ${res.status})`);
      }
      if (!cancelled) {
        setVideoUrl(url);
      }
    })
    .catch(() => {
      if (!cancelled) {
        setError("404 recording not found");
      }
    })
    .finally(() => {
      if (!cancelled) {
        setLoading(false);
      }
    });

  return () => {
    cancelled = true;
  };
}, [currentId, userSettings.recordingsEnabled]);
useEffect(() => {
if (!currentId) {
setSelectedJob(null);
return;
}
const job = jobs.find((j) => j.id === currentId);
setSelectedJob(job || null);
}, [currentId, jobs]);
return (
<Box
sx={{
height: "100%",
width: "100%",
display: "flex",
flexDirection: "column",
position: "relative",
borderRadius: 2,
overflow: "hidden",
border: `1px solid ${theme.palette.divider}`,
backgroundColor: theme.palette.background.paper,
}}
>
<Box
sx={{
display: "flex",
justifyContent: "flex-end",
p: 1,
borderBottom: `1px solid ${theme.palette.divider}`,
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[50]
: theme.palette.grey[900],
zIndex: 10,
}}
>
<Box sx={{ width: "300px" }}>
<JobSelector
setSelectedJob={handleSelectJob}
selectedJob={selectedJob}
setJobs={setJobs}
jobs={jobs}
sxProps={{}}
/>
</Box>
</Box>
<Box
sx={{
flex: 1,
display: "flex",
justifyContent: "center",
alignItems: "center",
position: "relative",
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[100]
: theme.palette.grey[900],
p: 2,
overflow: "hidden",
}}
>
{loading ? (
<Box
display="flex"
flexDirection="column"
alignItems="center"
gap={2}
>
<CircularProgress />
<Typography variant="body2" color="textSecondary">
Loading recording...
</Typography>
</Box>
) : error ? (
<Paper
elevation={3}
sx={{
p: 3,
maxWidth: "500px",
width: "100%",
backgroundColor: theme.palette.background.paper,
borderRadius: 2,
}}
>
<Alert
severity="error"
variant="filled"
sx={{
mb: 2,
backgroundColor: theme.palette.error.main,
}}
>
{error}
</Alert>
<Typography variant="body2" color="textSecondary" sx={{ mt: 2 }}>
Please select a different recording from the dropdown menu above
or check if recordings are enabled.
</Typography>
</Paper>
) : (
<Box
sx={{
width: "100%",
height: "100%",
display: "flex",
justifyContent: "center",
alignItems: "center",
overflow: "hidden",
borderRadius: 1,
}}
>
<video
className="h-full w-full object-contain"
controls
onError={() => setError("Error loading video")}
style={{
maxHeight: "100%",
maxWidth: "100%",
borderRadius: "4px",
boxShadow: theme.shadows[4],
}}
>
<source src={videoUrl ?? undefined} type="video/mp4" />
Your browser does not support the video tag.
</video>
</Box>
)}
</Box>
</Box>
);
};

View File

@@ -0,0 +1 @@
export { RecordingId } from "./id";

View File

@@ -10,12 +10,14 @@ import { JobSubmitterInput } from "./job-submitter-input";
import { JobSubmitterOptions } from "./job-submitter-options";
import { ApiService } from "@/services";
import { useJobSubmitterProvider } from "./provider";
import { AdvancedJobOptions } from "@/components/common/advanced-job-options";
const initialJobOptions: RawJobOptions = {
multi_page_scrape: false,
custom_headers: null,
proxies: null,
collect_media: false,
custom_cookies: null,
};
export const JobSubmitter = () => {
@@ -38,12 +40,8 @@ export const JobSubmitter = () => {
const [loading, setLoading] = useState<boolean>(false);
const [jobOptions, setJobOptions] =
useState<RawJobOptions>(initialJobOptions);
const [customJSONSelected, setCustomJSONSelected] = useState<boolean>(false);
const [proxiesSelected, setProxiesSelected] = useState<boolean>(false);
const handleSelectProxies = () => {
setProxiesSelected(!proxiesSelected);
};
console.log(jobOptions);
const handleSubmit = async () => {
if (!validateURL(submittedURL)) {
@@ -57,12 +55,13 @@ export const JobSubmitter = () => {
setLoading(true);
let customHeaders;
let customCookies;
try {
customHeaders = jobOptions.custom_headers
? JSON.parse(jobOptions.custom_headers)
: null;
} catch (error) {
customHeaders = jobOptions.custom_headers || null;
customCookies = jobOptions.custom_cookies || null;
} catch (error: any) {
console.error(error);
setSnackbarMessage("Invalid JSON in custom headers.");
setSnackbarOpen(true);
setSnackbarSeverity("error");
@@ -76,6 +75,7 @@ export const JobSubmitter = () => {
user,
jobOptions,
customHeaders,
customCookies,
siteMap
)
.then(async (response) => {
@@ -102,16 +102,9 @@ export const JobSubmitter = () => {
.finally(() => setLoading(false));
};
// Parse the job options from the query string
useEffect(() => {
if (job_options) {
parseJobOptions(
job_options as string,
setCustomJSONSelected,
setProxiesSelected,
setJobOptions,
setSiteMap
);
parseJobOptions(job_options as string, setJobOptions, setSiteMap);
}
}, [job_options]);
@@ -123,13 +116,9 @@ export const JobSubmitter = () => {
handleSubmit={handleSubmit}
loading={loading}
/>
<JobSubmitterOptions
<AdvancedJobOptions
jobOptions={jobOptions}
setJobOptions={setJobOptions}
customJSONSelected={customJSONSelected}
setCustomJSONSelected={setCustomJSONSelected}
handleSelectProxies={handleSelectProxies}
proxiesSelected={proxiesSelected}
/>
</div>
);

View File

@@ -42,12 +42,12 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
params.append("username", email);
params.append("password", password);
const response = await axios.post(`/api/token`, params);
const isSecure = window.location.protocol === "https:";
Cookies.set("token", response.data.access_token, {
expires: 7,
path: "/",
domain: "localhost",
secure: false,
secure: isSecure,
sameSite: "Lax",
});

View File

@@ -4,10 +4,8 @@ import { RawJobOptions, SiteMap } from "@/types";
export const parseJobOptions = (
job_options: string,
setCustomJSONSelected: Dispatch<SetStateAction<boolean>>,
setProxiesSelected: Dispatch<SetStateAction<boolean>>,
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>,
setSiteMap: Dispatch<SetStateAction<any>>
setSiteMap: Dispatch<SetStateAction<SiteMap | null>>
) => {
if (job_options) {
const jsonOptions = JSON.parse(job_options as string);
@@ -16,20 +14,27 @@ export const parseJobOptions = (
custom_headers: null,
proxies: null,
collect_media: false,
custom_cookies: null,
};
if (jsonOptions.collect_media) {
newJobOptions.collect_media = true;
}
if (
jsonOptions.custom_headers &&
Object.keys(jsonOptions.custom_headers).length
) {
setCustomJSONSelected(true);
newJobOptions.custom_headers = JSON.stringify(jsonOptions.custom_headers);
newJobOptions.custom_headers = jsonOptions.custom_headers;
}
if (jsonOptions.custom_cookies && jsonOptions.custom_cookies.length > 0) {
newJobOptions.custom_cookies = jsonOptions.custom_cookies;
}
newJobOptions.multi_page_scrape = jsonOptions.multi_page_scrape;
if (jsonOptions.proxies.length > 0) {
setProxiesSelected(true);
newJobOptions.proxies = jsonOptions.proxies.join(",");
}

View File

@@ -0,0 +1,37 @@
/**
 * Parses a JSON string into a flat list of `[key, value]` string pairs.
 *
 * Accepted shapes:
 * - a plain object: `{"a": 1}` -> `[["a", "1"]]`
 * - an array of objects: the entries of every object, concatenated in order
 * - an array of `[key, value]` tuples (string key, any value)
 *
 * Values are converted with `String()`.
 *
 * @param json - Raw JSON text.
 * @returns The entries, or `null` when the text is not valid JSON, has an
 *          unsupported shape, or is an array that yields no entries.
 */
export const parseJsonToEntries = (json: string): [string, string][] | null => {
  const toEntry = (key: string, value: unknown): [string, string] => [
    key,
    String(value),
  ];

  const isTuple = (item: unknown): item is [string, unknown] =>
    Array.isArray(item) && item.length === 2 && typeof item[0] === "string";

  try {
    const parsed: unknown = JSON.parse(json);

    if (Array.isArray(parsed)) {
      if (parsed.length > 0 && isTuple(parsed[0])) {
        // Tuple form: validate every element, not just the first, so a
        // malformed later item is rejected instead of being destructured
        // blindly (e.g. the string "xy" turning into ["x", "y"]).
        const tuples = parsed.filter(isTuple);
        if (tuples.length !== parsed.length) {
          return null;
        }
        return tuples.map(([key, value]) => toEntry(key, value));
      }

      // Array of objects: flatten all of their entries in order.
      const allEntries: [string, string][] = [];
      for (const item of parsed) {
        if (typeof item !== "object" || item === null) {
          return null;
        }
        for (const [key, value] of Object.entries(item)) {
          allEntries.push(toEntry(key, value));
        }
      }

      return allEntries.length > 0 ? allEntries : null;
    }

    if (typeof parsed === "object" && parsed !== null) {
      return Object.entries(parsed).map(([key, value]) => toEntry(key, value));
    }

    return null;
  } catch {
    return null;
  }
};

View File

@@ -80,3 +80,22 @@ export const updateJob = async (ids: string[], field: string, value: any) => {
console.error("Error fetching jobs:", error);
});
};
/**
 * Fetches the settings payload from `/api/check`, sending the `token` cookie
 * as a bearer token.
 *
 * @returns The parsed JSON response body.
 * @throws Re-throws any error raised by the request or by JSON parsing.
 */
export const getUserSettings = async () => {
  const token = Cookies.get("token");

  try {
    const response = await fetch("/api/check", {
      headers: {
        "content-type": "application/json",
        Authorization: `Bearer ${token}`,
      },
    });

    const data = await response.json();
    return data;
  } catch (error) {
    // The previous message ("Error fetching jobs:") was copied from another
    // helper and pointed at the wrong request.
    console.error("Error fetching user settings:", error);
    throw error;
  }
};

View File

@@ -8,6 +8,9 @@ import { ThemeProvider, CssBaseline, Box } from "@mui/material";
import { NavDrawer } from "../components/common";
import { darkTheme, lightTheme } from "../styles/themes";
import { AuthProvider } from "../contexts/AuthContext";
import { Provider } from "react-redux";
import { PersistGate } from "redux-persist/integration/react";
import { store, persistor } from "@/store/store";
const App: React.FC<AppProps> = ({ Component, pageProps }) => {
const [isDarkMode, setIsDarkMode] = useState(false);
@@ -35,26 +38,30 @@ const App: React.FC<AppProps> = ({ Component, pageProps }) => {
<Head>
<title>Scraperr</title>
</Head>
<AuthProvider>
<ThemeProvider theme={isDarkMode ? darkTheme : lightTheme}>
<CssBaseline />
<Box sx={{ height: "100%", display: "flex" }}>
<NavDrawer isDarkMode={isDarkMode} toggleTheme={toggleTheme} />
<Box
component="main"
sx={{
p: 3,
bgcolor: "background.default",
overflow: "hidden",
height: "100%",
width: "100%",
}}
>
<Component {...pageProps} />
</Box>
</Box>
</ThemeProvider>
</AuthProvider>
<Provider store={store}>
<PersistGate loading={null} persistor={persistor}>
<AuthProvider>
<ThemeProvider theme={isDarkMode ? darkTheme : lightTheme}>
<CssBaseline />
<Box sx={{ height: "100%", display: "flex" }}>
<NavDrawer isDarkMode={isDarkMode} toggleTheme={toggleTheme} />
<Box
component="main"
sx={{
p: 3,
bgcolor: "background.default",
overflow: "hidden",
height: "100%",
width: "100%",
}}
>
<Component {...pageProps} />
</Box>
</Box>
</ThemeProvider>
</AuthProvider>
</PersistGate>
</Provider>
</>
);
};

View File

@@ -0,0 +1,24 @@
import { NextApiRequest, NextApiResponse } from "next";
/**
 * API route: returns the media file listing for a job.
 *
 * Proxies `GET {NEXT_PUBLIC_API_URL}/get-media?id=<id>` and relays its JSON.
 * Responds 400 when `id` is missing, and 404 when the upstream request fails.
 */
export default async function handler(
  req: NextApiRequest,
  res: NextApiResponse
) {
  const { id } = req.query;
  // A repeated query parameter arrives as an array; use its first value.
  const jobId = Array.isArray(id) ? id[0] : id;

  if (!jobId) {
    return res.status(400).json({ error: "Missing required parameters" });
  }

  try {
    const response = await fetch(
      `${process.env.NEXT_PUBLIC_API_URL}/get-media?id=${encodeURIComponent(
        jobId
      )}`
    );

    if (!response.ok) {
      throw new Error(`Error: ${response.statusText}`);
    }

    const data = await response.json();
    res.status(200).json(data);
  } catch (error) {
    // This route lists media files; it does not stream video.
    console.error("Error fetching media list:", error);
    res.status(404).json({ error: "Error retrieving media files" });
  }
}

View File

@@ -0,0 +1,33 @@
import { NextApiRequest, NextApiResponse } from "next";
/**
 * API route: returns one media file of a job.
 *
 * Proxies `GET {NEXT_PUBLIC_API_URL}/media?id=&type=&file=` and relays the
 * body with the upstream content type (or `application/octet-stream`).
 * Responds 400 when a parameter is missing, and 404 when the upstream
 * request fails.
 */
export default async function handler(
  req: NextApiRequest,
  res: NextApiResponse
) {
  // A repeated query parameter arrives as an array; use its first value.
  const first = (value: string | string[] | undefined) =>
    Array.isArray(value) ? value[0] : value;

  const id = first(req.query.id);
  const type = first(req.query.type);
  const file = first(req.query.file);

  if (!id || !type || !file) {
    return res.status(400).json({ error: "Missing required parameters" });
  }

  try {
    // `req.query` values are already decoded, so re-encode them before
    // forwarding; otherwise a file name containing `&`, `#` or spaces
    // produces a different upstream query.
    const query =
      `id=${encodeURIComponent(id)}` +
      `&type=${encodeURIComponent(type)}` +
      `&file=${encodeURIComponent(file)}`;

    const response = await fetch(
      `${process.env.NEXT_PUBLIC_API_URL}/media?${query}`
    );

    if (!response.ok) {
      throw new Error(`Error: ${response.statusText}`);
    }

    const contentType =
      response.headers.get("content-type") || "application/octet-stream";

    res.setHeader("Content-Type", contentType);

    const arrayBuffer = await response.arrayBuffer();
    res.status(200).send(Buffer.from(arrayBuffer));
  } catch (error) {
    console.error("Error streaming media:", error);
    res.status(404).json({ error: "Error retrieving media file" });
  }
}

View File

@@ -0,0 +1,39 @@
import { NextApiRequest, NextApiResponse } from "next";
/**
 * API route: streams a job recording as `video/mp4`.
 *
 * Proxies `GET {NEXT_PUBLIC_API_URL}/recordings/<id>` and writes the upstream
 * body to the response chunk by chunk. Responds 404 when the upstream request
 * fails or has no body.
 */
export default async function handler(
  req: NextApiRequest,
  res: NextApiResponse
) {
  const { id } = req.query;
  // A repeated query parameter arrives as an array; use its first value.
  const recordingId = Array.isArray(id) ? id[0] : id;

  try {
    const response = await fetch(
      `${process.env.NEXT_PUBLIC_API_URL}/recordings/${encodeURIComponent(
        String(recordingId)
      )}`
    );

    if (!response.ok) {
      throw new Error(`Error: ${response.statusText}`);
    }

    // Check for a body before committing to video headers, so the
    // "not found" JSON answer is not sent with streaming headers set.
    const reader = response.body?.getReader();

    if (!reader) {
      res.status(404).json({ error: "Recording not found" });
      return;
    }

    res.setHeader("Content-Type", "video/mp4");
    // NOTE(review): range requests are advertised here, but this handler
    // always sends the full body — confirm whether partial content is needed.
    res.setHeader("Accept-Ranges", "bytes");

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      res.write(value);
    }

    res.end();
  } catch (error) {
    console.error("Error streaming video:", error);

    if (res.headersSent) {
      // Part of the video was already written: status and headers can no
      // longer change, so abort the connection rather than append JSON.
      res.destroy();
      return;
    }

    res.status(404).json({ error: "Error streaming video" });
  }
}

View File

@@ -0,0 +1,2 @@
export { JobCsvId as default } from "@/components/pages/job/csv/id";
export { getServerSideProps } from "@/components/pages/job/csv/id/get-server-side-props";

View File

@@ -6,7 +6,8 @@ import { Button, TextField, Typography, Box } from "@mui/material";
import { useTheme } from "@mui/material/styles";
import { useRouter } from "next/router";
import { useAuth } from "../contexts/AuthContext";
import { Constants } from "../lib";
import { Constants, getUserSettings } from "../lib";
import { useUserSettings } from "@/store/hooks";
type Mode = "login" | "signup";
@@ -19,6 +20,7 @@ const AuthForm: React.FC = () => {
const router = useRouter();
const { login } = useAuth();
const [registrationEnabled, setRegistrationEnabled] = useState<boolean>(true);
const { setUserSettings } = useUserSettings();
const checkRegistrationEnabled = async () => {
const response = await axios.get(`/api/check`);
@@ -28,12 +30,17 @@ const AuthForm: React.FC = () => {
useEffect(() => {
checkRegistrationEnabled();
}, []);
const handleSubmit = async (event: React.FormEvent) => {
event.preventDefault();
try {
if (mode === "login") {
await login(email, password);
alert("Login successful");
const userSettings = await getUserSettings();
setUserSettings(userSettings);
router.push("/");
} else {
await axios.post(`/api/signup`, {

View File

@@ -0,0 +1 @@
export { MediaId as default } from "@/components/pages/media/id";

View File

@@ -0,0 +1 @@
export { RecordingId as default } from "@/components/pages/recordings/id";

View File

@@ -6,6 +6,7 @@ export const submitJob = async (
user: any,
jobOptions: any,
customHeaders: any,
customCookies: any,
siteMap: SiteMap | null
) => {
return await fetch(`/api/submit-scrape-job`, {
@@ -23,6 +24,7 @@ export const submitJob = async (
custom_headers: customHeaders || {},
proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
site_map: siteMap,
custom_cookies: customCookies || [],
},
},
}),

23
src/store/hooks.ts Normal file
View File

@@ -0,0 +1,23 @@
import { TypedUseSelectorHook, useDispatch, useSelector } from "react-redux";
import type { RootState, AppDispatch } from "./store";
import {
SettingsState,
setAiEnabled,
setRecordingsEnabled,
} from "./slices/settingsSlice";
// `useDispatch` typed with this store's `AppDispatch`.
export const useAppDispatch = () => useDispatch<AppDispatch>();
// `useSelector` typed with this store's `RootState`.
export const useAppSelector: TypedUseSelectorHook<RootState> = useSelector;
/**
 * Hook exposing the persisted settings slice together with a setter.
 *
 * @returns `userSettings` (the `settings` slice of the store) and
 *          `setUserSettings`, which copies `ai_enabled` and
 *          `recordings_enabled` from a settings payload into the store and
 *          returns that payload unchanged.
 */
export const useUserSettings = () => {
  const userSettings = useAppSelector((state) => state.settings);
  const dispatch = useAppDispatch();

  // The parameter is named `settings` so it does not shadow the slice value.
  const setUserSettings = (settings: any) => {
    // The payload comes from an API response and may be null or lack fields;
    // coerce so the boolean state never ends up `undefined`.
    dispatch(setAiEnabled(Boolean(settings?.ai_enabled)));
    dispatch(setRecordingsEnabled(Boolean(settings?.recordings_enabled)));
    return settings;
  };

  return { userSettings, setUserSettings };
};

View File

@@ -0,0 +1,28 @@
import { createSlice, PayloadAction } from "@reduxjs/toolkit";
/** Shape of the `settings` slice: one on/off flag per feature. */
export interface SettingsState {
  aiEnabled: boolean;
  recordingsEnabled: boolean;
}

/** Both flags start out disabled until a setter action is dispatched. */
const initialState: SettingsState = {
  aiEnabled: false,
  recordingsEnabled: false,
};

/** Slice named "settings" with one setter reducer per flag. */
const settingsSlice = createSlice({
  name: "settings",
  initialState,
  reducers: {
    /** Store the dispatched boolean as `aiEnabled`. */
    setAiEnabled(state, { payload }: PayloadAction<boolean>) {
      state.aiEnabled = payload;
    },
    /** Store the dispatched boolean as `recordingsEnabled`. */
    setRecordingsEnabled(state, { payload }: PayloadAction<boolean>) {
      state.recordingsEnabled = payload;
    },
  },
});

export const { setAiEnabled, setRecordingsEnabled } = settingsSlice.actions;
export default settingsSlice.reducer;

32
src/store/store.ts Normal file
View File

@@ -0,0 +1,32 @@
import { configureStore } from "@reduxjs/toolkit";
import { persistStore, persistReducer } from "redux-persist";
import storage from "redux-persist/lib/storage";
import { combineReducers } from "@reduxjs/toolkit";
import settingsReducer from "./slices/settingsSlice";
/**
 * redux-persist configuration for the root reducer: state is saved under the
 * "root" key in the imported `storage` engine, restricted to the slices named
 * in `whitelist`.
 */
const persistConfig = {
  key: "root",
  storage,
  whitelist: ["settings"], // only settings will be persisted
};

/** Root reducer; `settings` is the only slice registered here. */
const rootReducer = combineReducers({
  settings: settingsReducer,
});

const persistedReducer = persistReducer(persistConfig, rootReducer);

/**
 * Every action type redux-persist dispatches. These actions can carry
 * non-serializable values (for example callbacks), so all of them are
 * excluded from Redux Toolkit's serializability check, not just
 * PERSIST and REHYDRATE. The strings match the FLUSH, REHYDRATE, PAUSE,
 * PERSIST, PURGE and REGISTER constants exported by redux-persist.
 */
const PERSIST_ACTION_TYPES = [
  "persist/FLUSH",
  "persist/REHYDRATE",
  "persist/PAUSE",
  "persist/PERSIST",
  "persist/PURGE",
  "persist/REGISTER",
];

/** Application store built on the persisted root reducer. */
export const store = configureStore({
  reducer: persistedReducer,
  middleware: (getDefaultMiddleware) =>
    getDefaultMiddleware({
      serializableCheck: {
        ignoredActions: PERSIST_ACTION_TYPES,
      },
    }),
});

/** Persistor bound to `store`. */
export const persistor = persistStore(store);

/** State type inferred from the store. */
export type RootState = ReturnType<typeof store.getState>;
/** Dispatch type inferred from the store. */
export type AppDispatch = typeof store.dispatch;

View File

@@ -5,6 +5,8 @@
:root {
--delete-red: #ef4444;
--delete-red-hover: #ff6969;
--primary-blue: #007bff;
--primary-gray: #f8f9fa;
}
#__next {
@@ -20,3 +22,22 @@ body {
.MuiPopover-paper {
padding: 0 !important;
}
/* Custom scrollbar styling via the -webkit- prefixed scrollbar pseudo-elements. */

/* Scrollbar size: 8px wide for vertical bars, 8px tall for horizontal bars. */
::-webkit-scrollbar {
width: 8px;
height: 8px;
}
/* Track: faint translucent black with rounded ends. */
::-webkit-scrollbar-track {
background-color: rgba(0, 0, 0, 0.05);
border-radius: 8px;
}
/* Thumb: darker than the track, same corner radius. */
::-webkit-scrollbar-thumb {
background-color: rgba(0, 0, 0, 0.2);
border-radius: 8px;
}
/* Thumb darkens slightly while hovered. */
::-webkit-scrollbar-thumb:hover {
background-color: rgba(0, 0, 0, 0.3);
}

View File

@@ -70,6 +70,16 @@ const commonThemeOptions = {
},
},
},
MuiCheckbox: {
styleOverrides: {
colorPrimary: {
color: "#1976d2",
"&.Mui-checked": {
color: "#034efc",
},
},
},
},
MuiPaper: {
styleOverrides: {
root: {
@@ -85,6 +95,7 @@ const lightTheme = createTheme({
mode: "light",
primary: {
main: "#1976d2",
contrastText: "#000000",
},
secondary: {
main: "#dc004e",
@@ -139,6 +150,7 @@ const darkTheme = createTheme({
mode: "dark",
primary: {
main: "#90caf9",
contrastText: "#fff",
},
secondary: {
main: "#f48fb1",

View File

@@ -7,7 +7,7 @@ export interface Job {
result: Object;
time_created: Date;
status: string;
job_options: Object;
job_options: RawJobOptions;
favorite: boolean;
chat?: Message[];
}
@@ -24,6 +24,7 @@ export type RawJobOptions = {
custom_headers: string | null;
proxies: string | null;
collect_media: boolean;
custom_cookies: string | null;
};
export type ActionOption = "click" | "input";

14
start.sh Executable file
View File

@@ -0,0 +1,14 @@
#!/bin/bash
# Entrypoint for the job worker.
#
# RECORDINGS_ENABLED (default: true) selects the mode:
#   "false"        run the worker on its own, with no display.
#   anything else  start Xvfb on display :99 and x11vnc on port 5900,
#                  then run the worker with DISPLAY=:99.
#
# In both modes a termination signal sent to this script reaches the worker,
# and in recording mode Xvfb and x11vnc are stopped when the script exits.

WORKER_CMD=(pdm run python -m api.backend.worker.job_worker)

RECORDINGS_ENABLED=${RECORDINGS_ENABLED:-true}

if [ "$RECORDINGS_ENABLED" == "false" ]; then
    # exec replaces this shell with the worker, so signals aimed at the
    # script's PID are delivered to the worker directly.
    exec "${WORKER_CMD[@]}"
fi

Xvfb :99 -screen 0 1280x1024x24 &
XVFB_PID=$!

# NOTE(review): presumably gives Xvfb time to bring up :99 before x11vnc
# attaches to it - confirm that 2 seconds is enough on slow hosts.
sleep 2

# NOTE(review): -nopw serves the display without a password; make sure port
# 5900 is not reachable from untrusted networks.
x11vnc -display :99 -rfbport 5900 -forever -nopw &
VNC_PID=$!

WORKER_PID=""

# Stop the background helpers whenever this script exits.
cleanup() {
    kill "$VNC_PID" "$XVFB_PID" 2>/dev/null
}
trap cleanup EXIT

# Pass a termination request on to the worker.
stop_worker() {
    if [ -n "$WORKER_PID" ]; then
        kill -TERM "$WORKER_PID" 2>/dev/null
    fi
}
trap stop_worker TERM INT

# Run the worker in the background and `wait` on it: bash only runs traps
# promptly while blocked in `wait`, not while a foreground command runs.
DISPLAY=:99 "${WORKER_CMD[@]}" &
WORKER_PID=$!

wait "$WORKER_PID"
STATUS=$?

# A trapped signal makes `wait` return early; if the worker is still shutting
# down, wait again so its real exit status is reported.
if kill -0 "$WORKER_PID" 2>/dev/null; then
    wait "$WORKER_PID"
    STATUS=$?
fi

exit "$STATUS"

View File

@@ -12,7 +12,7 @@ stdout_logfile_maxbytes=0
stderr_logfile_maxbytes=0
[program:worker]
command=pdm run python -m api.backend.worker.job_worker
command=/project/app/start.sh
directory=/project/app
autostart=true
autorestart=true

5797
yarn.lock Normal file

File diff suppressed because it is too large Load Diff