8 Commits

Author SHA1 Message Date
Jayden Pyles
d4edb9d93e chore: update chart version [skip ci] 2025-05-19 20:46:19 -05:00
Jayden Pyles
5ebd96b62b feat: add agent mode (#81)
* chore: wip agent mode

* wip: add agent mode frontend

* wip: add agent mode frontend

* chore: cleanup code

* chore: cleanup code

* chore: cleanup code
2025-05-19 20:44:41 -05:00
Jayden Pyles
d602d3330a fix: site map
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
Unit Tests / cypress-tests (push) Has been cancelled
Unit Tests / success-message (push) Has been cancelled
2025-05-17 17:05:37 -05:00
Jayden Pyles
6639e8b48f chore: update chart version [skip ci] 2025-05-17 16:33:18 -05:00
Jayden Pyles
263e46ba4d feat: add media viewer + other fixes (#79)
* feat: add media viewer + other fixes

* chore: remove logging [skip ci]

* chore: remove logging [skip ci]

* feat: add unit test for media

* feat: add unit test for media

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* chore: update docs [skip ci]
2025-05-17 16:31:34 -05:00
Jayden Pyles
f815a58efc chore: update docker version [skip ci] 2025-05-16 22:04:46 -05:00
Jayden Pyles
50ec5df657 chore: update chart version [skip ci] 2025-05-16 21:39:04 -05:00
Jayden Pyles
28de0f362c feat: add recording viewer and vnc (#78)
* feat: add recording viewer and vnc

* feat: add recording viewer and vnc

* feat: add recording viewer and vnc

* feat: add recording viewer and vnc

* chore: update gitignore [skip ci]

* chore: update dev compose [skip ci]

* fix: only run manually
2025-05-16 21:37:09 -05:00
86 changed files with 8260 additions and 11566 deletions

.dockerignore (new file, +4)
View File

@@ -0,0 +1,4 @@
node_modules
npm-debug.log
Dockerfile
.dockerignore

View File

@@ -15,11 +15,11 @@ runs:
     - name: Setup Docker project
       shell: bash
-      run: make build up-dev
+      run: make build-ci up-ci
     - name: Install dependencies
      shell: bash
-      run: npm install
+      run: yarn install
    - name: Wait for frontend to be ready
      shell: bash

View File

@@ -1,14 +1,9 @@
 name: Docker Image
 on:
-  workflow_run:
-    workflows: ["Unit Tests"]
-    types:
-      - completed
   workflow_dispatch:
 jobs:
   build:
-    if: ${{ github.event.workflow_run.conclusion == 'success' && github.ref == 'refs/heads/master' }}
     runs-on: ubuntu-latest
     steps:
       - name: Checkout

View File

@@ -30,7 +30,7 @@ jobs:
       run: pdm run playwright install
     - name: Run tests
-      run: PYTHONPATH=. pdm run pytest api/backend/tests
+      run: PYTHONPATH=. pdm run pytest -v -ra api/backend/tests
   cypress-tests:
     runs-on: ubuntu-latest

.gitignore (vendored, +16)
View File

@@ -188,4 +188,18 @@ postgres_data
 .vscode
 ollama
 data
+media
+media/images
+media/videos
+media/audio
+media/pdfs
+media/spreadsheets
+media/presentations
+media/documents
+media/recordings
+media/download_summary.txt
+cypress/screenshots
+cypress/videos
+docker-compose.dev.local.yml

View File

@@ -1,6 +1,6 @@
.DEFAULT_GOAL := help .DEFAULT_GOAL := help
COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.yml COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.local.yml
COMPOSE_PROD = docker compose -f docker-compose.yml COMPOSE_PROD = docker compose -f docker-compose.yml
.PHONY: help deps build pull up up-dev down setup deploy .PHONY: help deps build pull up up-dev down setup deploy
@@ -17,6 +17,7 @@ help:
@echo " make down - Stop and remove containers, networks, images, and volumes" @echo " make down - Stop and remove containers, networks, images, and volumes"
@echo " make setup - Setup server with dependencies and clone repo" @echo " make setup - Setup server with dependencies and clone repo"
@echo " make deploy - Deploy site onto server" @echo " make deploy - Deploy site onto server"
@echo " make cypress-start - Start Cypress"
@echo "" @echo ""
logs: logs:
@@ -51,3 +52,12 @@ setup:
deploy: deploy:
ansible-playbook -i ./ansible/inventory.yaml ./ansible/deploy_site.yaml -v ansible-playbook -i ./ansible/inventory.yaml ./ansible/deploy_site.yaml -v
build-ci:
docker compose -f docker-compose.yml -f docker-compose.dev.yml build
up-ci:
docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d --force-recreate
cypress-start:
DISPLAY=:0 npx cypress open

View File

@@ -13,7 +13,7 @@
## 📋 Overview ## 📋 Overview
Scraperr enables you to extract data from websites with precision using XPath selectors. This self-hosted application provides a clean interface to manage scraping jobs, view results, and export data. Scrape websites without writing a single line of code.
> 📚 **[Check out the docs](https://scraperr-docs.pages.dev)** for a comprehensive quickstart guide and detailed information. > 📚 **[Check out the docs](https://scraperr-docs.pages.dev)** for a comprehensive quickstart guide and detailed information.
@@ -29,7 +29,7 @@ Scraperr enables you to extract data from websites with precision using XPath se
- **Custom Headers**: Add JSON headers to your scraping requests - **Custom Headers**: Add JSON headers to your scraping requests
- **Media Downloads**: Automatically download images, videos, and other media - **Media Downloads**: Automatically download images, videos, and other media
- **Results Visualization**: View scraped data in a structured table format - **Results Visualization**: View scraped data in a structured table format
- **Data Export**: Export your results in various formats - **Data Export**: Export your results in markdown and csv formats
- **Notifcation Channels**: Send completion notifcations, through various channels - **Notifcation Channels**: Send completion notifcations, through various channels
## 🚀 Getting Started ## 🚀 Getting Started

View File

@@ -0,0 +1,6 @@
from typing_extensions import TypedDict
class Action(TypedDict):
type: str
url: str

View File

@@ -0,0 +1,94 @@
import random
from typing import Any
from camoufox import AsyncCamoufox
from playwright.async_api import Page
from api.backend.ai.agent.utils import (
capture_elements,
convert_to_markdown,
parse_response,
)
from api.backend.ai.clients import ask_open_ai, ask_ollama, open_ai_key
from api.backend.ai.agent.prompts import (
ELEMENT_EXTRACTION_PROMPT,
EXTRACT_ELEMENTS_PROMPT,
)
from api.backend.job.scraping.collect_media import collect_media
from api.backend.worker.logger import LOG
from api.backend.job.scraping.add_custom import add_custom_items
from api.backend.models import CapturedElement
ask_ai = ask_open_ai if open_ai_key else ask_ollama
async def scrape_with_agent(agent_job: dict[str, Any]):
LOG.info(f"Starting work for agent job: {agent_job}")
pages = set()
if agent_job["job_options"]["proxies"]:
proxy = random.choice(agent_job["job_options"]["proxies"])
LOG.info(f"Using proxy: {proxy}")
async with AsyncCamoufox(headless=True) as browser:
page: Page = await browser.new_page()
await add_custom_items(
agent_job["url"],
page,
agent_job["job_options"]["custom_cookies"],
agent_job["job_options"]["custom_headers"],
)
try:
await page.set_viewport_size({"width": 1920, "height": 1080})
await page.goto(agent_job["url"], timeout=60000)
if agent_job["job_options"]["collect_media"]:
await collect_media(agent_job["id"], page)
html_content = await page.content()
markdown_content = convert_to_markdown(html_content)
response = await ask_ai(
ELEMENT_EXTRACTION_PROMPT.format(
extraction_prompt=EXTRACT_ELEMENTS_PROMPT,
webpage=markdown_content,
prompt=agent_job["prompt"],
)
)
xpaths = parse_response(response)
captured_elements = await capture_elements(page, xpaths)
final_url = page.url
pages.add((html_content, final_url))
finally:
await page.close()
await browser.close()
name_to_elements = {}
for page in pages:
for element in captured_elements:
if element.name not in name_to_elements:
name_to_elements[element.name] = []
name_to_elements[element.name].append(element)
scraped_elements: list[dict[str, dict[str, list[CapturedElement]]]] = [
{
page[1]: name_to_elements,
}
for page in pages
]
return scraped_elements
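For orientation, a minimal sketch of driving this entry point directly; the job dict shape mirrors the fields scrape_with_agent reads above, and all concrete values are illustrative assumptions:

import asyncio

from api.backend.ai.agent.agent import scrape_with_agent

# Hypothetical job payload; only the keys the function above actually reads.
agent_job = {
    "id": "job-123",
    "url": "https://books.toscrape.com",
    "prompt": "Collect every book title on the page",
    "job_options": {
        "proxies": [],
        "custom_cookies": [],
        "custom_headers": {},
        "collect_media": False,
    },
}

# Per the code above, returns [{final_url: {name: [CapturedElement, ...]}}].
results = asyncio.run(scrape_with_agent(agent_job))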

View File

@@ -0,0 +1,58 @@
EXTRACT_ELEMENTS_PROMPT = """
You are an assistant that extracts XPath expressions from webpages.
You will receive HTML content in markdown format.
Each element in the markdown has their xpath shown above them in a path like:
<!-- //div -->
Respond only with a list of general XPath expressions inside `<xpaths>...</xpaths>` tags.
You will also decide the decision of what to do next. If there is no decision available, return nothing for that section.
"""
ELEMENT_EXTRACTION_PROMPT = """
{extraction_prompt}
**Guidelines:**
- Prefer shorter, more general XPaths like `//div[...]` or `//span[...]`.
- Avoid overly specific or deep paths like `//div[3]/ul/li[2]/a`.
- Do **not** chain multiple elements deeply (e.g., `//div/span/a`).
- Use XPaths further down the tree when possible.
- Do not include any extra explanation or text.
- One XPath is acceptable if that's all that's needed.
- Try and limit it down to 1 - 3 xpaths.
- Include a name for each xpath.
<important>
- USE THE MOST SIMPLE XPATHS POSSIBLE.
- USE THE MOST GENERAL XPATHS POSSIBLE.
- USE THE MOST SPECIFIC XPATHS POSSIBLE.
- USE THE MOST GENERAL XPATHS POSSIBLE.
</important>
**Example Format:**
```xml
<xpaths>
- <name: insert_name_here>: <xpath: //div>
- <name: insert_name_here>: <xpath: //span>
- <name: insert_name_here>: <xpath: //span[contains(@text, 'example')]>
- <name: insert_name_here>: <xpath: //div[contains(@text, 'example')]>
- <name: insert_name_here>: <xpath: //a[@href]>
- etc
</xpaths>
<decision>
<next_page>
- //a[@href='next_page_url']
</next_page>
</decision>
```
**Input webpage:**
{webpage}
**Target content:**
{prompt}
"""

View File

@@ -0,0 +1,252 @@
from lxml import html, etree
import re
from playwright.async_api import Page
from api.backend.models import CapturedElement
from api.backend.job.scraping.scraping_utils import clean_format_characters
def convert_to_markdown(html_str: str):
parser = html.HTMLParser()
tree = html.fromstring(html_str, parser=parser)
root = tree.getroottree()
def format_attributes(el: etree._Element) -> str:
"""Convert element attributes into a string."""
return " ".join(f'{k}="{v}"' for k, v in el.attrib.items())
def is_visible(el: etree._Element) -> bool:
style = el.attrib.get("style", "").lower()
class_ = el.attrib.get("class", "").lower()
# Check for visibility styles
if "display: none" in style or "visibility: hidden" in style:
return False
if "opacity: 0" in style or "opacity:0" in style:
return False
if "height: 0" in style or "width: 0" in style:
return False
# Check for common hidden classes
if any(
hidden in class_
for hidden in ["hidden", "invisible", "truncate", "collapse"]
):
return False
# Check for hidden attributes
if el.attrib.get("hidden") is not None:
return False
if el.attrib.get("aria-hidden") == "true":
return False
# Check for empty or whitespace-only content
if not el.text and len(el) == 0:
return False
return True
def is_layout_or_decorative(el: etree._Element) -> bool:
tag = el.tag.lower()
# Layout elements
if tag in {"nav", "footer", "header", "aside", "main", "section"}:
return True
# Decorative elements
if tag in {"svg", "path", "circle", "rect", "line", "polygon", "polyline"}:
return True
# Check id and class for layout/decorative keywords
id_class = " ".join(
[el.attrib.get("id", ""), el.attrib.get("class", "")]
).lower()
layout_keywords = {
"sidebar",
"nav",
"header",
"footer",
"menu",
"advert",
"ads",
"breadcrumb",
"container",
"wrapper",
"layout",
"grid",
"flex",
"row",
"column",
"section",
"banner",
"hero",
"card",
"modal",
"popup",
"tooltip",
"dropdown",
"overlay",
}
return any(keyword in id_class for keyword in layout_keywords)
# Tags to include in the final markdown output (all other tags are skipped)
included_tags = {
"div",
"span",
"a",
"p",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"img",
"button",
"input",
"textarea",
"ul",
"ol",
"li",
"table",
"tr",
"td",
"th",
"input",
"textarea",
"select",
"option",
"optgroup",
"fieldset",
"legend",
}
special_elements = []
normal_elements = []
for el in tree.iter():
if el.tag is etree.Comment:
continue
tag = el.tag.lower()
if tag not in included_tags:
continue
if not is_visible(el):
continue
if is_layout_or_decorative(el):
continue
path = root.getpath(el)
attrs = format_attributes(el)
attrs_str = f" {attrs}" if attrs else ""
text = el.text.strip() if el.text else ""
if not text and not attrs:
continue
# input elements
if tag == "button":
prefix = "🔘 **<button>**"
special_elements.append(f"<!-- {path} -->\n{prefix} {text}")
elif tag == "a":
href = el.attrib.get("href", "")
prefix = f"🔗 **<a href='{href}'>**"
special_elements.append(f"<!-- {path} -->\n{prefix} {text}")
elif tag == "input":
input_type = el.attrib.get("type", "text")
prefix = f"📝 **<input type='{input_type}'>**"
special_elements.append(f"<!-- {path} -->\n{prefix}")
else:
prefix = f"**<{tag}{attrs_str}>**"
if text:
normal_elements.append(f"<!-- {path} -->\n{prefix} {text}")
return "\n\n".join(normal_elements + special_elements) # type: ignore
def parse_response(text: str) -> list[dict[str, str]]:
xpaths = re.findall(r"<xpaths>(.*?)</xpaths>", text, re.DOTALL)
results = []
if xpaths:
lines = xpaths[0].strip().splitlines()
for line in lines:
if line.strip().startswith("-"):
name = re.findall(r"<name: (.*?)>", line)[0]
xpath = re.findall(r"<xpath: (.*?)>", line)[0]
results.append({"name": name, "xpath": xpath})
else:
results.append({"name": "", "xpath": line.strip()})
return results
def parse_next_page(text: str) -> str | None:
next_page = re.findall(r"<next_page>(.*?)</next_page>", text, re.DOTALL)
if next_page:
lines = next_page[0].strip().splitlines()
next_page = [
line.strip().lstrip("-").strip()
for line in lines
if line.strip().startswith("-")
]
return next_page[0] if next_page else None
async def capture_elements(
page: Page, xpaths: list[dict[str, str]]
) -> list[CapturedElement]:
captured_elements = []
seen_texts = set()
for xpath in xpaths:
try:
locator = page.locator(f"xpath={xpath['xpath']}")
count = await locator.count()
for i in range(count):
element_text = ""
element_handle = await locator.nth(i).element_handle()
if not element_handle:
continue
link = await element_handle.get_attribute("href") or ""
text = await element_handle.text_content()
if text:
element_text += text
if link:
element_text += f" ({link})"
cleaned = clean_format_characters(element_text)
if cleaned in seen_texts:
continue
seen_texts.add(cleaned)
captured_elements.append(
CapturedElement(
name=xpath["name"],
text=cleaned,
xpath=xpath["xpath"],
)
)
except Exception as e:
print(f"Error processing xpath {xpath}: {e}")
return captured_elements
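As a quick check of the parsers above, here is the reply shape they expect (following the prompt's example format) and what they return; the XPaths themselves are illustrative:

from api.backend.ai.agent.utils import parse_response, parse_next_page

reply = """
<xpaths>
- <name: title>: <xpath: //h3/a>
- <name: price>: <xpath: //p[contains(@class, 'price_color')]>
</xpaths>
<decision>
<next_page>
- //a[@href='catalogue/page-2.html']
</next_page>
</decision>
"""

print(parse_response(reply))
# [{'name': 'title', 'xpath': '//h3/a'},
#  {'name': 'price', 'xpath': "//p[contains(@class, 'price_color')]"}]
print(parse_next_page(reply))
# //a[@href='catalogue/page-2.html']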

View File

@@ -1,32 +1,29 @@
 # STL
-import os
 import logging
 from collections.abc import Iterable, AsyncGenerator

 # PDM
-from openai import OpenAI
 from fastapi import APIRouter
 from fastapi.responses import JSONResponse, StreamingResponse
 from openai.types.chat import ChatCompletionMessageParam

 # LOCAL
-from ollama import Message, AsyncClient
+from ollama import Message
 from api.backend.models import AI
+from api.backend.ai.clients import (
+    llama_client,
+    llama_model,
+    openai_client,
+    open_ai_model,
+    open_ai_key,
+)

 LOG = logging.getLogger(__name__)
 ai_router = APIRouter()

-# Load environment variables
-open_ai_key = os.getenv("OPENAI_KEY")
-open_ai_model = os.getenv("OPENAI_MODEL")
-llama_url = os.getenv("OLLAMA_URL")
-llama_model = os.getenv("OLLAMA_MODEL")
-
-# Initialize clients
-openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None
-llama_client = AsyncClient(host=llama_url) if llama_url else None

 async def llama_chat(chat_messages: list[Message]) -> AsyncGenerator[str, None]:
     if llama_client and llama_model:

38
api/backend/ai/clients.py Normal file
View File

@@ -0,0 +1,38 @@
import os
from openai import OpenAI
from ollama import AsyncClient
# Load environment variables
open_ai_key = os.getenv("OPENAI_KEY")
open_ai_model = os.getenv("OPENAI_MODEL")
llama_url = os.getenv("OLLAMA_URL")
llama_model = os.getenv("OLLAMA_MODEL")
# Initialize clients
openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None
llama_client = AsyncClient(host=llama_url) if llama_url else None
async def ask_open_ai(prompt: str) -> str:
if not openai_client:
raise ValueError("OpenAI client not initialized")
response = openai_client.chat.completions.create(
model=open_ai_model or "gpt-4.1-mini",
messages=[{"role": "user", "content": prompt}],
)
return response.choices[0].message.content or ""
async def ask_ollama(prompt: str) -> str:
if not llama_client:
raise ValueError("Ollama client not initialized")
response = await llama_client.chat(
model=llama_model or "", messages=[{"role": "user", "content": prompt}]
)
return response.message.content or ""
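A minimal sketch of exercising these helpers; it assumes OPENAI_KEY (or OLLAMA_URL and OLLAMA_MODEL) is exported before import, since the clients are constructed at import time:

import asyncio

from api.backend.ai.clients import ask_open_ai, ask_ollama, open_ai_key

# Mirror the selection the agent module makes: OpenAI when a key is
# configured, Ollama otherwise.
ask_ai = ask_open_ai if open_ai_key else ask_ollama

answer = asyncio.run(ask_ai("Reply with the single word: ready"))
print(answer)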

View File

@@ -2,6 +2,7 @@
import os import os
import logging import logging
import apscheduler # type: ignore import apscheduler # type: ignore
from contextlib import asynccontextmanager
# PDM # PDM
import apscheduler.schedulers import apscheduler.schedulers
@@ -33,7 +34,30 @@ logging.basicConfig(
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
app = FastAPI(title="api", root_path="/api")
@asynccontextmanager
async def lifespan(app: FastAPI):
# Startup
LOG.info("Starting application...")
init_database()
LOG.info("Starting cron scheduler...")
start_cron_scheduler(scheduler)
scheduler.start()
LOG.info("Cron scheduler started successfully")
yield
# Shutdown
LOG.info("Shutting down application...")
LOG.info("Stopping cron scheduler...")
scheduler.shutdown(wait=False) # Set wait=False to not block shutdown
LOG.info("Cron scheduler stopped")
LOG.info("Application shutdown complete")
app = FastAPI(title="api", root_path="/api", lifespan=lifespan)
app.add_middleware( app.add_middleware(
CORSMiddleware, CORSMiddleware,
@@ -43,28 +67,12 @@ app.add_middleware(
allow_headers=["*"], allow_headers=["*"],
) )
app.include_router(auth_router) app.include_router(auth_router)
app.include_router(ai_router) app.include_router(ai_router)
app.include_router(job_router) app.include_router(job_router)
app.include_router(stats_router) app.include_router(stats_router)
@app.on_event("startup")
async def startup_event():
start_cron_scheduler(scheduler)
scheduler.start()
if os.getenv("ENV") != "test":
init_database()
LOG.info("Starting up...")
@app.on_event("shutdown")
def shutdown_scheduler():
scheduler.shutdown(wait=False) # Set wait=False to not block shutdown
@app.exception_handler(RequestValidationError) @app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError): async def validation_exception_handler(request: Request, exc: RequestValidationError):
exc_str = f"{exc}".replace("\n", " ").replace(" ", " ") exc_str = f"{exc}".replace("\n", " ").replace(" ", " ")

View File

@@ -66,4 +66,8 @@ async def read_users_me(current_user: User = Depends(get_current_user)):
 @auth_router.get("/auth/check")
 async def check_auth():
-    return {"registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True"}
+    return {
+        "registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True",
+        "recordings_enabled": os.environ.get("RECORDINGS_ENABLED", "true").lower()
+        == "true",
+    }
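The frontend can now read the new flag from this endpoint; a sketch with requests, assuming a local deployment on port 8000 (the /api prefix applies only when routed through the frontend proxy):

import requests

# Assumed local address; adjust for your deployment.
resp = requests.get("http://localhost:8000/auth/check")
print(resp.json())  # e.g. {"registration": True, "recordings_enabled": True}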

View File

@@ -1 +1,16 @@
+from pathlib import Path
+import os
+
 DATABASE_PATH = "data/database.db"
+
+RECORDINGS_DIR = Path("media/recordings")
+RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
+
+MEDIA_DIR = Path("media")
+MEDIA_TYPES = [
+    "audio",
+    "documents",
+    "images",
+    "pdfs",
+    "presentations",
+    "spreadsheets",
+    "videos",
+]

View File

@@ -1,7 +1,7 @@
 JOB_INSERT_QUERY = """
 INSERT INTO jobs
-(id, url, elements, user, time_created, result, status, chat, job_options)
-VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+(id, url, elements, user, time_created, result, status, chat, job_options, agent_mode, prompt)
+VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
 """
 DELETE_JOB_QUERY = """

View File

@@ -27,4 +27,7 @@ CREATE TABLE IF NOT EXISTS cron_jobs (
     time_updated DATETIME NOT NULL,
     FOREIGN KEY (job_id) REFERENCES jobs(id)
 );
+
+ALTER TABLE jobs ADD COLUMN agent_mode BOOLEAN NOT NULL DEFAULT FALSE;
+ALTER TABLE jobs ADD COLUMN prompt STRING;
 """

View File

@@ -1,6 +1,7 @@
import os import os
from api.backend.database.common import connect, QUERIES, insert from api.backend.database.common import connect, QUERIES, insert
import logging import logging
import sqlite3
from api.backend.auth.auth_utils import get_password_hash from api.backend.auth.auth_utils import get_password_hash
@@ -11,11 +12,22 @@ def init_database():
cursor = connect() cursor = connect()
for query in QUERIES["init"].strip().split(";"): for query in QUERIES["init"].strip().split(";"):
if query.strip(): query = query.strip()
if not query:
continue
try:
LOG.info(f"Executing query: {query}") LOG.info(f"Executing query: {query}")
_ = cursor.execute(query) _ = cursor.execute(query)
except sqlite3.OperationalError as e:
if "duplicate column name" in str(e).lower():
LOG.warning(f"Skipping duplicate column error: {e}")
continue
else:
LOG.error(f"Error executing query: {query}")
raise
if os.environ.get("REGISTRATION_ENABLED", "True") == "False": if os.environ.get("REGISTRATION_ENABLED", "true").lower() == "false":
default_user_email = os.environ.get("DEFAULT_USER_EMAIL") default_user_email = os.environ.get("DEFAULT_USER_EMAIL")
default_user_password = os.environ.get("DEFAULT_USER_PASSWORD") default_user_password = os.environ.get("DEFAULT_USER_PASSWORD")
default_user_full_name = os.environ.get("DEFAULT_USER_FULL_NAME") default_user_full_name = os.environ.get("DEFAULT_USER_FULL_NAME")
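The duplicate-column handling exists because the ALTER TABLE statements in the init SQL now run on every startup; a standalone sketch of the same idempotent-migration pattern:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE jobs (id TEXT)")

for _ in range(2):  # the second pass simulates an app restart
    try:
        conn.execute(
            "ALTER TABLE jobs ADD COLUMN agent_mode BOOLEAN NOT NULL DEFAULT FALSE"
        )
    except sqlite3.OperationalError as e:
        if "duplicate column name" in str(e).lower():
            pass  # column already exists; already migrated
        else:
            raise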

View File

@@ -27,6 +27,8 @@ def insert(item: dict[str, Any]) -> None:
             item["status"],
             item["chat"],
             item["job_options"],
+            item["agent_mode"],
+            item["prompt"],
         ),
     )
     LOG.info(f"Inserted item: {item}")

View File

@@ -1,6 +1,7 @@
import os import os
from pathlib import Path from pathlib import Path
from urllib.parse import urlparse import re
from urllib.parse import urljoin, urlparse
from typing import Dict, List from typing import Dict, List
import aiohttp import aiohttp
@@ -9,12 +10,12 @@ from playwright.async_api import Page
from api.backend.utils import LOG from api.backend.utils import LOG
async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]: async def collect_media(id: str, page: Page) -> dict[str, list[dict[str, str]]]:
media_types = { media_types = {
"images": "img", "images": "img",
"videos": "video", "videos": "video",
"audio": "audio", "audio": "audio",
"pdfs": 'a[href$=".pdf"]', "pdfs": 'a[href$=".pdf"], a[href*=".pdf#page="]',
"documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]', "documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]',
"presentations": 'a[href$=".ppt"], a[href$=".pptx"]', "presentations": 'a[href$=".ppt"], a[href$=".pptx"]',
"spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]', "spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]',
@@ -48,6 +49,11 @@ async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
root_domain = f"{root_url.scheme}://{root_url.netloc}" root_domain = f"{root_url.scheme}://{root_url.netloc}"
url = f"{root_domain}{url}" url = f"{root_domain}{url}"
if url and re.match(r"^[\w\-]+/", url):
root_url = urlparse(page.url)
root_domain = f"{root_url.scheme}://{root_url.netloc}"
url = urljoin(root_domain + "/", url)
if url and url.startswith(("http://", "https://")): if url and url.startswith(("http://", "https://")):
try: try:
parsed = urlparse(url) parsed = urlparse(url)
@@ -67,15 +73,20 @@ async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
}.get(media_type, "") }.get(media_type, "")
filename += ext filename += ext
file_path = media_dir / filename if not os.path.exists(media_dir / id):
os.makedirs(media_dir / id, exist_ok=True)
file_path = media_dir / id / f"{filename}"
async with session.get(url) as response: async with session.get(url) as response:
response.raise_for_status() response.raise_for_status()
with open(file_path, "wb") as f: with open(file_path, "wb") as f:
while True: while True:
chunk = await response.content.read(8192) chunk = await response.content.read(8192)
if not chunk: if not chunk:
break break
f.write(chunk) f.write(chunk)
urls.append({"url": url, "local_path": str(file_path)}) urls.append({"url": url, "local_path": str(file_path)})
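The new branch covers bare relative paths such as images/logo.png that the existing root-relative handling missed; a standalone sketch of the same resolution (URLs are illustrative):

import re
from urllib.parse import urljoin, urlparse

page_url = "https://example.com/catalogue/index.html"  # illustrative
url = "images/logo.png"

# Same check as the new branch above: a bare relative path.
if re.match(r"^[\w\-]+/", url):
    root = urlparse(page_url)
    url = urljoin(f"{root.scheme}://{root.netloc}" + "/", url)

print(url)  # https://example.com/images/logo.png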

View File

@@ -8,7 +8,7 @@ from api.backend.job.scraping.collect_media import collect_media as collect_medi

 async def scrape_content(
-    page: Page, pages: Set[Tuple[str, str]], collect_media: bool
+    id: str, page: Page, pages: Set[Tuple[str, str]], collect_media: bool
 ) -> str:
     last_height = await page.evaluate("document.body.scrollHeight")
@@ -27,6 +27,19 @@ async def scrape_content(
     if collect_media:
         LOG.info("Collecting media")
-        await collect_media_utils(page)
+        await collect_media_utils(id, page)

     return html
+
+
+def clean_format_characters(text: str) -> str:
+    text = text.strip()
+    text = text.replace("\n", " ")
+    text = text.replace("\t", " ")
+    text = text.replace("\r", " ")
+    text = text.replace("\f", " ")
+    text = text.replace("\v", " ")
+    text = text.replace("\b", " ")
+    text = text.replace("\a", " ")
+
+    return text
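A quick illustration of the new helper; note it maps each control character to a single space and does not collapse the resulting runs:

from api.backend.job.scraping.scraping_utils import clean_format_characters

raw = "  Price:\n\t£51.77\r\n"
print(repr(clean_format_characters(raw)))  # 'Price:  £51.77'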

View File

@@ -24,7 +24,6 @@ def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]:
 async def handle_input(action: Action, page: Page) -> bool:
     try:
         element = page.locator(f"xpath={action.xpath}")
-        await element.wait_for(state="visible", timeout=10000)
         LOG.info(f"Sending keys: {action.input} to element: {action.xpath}")
         await element.fill(action.input)
         return True
@@ -36,7 +35,6 @@ async def handle_click(action: Action, page: Page) -> bool:
     try:
         element = page.locator(f"xpath={action.xpath}")
-        await element.wait_for(state="visible", timeout=10000)
         LOG.info(f"Clicking element: {action.xpath}")
         await element.click()
         return True
@@ -52,6 +50,7 @@ ACTION_MAP = {
 async def handle_site_mapping(
+    id: str,
     site_map_dict: dict[str, Any],
     page: Page,
     pages: set[tuple[str, str]],
@@ -68,11 +67,11 @@ async def handle_site_mapping(
     await asyncio.sleep(2)

-    await scrape_content(page, pages, collect_media=collect_media)
+    await scrape_content(id, page, pages, collect_media=collect_media)

     cleared_site_map_dict = clear_done_actions(site_map_dict)

     if cleared_site_map_dict["actions"]:
         await handle_site_mapping(
-            cleared_site_map_dict, page, pages, collect_media=collect_media
+            id, cleared_site_map_dict, page, pages, collect_media=collect_media
         )

View File

@@ -58,6 +58,8 @@ class Job(pydantic.BaseModel):
     job_options: JobOptions
     status: str = "Queued"
     chat: Optional[str] = None
+    agent_mode: bool = False
+    prompt: Optional[str] = None

 class CronJob(pydantic.BaseModel):
class CronJob(pydantic.BaseModel): class CronJob(pydantic.BaseModel):

View File

@@ -10,7 +10,7 @@ import random

 # PDM
 from fastapi import Depends, APIRouter
 from fastapi.encoders import jsonable_encoder
-from fastapi.responses import JSONResponse, StreamingResponse
+from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
 from api.backend.scheduler import scheduler
 from apscheduler.triggers.cron import CronTrigger  # type: ignore
@@ -42,6 +42,8 @@ from api.backend.job.cron_scheduling.cron_scheduling import (
 from api.backend.job.utils.clean_job_format import clean_job_format
 from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results
+from api.backend.constants import MEDIA_DIR, MEDIA_TYPES, RECORDINGS_DIR

 LOG = logging.getLogger(__name__)
 job_router = APIRouter()
@@ -231,3 +233,41 @@ async def delete_cron_job_request(request: DeleteCronJob):
 async def get_cron_jobs_request(user: User = Depends(get_current_user)):
     cron_jobs = get_cron_jobs(user.email)
     return JSONResponse(content=jsonable_encoder(cron_jobs))
+
+
+@job_router.get("/recordings/{id}")
+async def get_recording(id: str):
+    path = RECORDINGS_DIR / f"{id}.mp4"
+
+    if not path.exists():
+        return JSONResponse(content={"error": "Recording not found."}, status_code=404)
+
+    return FileResponse(
+        path, headers={"Content-Type": "video/mp4", "Accept-Ranges": "bytes"}
+    )
+
+
+@job_router.get("/get-media")
+async def get_media(id: str):
+    try:
+        files: dict[str, list[str]] = {}
+
+        for media_type in MEDIA_TYPES:
+            path = MEDIA_DIR / media_type / f"{id}"
+            files[media_type] = [file.name for file in path.glob("*")]
+
+        return JSONResponse(content={"files": files})
+    except Exception as e:
+        LOG.error(f"Exception occurred: {e}")
+        traceback.print_exc()
+        return JSONResponse(content={"error": str(e)}, status_code=500)
+
+
+@job_router.get("/media")
+async def get_media_file(id: str, type: str, file: str):
+    path = MEDIA_DIR / type / f"{id}" / file
+
+    if not path.exists():
+        return JSONResponse(content={"error": "Media file not found."}, status_code=404)
+
+    return FileResponse(path)
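A sketch of a client hitting the three new endpoints; the base URL and job id are assumptions for a local setup, and auth is omitted:

import requests

BASE = "http://localhost:8000"  # assumed local API address
job_id = "job-123"              # hypothetical job id

# List media collected for a job, grouped by type.
files = requests.get(f"{BASE}/get-media", params={"id": job_id}).json()["files"]

# Fetch one image by name, if any were collected.
if files.get("images"):
    img = requests.get(
        f"{BASE}/media",
        params={"id": job_id, "type": "images", "file": files["images"][0]},
    )

# Stream the session recording, if one was captured.
rec = requests.get(f"{BASE}/recordings/{job_id}")
print(rec.status_code)  # 404 until a recording exists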

View File

@@ -9,11 +9,16 @@ from playwright.async_api import Page
 from urllib.parse import urlparse, urljoin

 from api.backend.models import Element, CapturedElement
-from api.backend.job.scraping.scraping_utils import scrape_content
+from api.backend.job.scraping.scraping_utils import (
+    clean_format_characters,
+    scrape_content,
+)
 from api.backend.job.site_mapping.site_mapping import handle_site_mapping
 from api.backend.job.scraping.add_custom import add_custom_items
+from api.backend.constants import RECORDINGS_ENABLED

 LOG = logging.getLogger(__name__)
@@ -37,6 +42,7 @@ def sxpath(context: etree._Element, xpath: str):
 async def make_site_request(
+    id: str,
     url: str,
     headers: Optional[dict[str, Any]],
     multi_page_scrape: bool = False,
@@ -57,8 +63,9 @@ async def make_site_request(
         proxy = random.choice(proxies)
         LOG.info(f"Using proxy: {proxy}")

-    async with AsyncCamoufox(headless=True, proxy=proxy) as browser:
+    async with AsyncCamoufox(headless=not RECORDINGS_ENABLED, proxy=proxy) as browser:
         page: Page = await browser.new_page()
+        await page.set_viewport_size({"width": 1920, "height": 1080})

         # Add cookies and headers
         await add_custom_items(url, page, custom_cookies, headers)
@@ -67,21 +74,21 @@ async def make_site_request(
         try:
             await page.goto(url, timeout=60000)
-            await page.wait_for_load_state("networkidle", timeout=10000)
+            await page.wait_for_load_state("networkidle")

             final_url = page.url
             visited_urls.add(url)
             visited_urls.add(final_url)

-            html_content = await scrape_content(page, pages, collect_media)
+            html_content = await scrape_content(id, page, pages, collect_media)

             html_content = await page.content()
             pages.add((html_content, final_url))

             if site_map:
                 await handle_site_mapping(
-                    site_map, page, pages, collect_media=collect_media
+                    id, site_map, page, pages, collect_media=collect_media
                 )
         finally:
@@ -108,6 +115,7 @@ async def make_site_request(
             if link not in visited_urls and is_same_domain(link, original_url):
                 await make_site_request(
+                    id,
                     link,
                     headers=headers,
                     multi_page_scrape=multi_page_scrape,
@@ -132,11 +140,13 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
         for e in el:  # type: ignore
             text = (
-                "\t".join(str(t) for t in e.itertext())
+                " ".join(str(t) for t in e.itertext())
                 if isinstance(e, etree._Element)
                 else str(e)  # type: ignore
             )
+            text = clean_format_characters(text)

             captured_element = CapturedElement(
                 xpath=elem.xpath, text=text, name=elem.name
             )
@@ -150,6 +160,7 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
 async def scrape(
+    id: str,
     url: str,
     xpaths: list[Element],
     headers: Optional[dict[str, Any]] = None,
@@ -163,6 +174,7 @@ async def scrape(
     pages: set[tuple[str, str]] = set()

     await make_site_request(
+        id,
         url,
         headers=headers,
         multi_page_scrape=multi_page_scrape,

View File

@@ -1,10 +1,12 @@
 import os
 import json
+from pathlib import Path

 from api.backend.job import get_queued_job, update_job
 from api.backend.scraping import scrape
 from api.backend.models import Element
 from fastapi.encoders import jsonable_encoder
+import subprocess
 import asyncio
 import traceback
@@ -14,6 +16,8 @@ from api.backend.database.startup import init_database
 from api.backend.worker.post_job_complete.post_job_complete import post_job_complete
 from api.backend.worker.logger import LOG
+from api.backend.ai.agent.agent import scrape_with_agent

 NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "")
 NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "")
@@ -26,14 +30,42 @@ SMTP_USER = os.getenv("SMTP_USER", "")
 SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
 USE_TLS = os.getenv("USE_TLS", "false").lower() == "true"

+RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
+RECORDINGS_DIR = Path("/project/app/media/recordings")
+

 async def process_job():
     job = await get_queued_job()
+    ffmpeg_proc = None
     status = "Queued"

     if job:
         LOG.info(f"Beginning processing job: {job}.")

         try:
+            output_path = RECORDINGS_DIR / f"{job['id']}.mp4"
+
+            if RECORDINGS_ENABLED:
+                ffmpeg_proc = subprocess.Popen(
+                    [
+                        "ffmpeg",
+                        "-y",
+                        "-video_size",
+                        "1280x1024",
+                        "-framerate",
+                        "15",
+                        "-f",
+                        "x11grab",
+                        "-i",
+                        ":99",
+                        "-codec:v",
+                        "libx264",
+                        "-preset",
+                        "ultrafast",
+                        output_path,
+                    ]
+                )
+
             _ = await update_job([job["id"]], field="status", value="Scraping")

             proxies = job["job_options"]["proxies"]
@@ -45,16 +77,21 @@ async def process_job():
                 LOG.error(f"Failed to parse proxy JSON: {proxies}")
                 proxies = []

-            scraped = await scrape(
-                job["url"],
-                [Element(**j) for j in job["elements"]],
-                job["job_options"]["custom_headers"],
-                job["job_options"]["multi_page_scrape"],
-                proxies,
-                job["job_options"]["site_map"],
-                job["job_options"]["collect_media"],
-                job["job_options"]["custom_cookies"],
-            )
+            if job["agent_mode"]:
+                scraped = await scrape_with_agent(job)
+            else:
+                scraped = await scrape(
+                    job["id"],
+                    job["url"],
+                    [Element(**j) for j in job["elements"]],
+                    job["job_options"]["custom_headers"],
+                    job["job_options"]["multi_page_scrape"],
+                    proxies,
+                    job["job_options"]["site_map"],
+                    job["job_options"]["collect_media"],
+                    job["job_options"]["custom_cookies"],
+                )

             LOG.info(
                 f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
             )
@@ -87,12 +124,18 @@ async def process_job():
             },
         )

+        if ffmpeg_proc:
+            ffmpeg_proc.terminate()
+            ffmpeg_proc.wait()
+

 async def main():
     LOG.info("Starting job worker...")
     init_database()

+    RECORDINGS_DIR.mkdir(parents=True, exist_ok=True)
+
     while True:
         await process_job()
         await asyncio.sleep(5)
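For reference, the capture the worker starts can be reproduced standalone; this sketch assumes an X server on display :99 (the Xvfb display from the API image) and ffmpeg on PATH:

import subprocess
import time
from pathlib import Path

output_path = Path("media/recordings/demo.mp4")
output_path.parent.mkdir(parents=True, exist_ok=True)

# Mirrors the worker's invocation: grab the X display at 15 fps into H.264.
proc = subprocess.Popen([
    "ffmpeg", "-y",
    "-video_size", "1280x1024",
    "-framerate", "15",
    "-f", "x11grab",
    "-i", ":99",
    "-codec:v", "libx264",
    "-preset", "ultrafast",
    str(output_path),
])

time.sleep(5)     # record for a few seconds
proc.terminate()  # same shutdown the worker performs
proc.wait()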

View File

@@ -30,5 +30,59 @@ describe.only("Job", () => {
       "exist"
     );
     cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
+
+    cy.get("tbody tr")
+      .first()
+      .within(() => {
+        cy.get('input[type="checkbox"]').click();
+      });
+
+    cy.get("[data-testid='DeleteIcon']").click();
+
+    cy.contains("div", "https://example.com", { timeout: 10000 }).should(
+      "not.exist"
+    );
+  });
+
+  it("should create a job with advanced options (media)", () => {
+    cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
+
+    cy.visit("/");
+
+    cy.get("button").contains("Advanced Job Options").click();
+    cy.get('[data-cy="collect-media-checkbox"]').click();
+    cy.get("body").type("{esc}");
+
+    cy.get('[data-cy="url-input"]').type("https://books.toscrape.com");
+    cy.get('[data-cy="name-field"]').type("example");
+    cy.get('[data-cy="xpath-field"]').type("//body");
+    cy.get('[data-cy="add-button"]').click();
+
+    cy.get("button").contains("Submit").click();
+
+    cy.get("li").contains("Jobs").click();
+    cy.contains("div", "https://books.toscrape.com", { timeout: 10000 }).should(
+      "exist"
+    );
+    cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
+
+    cy.get("li").contains("Media").click();
+    cy.get("div[id='select-job']").click();
+    cy.get("li[role='option']").click();
+    cy.get("[data-testid='media-grid']", { timeout: 10000 }).should("exist");
+
+    cy.get("li").contains("Jobs").click();
+    cy.get("tbody tr")
+      .first()
+      .within(() => {
+        cy.get('input[type="checkbox"]').click();
+      });
+    cy.get("[data-testid='DeleteIcon']").click();
   });
 });

View File

@@ -1,6 +1,9 @@
version: "3" version: "3"
services: services:
scraperr: scraperr:
build:
context: .
dockerfile: docker/frontend/Dockerfile
command: ["npm", "run", "dev"] command: ["npm", "run", "dev"]
volumes: volumes:
- "$PWD/src:/app/src" - "$PWD/src:/app/src"
@@ -10,7 +13,12 @@ services:
- "$PWD/package-lock.json:/app/package-lock.json" - "$PWD/package-lock.json:/app/package-lock.json"
- "$PWD/tsconfig.json:/app/tsconfig.json" - "$PWD/tsconfig.json:/app/tsconfig.json"
scraperr_api: scraperr_api:
build:
context: .
dockerfile: docker/api/Dockerfile
environment: environment:
- LOG_LEVEL=INFO - LOG_LEVEL=INFO
volumes: volumes:
- "$PWD/api:/project/app/api" - "$PWD/api:/project/app/api"
ports:
- "5900:5900"

View File

@@ -1,11 +1,6 @@
 services:
   scraperr:
-    depends_on:
-      - scraperr_api
-    image: jpyles0524/scraperr:1.0.13
-    build:
-      context: .
-      dockerfile: docker/frontend/Dockerfile
+    image: jpyles0524/scraperr:latest
     container_name: scraperr
     command: ["npm", "run", "start"]
     environment:
@@ -18,9 +13,6 @@ services:
   scraperr_api:
     init: True
     image: jpyles0524/scraperr_api:latest
-    build:
-      context: .
-      dockerfile: docker/api/Dockerfile
     environment:
       - LOG_LEVEL=INFO
     container_name: scraperr_api

View File

@@ -3,7 +3,7 @@ FROM python:3.10.12-slim as pybuilder
 RUN apt-get update && \
     apt-get install -y curl && \
-    apt-get install -y uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 && \
+    apt-get install -y x11vnc xvfb uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 ffmpeg && \
     curl -LsSf https://astral.sh/uv/install.sh | sh && \
     apt-get remove -y curl && \
     apt-get autoremove -y && \
@@ -14,7 +14,8 @@ RUN pdm config python.use_venv false
 WORKDIR /project/app
 COPY pyproject.toml pdm.lock /project/app/
-RUN pdm install
+
+RUN pdm install -v --frozen-lockfile
 RUN pdm run playwright install --with-deps
@@ -30,7 +31,12 @@ EXPOSE 8000
 WORKDIR /project/app

+RUN mkdir -p /project/app/media
 RUN mkdir -p /project/app/data
 RUN touch /project/app/data/database.db

+EXPOSE 5900
+
+COPY start.sh /project/app/start.sh
+
 CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ]

View File

@@ -1,10 +1,14 @@
# Build next dependencies # Build next dependencies
FROM node:23.1 FROM node:23.1-slim
WORKDIR /app WORKDIR /app
COPY package*.json ./ # Copy package files first to leverage Docker cache
RUN npm install COPY package.json yarn.lock ./
# Install dependencies in a separate layer
RUN yarn install --frozen-lockfile
# Copy the rest of the application
COPY tsconfig.json /app/tsconfig.json COPY tsconfig.json /app/tsconfig.json
COPY tailwind.config.js /app/tailwind.config.js COPY tailwind.config.js /app/tailwind.config.js
COPY next.config.mjs /app/next.config.mjs COPY next.config.mjs /app/next.config.mjs
@@ -13,6 +17,7 @@ COPY postcss.config.js /app/postcss.config.js
COPY public /app/public COPY public /app/public
COPY src /app/src COPY src /app/src
RUN npm run build # Build the application
RUN yarn build
EXPOSE 3000 EXPOSE 3000

Binary image file changed, not shown (before: 47 KiB, after: 48 KiB).

View File

@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.0.14
+version: 1.1.0

 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to

package-lock.json (generated, 11371 lines changed)

File diff suppressed because it is too large.

View File

@@ -12,9 +12,11 @@
     "@minchat/react-chat-ui": "^0.16.2",
     "@mui/icons-material": "^5.15.3",
     "@mui/material": "^5.16.0",
+    "@reduxjs/toolkit": "^2.8.2",
     "@testing-library/jest-dom": "^5.16.5",
     "@testing-library/react": "^13.4.0",
     "@testing-library/user-event": "^13.5.0",
+    "@types/react": "^18.3.21",
     "axios": "^1.7.2",
     "bootstrap": "^5.3.0",
     "chart.js": "^4.4.3",
@@ -30,16 +32,18 @@
     "react-dom": "^18.3.1",
     "react-markdown": "^9.0.0",
     "react-modal-image": "^2.6.0",
+    "react-redux": "^9.2.0",
     "react-router": "^6.14.1",
     "react-router-dom": "^6.14.1",
     "react-spinners": "^0.14.1",
+    "redux-persist": "^6.0.0",
     "typescript": "^4.9.5",
     "web-vitals": "^2.1.4"
   },
   "scripts": {
-    "dev": "next dev",
-    "build": "next build",
-    "start": "next start",
+    "dev": "yarn next dev",
+    "build": "yarn next build",
+    "start": "yarn next start",
     "serve": "serve -s ./dist",
     "cy:open": "cypress open",
     "cy:run": "cypress run"

pdm.lock (generated, 13 lines changed)
View File

@@ -5,7 +5,7 @@
 groups = ["default", "dev"]
 strategy = ["inherit_metadata"]
 lock_version = "4.5.0"
-content_hash = "sha256:cb37fedd6d022515dde14e475588a8da2144ba22e41dfdfacfe3f7a7d14486ca"
+content_hash = "sha256:5f4c90b42c3b35194a7c2af8b46b7c28127e25e836a779e85aae0df2bd0e69eb"

 [[metadata.targets]]
 requires_python = ">=3.10"
@@ -1174,6 +1174,17 @@ files = [
     {file = "hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca"},
 ]

+[[package]]
+name = "html2text"
+version = "2025.4.15"
+requires_python = ">=3.9"
+summary = "Turn HTML into equivalent Markdown-structured text."
+groups = ["default"]
+files = [
+    {file = "html2text-2025.4.15-py3-none-any.whl", hash = "sha256:00569167ffdab3d7767a4cdf589b7f57e777a5ed28d12907d8c58769ec734acc"},
+    {file = "html2text-2025.4.15.tar.gz", hash = "sha256:948a645f8f0bc3abe7fd587019a2197a12436cd73d0d4908af95bfc8da337588"},
+]
+
 [[package]]
 name = "httpcore"
 version = "1.0.9"

View File

@@ -41,6 +41,7 @@ dependencies = [
     "apscheduler>=3.11.0",
     "playwright>=1.52.0",
     "camoufox>=0.4.11",
+    "html2text>=2025.4.15",
 ]
 requires-python = ">=3.10"
 readme = "README.md"

View File

@@ -1,17 +1,23 @@
import React, { useState, useEffect, Dispatch, useRef } from "react"; import React, { useState, Dispatch, useEffect } from "react";
import { Job } from "../../types"; import { Job } from "../../types";
import { fetchJobs } from "../../lib";
import Box from "@mui/material/Box"; import Box from "@mui/material/Box";
import InputLabel from "@mui/material/InputLabel"; import InputLabel from "@mui/material/InputLabel";
import FormControl from "@mui/material/FormControl"; import FormControl from "@mui/material/FormControl";
import Select from "@mui/material/Select"; import Select from "@mui/material/Select";
import Popover from "@mui/material/Popover"; import Popover from "@mui/material/Popover";
import { Typography, MenuItem, useTheme } from "@mui/material"; import {
Typography,
MenuItem,
useTheme,
ClickAwayListener,
} from "@mui/material";
import { SxProps } from "@mui/material"; import { SxProps } from "@mui/material";
interface Props { interface Props {
sxProps: SxProps; sxProps?: SxProps;
setSelectedJob: Dispatch<React.SetStateAction<Job | null>>; setSelectedJob:
| Dispatch<React.SetStateAction<Job | null>>
| ((job: Job) => void);
selectedJob: Job | null; selectedJob: Job | null;
setJobs: Dispatch<React.SetStateAction<Job[]>>; setJobs: Dispatch<React.SetStateAction<Job[]>>;
jobs: Job[]; jobs: Job[];
@@ -43,6 +49,12 @@ export const JobSelector = ({
const open = Boolean(anchorEl); const open = Boolean(anchorEl);
useEffect(() => {
if (!open) {
setAnchorEl(null);
}
}, [open]);
return ( return (
<Box sx={sxProps}> <Box sx={sxProps}>
<FormControl fullWidth> <FormControl fullWidth>
@@ -55,9 +67,11 @@ export const JobSelector = ({
value={selectedJob?.id || ""} value={selectedJob?.id || ""}
label="Job" label="Job"
onChange={(e) => { onChange={(e) => {
setSelectedJob( const job = jobs.find((job) => job.id === e.target.value);
jobs.find((job) => job.id === e.target.value) || null
); if (job) {
setSelectedJob(job);
}
}} }}
> >
{jobs.map((job) => ( {jobs.map((job) => (
@@ -77,57 +91,63 @@ export const JobSelector = ({
</> </>
) : null} ) : null}
</FormControl> </FormControl>
<Popover
id="mouse-over-popover" {open && (
sx={{ <ClickAwayListener onClickAway={handlePopoverClose}>
pointerEvents: "none", <Popover
padding: 0, id="mouse-over-popover"
}}
open={open}
anchorEl={anchorEl}
anchorOrigin={{
vertical: "bottom",
horizontal: "left",
}}
transformOrigin={{
vertical: "top",
horizontal: "left",
}}
onClose={handlePopoverClose}
>
{popoverJob && (
<Box
sx={{ sx={{
border: pointerEvents: "none",
theme.palette.mode === "light" padding: 0,
? "2px solid black"
: "2px solid white",
}} }}
open={open}
anchorEl={anchorEl}
anchorOrigin={{
vertical: "bottom",
horizontal: "left",
}}
transformOrigin={{
vertical: "top",
horizontal: "left",
}}
onClose={handlePopoverClose}
> >
<Typography {popoverJob && (
variant="body1" <Box
sx={{ paddingLeft: 1, paddingRight: 1 }}
>
{popoverJob.url}
</Typography>
<div className="flex flex-row w-full justify-end mb-1">
<Typography
variant="body2"
sx={{ sx={{
paddingLeft: 1, border:
paddingRight: 1, theme.palette.mode === "light"
color: theme.palette.mode === "dark" ? "#d3d7e6" : "#5b5d63", ? "2px solid black"
fontStyle: "italic", : "2px solid white",
}} }}
> >
{popoverJob.time_created <Typography
? new Date(popoverJob.time_created).toLocaleString() variant="body1"
: "Unknown"} sx={{ paddingLeft: 1, paddingRight: 1 }}
</Typography> >
</div> {popoverJob.url}
</Box> </Typography>
)} <div className="flex flex-row w-full justify-end mb-1">
</Popover> <Typography
variant="body2"
sx={{
paddingLeft: 1,
paddingRight: 1,
color:
theme.palette.mode === "dark" ? "#d3d7e6" : "#5b5d63",
fontStyle: "italic",
}}
>
{popoverJob.time_created
? new Date(popoverJob.time_created).toLocaleString()
: "Unknown"}
</Typography>
</div>
</Box>
)}
</Popover>
</ClickAwayListener>
)}
</Box> </Box>
); );
}; };

View File

@@ -6,11 +6,13 @@ import { RawJobOptions } from "@/types";
 export type AdvancedJobOptionsProps = {
   jobOptions: RawJobOptions;
   setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
+  multiPageScrapeEnabled?: boolean;
 };

 export const AdvancedJobOptions = ({
   jobOptions,
   setJobOptions,
+  multiPageScrapeEnabled = true,
 }: AdvancedJobOptionsProps) => {
   const [open, setOpen] = useState(false);
   return (
@@ -39,6 +41,7 @@ export const AdvancedJobOptions = ({
         onClose={() => setOpen(false)}
         jobOptions={jobOptions}
         setJobOptions={setJobOptions}
+        multiPageScrapeEnabled={multiPageScrapeEnabled}
       />
     </Box>
   );

View File

@@ -32,6 +32,7 @@ export type AdvancedJobOptionsDialogProps = {
   onClose: () => void;
   jobOptions: RawJobOptions;
   setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
+  multiPageScrapeEnabled?: boolean;
 };

 export const AdvancedJobOptionsDialog = ({
@@ -39,6 +40,7 @@ export const AdvancedJobOptionsDialog = ({
   onClose,
   jobOptions,
   setJobOptions,
+  multiPageScrapeEnabled = true,
 }: AdvancedJobOptionsDialogProps) => {
   const theme = useTheme();
   const handleMultiPageScrapeChange = () => {
@@ -122,12 +124,19 @@ export const AdvancedJobOptionsDialog = ({
             <Checkbox
               checked={jobOptions.multi_page_scrape}
               onChange={handleMultiPageScrapeChange}
+              disabled={!multiPageScrapeEnabled}
             />
           }
           label={
             <Box sx={{ display: "flex", alignItems: "center" }}>
               <Typography>Multi Page Scrape</Typography>
-              <Tooltip title="Enable crawling through multiple pages">
+              <Tooltip
+                title={
+                  multiPageScrapeEnabled
+                    ? "Enable crawling through multiple pages"
+                    : "Multi page scrape is disabled"
+                }
+              >
                 <IconButton size="small">
                   <InfoOutlined fontSize="small" />
                 </IconButton>
@@ -140,6 +149,7 @@ export const AdvancedJobOptionsDialog = ({
             <Checkbox
               checked={jobOptions.collect_media}
               onChange={handleCollectMediaChange}
+              data-cy="collect-media-checkbox"
             />
           }
           label={

View File

@@ -131,8 +131,9 @@ export const CsvTable: React.FC<CsvTableProps> = ({ csv, className }) => {
           <Typography variant="body2" color="text.secondary">
             {row.text
               ? row.text
-                  .replace(/(\r\n|\n|\r)/g, " ")
-                  .replace(/\t/g, " ")
+                  .replace(/[\n\t\r]+/g, " ")
+                  .replace(/\s+/g, " ")
+                  .trim()
               : "No text available"}
           </Typography>
         </Paper>

View File

@@ -0,0 +1,29 @@
import { Box } from "@mui/material";
export type DisabledProps = {
message: string;
};
export const Disabled = ({ message }: DisabledProps) => {
return (
<Box
bgcolor="background.default"
minHeight="100vh"
display="flex"
justifyContent="center"
alignItems="center"
>
<h4
style={{
color: "#fff",
padding: "20px",
borderRadius: "8px",
background: "rgba(0, 0, 0, 0.6)",
boxShadow: "0 4px 8px rgba(0, 0, 0, 0.2)",
}}
>
{message}
</h4>
</Box>
);
};

View File

@@ -0,0 +1 @@
export * from "./disabled";

View File

@@ -0,0 +1,40 @@
import { Box, Typography } from "@mui/material";
interface AudioViewerProps {
mediaUrl: string;
selectedMedia: string;
onError: () => void;
}
export const AudioViewer = ({
mediaUrl,
selectedMedia,
onError,
}: AudioViewerProps) => {
return (
<Box
sx={{
display: "flex",
justifyContent: "center",
alignItems: "center",
flexDirection: "column",
height: "100%",
gap: 2,
}}
>
<Typography variant="h6">{selectedMedia}</Typography>
<audio
controls
onError={onError}
style={{
width: "80%",
maxWidth: "500px",
}}
>
<source src={mediaUrl} type="audio/mpeg" />
Your browser does not support the audio element.
</audio>
</Box>
);
};

View File

@@ -0,0 +1 @@
export * from "./audio-viewer";

View File

@@ -0,0 +1,36 @@
import { Box, useTheme } from "@mui/material";
export const ImageViewer = ({
mediaUrl,
selectedMedia,
}: {
mediaUrl: string;
selectedMedia: string;
}) => {
const theme = useTheme();
return (
<Box
sx={{
display: "flex",
justifyContent: "center",
alignItems: "center",
height: "100%",
width: "100%",
overflow: "hidden",
position: "relative",
}}
>
<img
src={mediaUrl}
alt={selectedMedia}
style={{
maxHeight: "100%",
maxWidth: "100%",
objectFit: "contain",
borderRadius: "4px",
boxShadow: theme.shadows[4],
}}
/>
</Box>
);
};

View File

@@ -0,0 +1 @@
export * from "./image-viewer";

View File

@@ -0,0 +1 @@
export * from "./media-viewer";

View File

@@ -0,0 +1,75 @@
import { Box, Typography } from "@mui/material";
import { ImageViewer } from "./image";
import { VideoViewer } from "./video";
import { AudioViewer } from "./audio";
import { PDFViewer } from "./pdf-viewer";
interface MediaViewerProps {
selectedMedia: string;
activeTab: string;
getMediaUrl: (fileName: string) => string;
onError: (error: string) => void;
}
export const MediaViewer = ({
selectedMedia,
activeTab,
getMediaUrl,
onError,
}: MediaViewerProps) => {
if (!selectedMedia) {
return (
<Box
sx={{
display: "flex",
justifyContent: "center",
alignItems: "center",
height: "100%",
}}
>
<Typography variant="body1" color="textSecondary">
Select a file to view
</Typography>
</Box>
);
}
const mediaUrl = getMediaUrl(selectedMedia);
switch (activeTab) {
case "images":
return <ImageViewer mediaUrl={mediaUrl} selectedMedia={selectedMedia} />;
case "videos":
return (
<VideoViewer
mediaUrl={mediaUrl}
onError={() => onError("Error loading video")}
/>
);
case "audio":
return (
<AudioViewer
mediaUrl={mediaUrl}
selectedMedia={selectedMedia}
onError={() => onError("Error loading audio")}
/>
);
case "pdfs":
return <PDFViewer mediaUrl={mediaUrl} selectedMedia={selectedMedia} />;
default:
return (
<Box
sx={{
display: "flex",
justifyContent: "center",
alignItems: "center",
height: "100%",
}}
>
<Typography variant="body1">
{selectedMedia} - Download this file to view it
</Typography>
</Box>
);
}
};

View File

@@ -0,0 +1 @@
export * from "./pdf-viewer";

View File

@@ -0,0 +1,33 @@
import { Box, useTheme } from "@mui/material";
interface PDFViewerProps {
mediaUrl: string;
selectedMedia: string;
}
export const PDFViewer = ({ mediaUrl, selectedMedia }: PDFViewerProps) => {
const theme = useTheme();
return (
<Box
sx={{
width: "100%",
height: "100%",
overflow: "hidden",
borderRadius: 1,
}}
>
<iframe
src={`${mediaUrl}#view=fitH`}
style={{
width: "100%",
height: "100%",
border: "none",
borderRadius: "4px",
boxShadow: theme.shadows[4],
}}
title={selectedMedia}
/>
</Box>
);
};

View File

@@ -0,0 +1 @@
export * from "./tile-grid-view";

View File

@@ -0,0 +1,114 @@
import { MediaFiles } from "@/components/pages/media/id/id";
import {
Card,
CardActionArea,
CardMedia,
CardContent,
Typography,
Box,
Grid,
useTheme,
} from "@mui/material";
interface TileGridViewProps {
mediaFiles: MediaFiles;
activeTab: string;
selectedMedia: string;
handleMediaSelect: (fileName: string) => void;
getMediaUrl: (fileName: string) => string;
}
export const TileGridView = ({
mediaFiles,
activeTab,
selectedMedia,
handleMediaSelect,
getMediaUrl,
}: TileGridViewProps) => {
const theme = useTheme();
return (
<Grid container spacing={2} sx={{ p: 2 }} data-testid="media-grid">
{mediaFiles[activeTab].map((fileName: string) => (
<Grid item xs={6} sm={4} md={3} lg={2} key={fileName}>
<Card
sx={{
height: "100%",
display: "flex",
flexDirection: "column",
borderColor:
selectedMedia === fileName
? theme.palette.primary.main
: "transparent",
borderWidth: 2,
borderStyle: "solid",
transition: "all 0.2s",
"&:hover": {
transform: "translateY(-4px)",
boxShadow: theme.shadows[6],
},
}}
>
<CardActionArea onClick={() => handleMediaSelect(fileName)}>
<CardMedia
component="div"
sx={{
pt: "75%",
position: "relative",
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[100]
: theme.palette.grey[800],
display: "flex",
justifyContent: "center",
alignItems: "center",
}}
>
{activeTab === "images" ? (
<Box
component="img"
src={getMediaUrl(fileName)}
alt={fileName}
sx={{
position: "absolute",
top: 0,
left: 0,
width: "100%",
height: "100%",
objectFit: "contain",
p: 1,
}}
onError={(e) => {
const target = e.target as HTMLImageElement;
if (target.src !== "/placeholder-image.png") {
target.src = "";
}
}}
/>
) : (
<Typography
variant="body2"
color="textSecondary"
sx={{
position: "absolute",
top: "50%",
left: "50%",
transform: "translate(-50%, -50%)",
}}
>
{fileName.split(".").pop()?.toUpperCase() || "FILE"}
</Typography>
)}
</CardMedia>
<CardContent sx={{ flexGrow: 1, p: 1 }}>
<Typography variant="body2" noWrap title={fileName}>
{fileName}
</Typography>
</CardContent>
</CardActionArea>
</Card>
</Grid>
))}
</Grid>
);
};

View File

@@ -0,0 +1 @@
export * from "./video-viewer";

View File

@@ -0,0 +1,39 @@
import { Box, useTheme } from "@mui/material";
export const VideoViewer = ({
mediaUrl,
onError,
}: {
mediaUrl: string;
onError: () => void;
}) => {
const theme = useTheme();
return (
<Box
sx={{
width: "100%",
height: "100%",
display: "flex",
justifyContent: "center",
alignItems: "center",
overflow: "hidden",
borderRadius: 1,
}}
>
<video
className="h-full w-full object-contain"
controls
onError={onError}
style={{
maxHeight: "100%",
maxWidth: "100%",
borderRadius: "4px",
boxShadow: theme.shadows[4],
}}
>
<source src={mediaUrl} type="video/mp4" />
Your browser does not support the video tag.
</video>
</Box>
);
};

View File

@@ -3,11 +3,10 @@ import { NavItem } from "../nav-item";
import HomeIcon from "@mui/icons-material/Home";
import HttpIcon from "@mui/icons-material/Http";
- import TerminalIcon from "@mui/icons-material/Terminal";
import BarChart from "@mui/icons-material/BarChart";
import AutoAwesomeIcon from "@mui/icons-material/AutoAwesome";
import { List } from "@mui/material";
- import { Schedule } from "@mui/icons-material";
+ import { Folder, Person, Schedule, VideoFile } from "@mui/icons-material";

const items = [
{
@@ -20,6 +19,11 @@ const items = [
text: "Jobs",
href: "/jobs",
},
+ {
+ icon: <Person />,
+ text: "Agent",
+ href: "/agent",
+ },
{
icon: <AutoAwesomeIcon />,
text: "Chat",
@@ -35,6 +39,16 @@ const items = [
text: "Cron Jobs",
href: "/cron-jobs",
},
+ {
+ icon: <VideoFile />,
+ text: "Recordings",
+ href: "/recordings",
+ },
+ {
+ icon: <Folder />,
+ text: "Media",
+ href: "/media",
+ },
];

export const NavItems = () => {

View File

@@ -7,20 +7,15 @@ import {
TableHead,
TableRow,
Box,
- Typography,
- Accordion,
- AccordionSummary,
- AccordionDetails,
Checkbox,
Button,
Tooltip,
IconButton,
TableContainer,
} from "@mui/material";
- import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
import StarIcon from "@mui/icons-material/Star";
import { Job } from "../../types";
- import { AutoAwesome } from "@mui/icons-material";
+ import { AutoAwesome, Image, VideoCameraBack } from "@mui/icons-material";
import { useRouter } from "next/router";

interface stringMap {
@@ -59,7 +54,7 @@ export const JobQueue = ({
<Table sx={{ tableLayout: "fixed", width: "100%" }}>
<TableHead>
<TableRow>
- <TableCell>Select</TableCell>
+ <TableCell sx={{ width: "280px" }}>Select</TableCell>
<TableCell>Id</TableCell>
<TableCell>Url</TableCell>
<TableCell>Elements</TableCell>
@@ -72,7 +67,7 @@ export const JobQueue = ({
<TableBody sx={{ overflow: "auto" }}>
{filteredJobs.map((row, index) => (
<TableRow key={index}>
- <TableCell padding="checkbox">
+ <TableCell padding="checkbox" sx={{ width: "280px" }}>
<Checkbox
checked={selectedJobs.has(row.id)}
onChange={() => onSelectJob(row.id)}
@@ -106,12 +101,47 @@ export const JobQueue = ({
</IconButton>
</span>
</Tooltip>
+ <Tooltip title="View Recording">
+ <span>
+ <IconButton
+ onClick={() => {
+ router.push({
+ pathname: "/recordings",
+ query: {
+ id: row.id,
+ },
+ });
+ }}
+ >
+ <VideoCameraBack />
+ </IconButton>
+ </span>
+ </Tooltip>
+ {row.job_options.collect_media && (
+ <Tooltip title="View Media">
+ <span>
+ <IconButton
+ onClick={() => {
+ router.replace(`/media?id=${row.id}`);
+ }}
+ >
+ <Image />
+ </IconButton>
+ </span>
+ </Tooltip>
+ )}
</TableCell>
- <TableCell sx={{ maxWidth: 100, overflow: "auto" }}>
+ <TableCell
+ sx={{
+ maxWidth: 100,
+ overflow: "auto",
+ }}
+ >
<Box
sx={{
maxHeight: 100,
overflow: "auto",
+ paddingTop: 1,
}}
>
{row.id}
@@ -122,7 +152,7 @@ export const JobQueue = ({
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
- {JSON.stringify(row.elements)}
+ {row.agent_mode ? "Agent Mode" : JSON.stringify(row.elements)}
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto", padding: 0 }}>
@@ -151,7 +181,7 @@ export const JobQueue = ({
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 50, overflow: "auto" }}>
- <Box sx={{ maxHeight: 100, overflow: "auto" }}>
+ <Box sx={{ maxWidth: 100, maxHeight: 100, overflow: "auto" }}>
<Box
className="rounded-md p-2 text-center"
sx={{ bgcolor: colors[row.status] }}
@@ -176,9 +206,19 @@ export const JobQueue = ({
Download
</Button>
<Button
- onClick={() =>
- onNavigate(row.elements, row.url, row.job_options)
- }
+ onClick={() => {
+ if (row.agent_mode) {
+ router.push({
+ pathname: "/agent",
+ query: {
+ url: row.url,
+ prompt: row.prompt,
+ },
+ });
+ } else {
+ onNavigate(row.elements, row.url, row.job_options);
+ }
+ }}
size="small"
sx={{
minWidth: 0,

View File

@@ -0,0 +1,228 @@
import { validateURL } from "@/lib/helpers/validate-url";
import { ApiService } from "@/services";
import {
Box,
Button,
Divider,
Snackbar,
Alert,
TextField,
Typography,
useTheme,
} from "@mui/material";
import { useEffect, useState } from "react";
import { useRouter } from "next/router";
import { AdvancedJobOptions } from "@/components/common/advanced-job-options";
import { useAdvancedJobOptions } from "@/lib/hooks/use-advanced-job-options/use-advanced-job-options";
import { checkAI } from "@/lib";
import { Disabled } from "@/components/common/disabled/disabled";
export const Agent = () => {
const [url, setUrl] = useState("");
const [prompt, setPrompt] = useState("");
const [urlError, setUrlError] = useState<string | null>(null);
const [aiEnabled, setAiEnabled] = useState(false);
const [snackbarMessage, setSnackbarMessage] = useState("");
const [snackbarSeverity, setSnackbarSeverity] = useState<
"success" | "error" | "info" | "warning"
>("info");
const [snackbarOpen, setSnackbarOpen] = useState(false);
const router = useRouter();
const { jobOptions, setJobOptions } = useAdvancedJobOptions();
const theme = useTheme();
useEffect(() => {
if (router.query.url) {
setUrl(router.query.url as string);
}
if (router.query.prompt) {
setPrompt(router.query.prompt as string);
}
}, [router.query.url, router.query.prompt]);
useEffect(() => {
checkAI(setAiEnabled);
}, []);
const handleCloseSnackbar = () => {
setSnackbarOpen(false);
};
const ErrorSnackbar = () => {
return (
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={handleCloseSnackbar}
>
<Alert onClose={handleCloseSnackbar} severity="error">
{snackbarMessage}
</Alert>
</Snackbar>
);
};
const NotifySnackbar = () => {
const goTo = () => {
router.push("/jobs");
};
const action = (
<Button color="inherit" size="small" onClick={goTo}>
Go To Job
</Button>
);
return (
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={handleCloseSnackbar}
>
<Alert onClose={handleCloseSnackbar} severity="info" action={action}>
{snackbarMessage}
</Alert>
</Snackbar>
);
};
const handleSubmit = async () => {
if (!validateURL(url)) {
setUrlError("Please enter a valid URL.");
return;
}
setUrlError(null);
await ApiService.submitJob(
url,
[],
"",
{
collect_media: jobOptions.collect_media,
multi_page_scrape: jobOptions.multi_page_scrape,
},
jobOptions.custom_headers,
jobOptions.custom_cookies,
null,
true,
prompt
)
.then(async (response) => {
if (!response.ok) {
return response.json().then((error) => {
throw new Error(error.error);
});
}
return response.json();
})
.then((data) => {
setSnackbarMessage(`Agent job: ${data.id} submitted successfully.`);
setSnackbarSeverity("info");
setSnackbarOpen(true);
})
.catch((error) => {
setSnackbarMessage(error?.message || "An error occurred.");
setSnackbarSeverity("error");
setSnackbarOpen(true);
});
};
if (!aiEnabled) {
return (
<Disabled message="Must set either OPENAI_KEY or OLLAMA_MODEL to use AI features." />
);
}
return (
<Box
sx={{
minHeight: "100vh",
display: "flex",
alignItems: "center",
justifyContent: "center",
background: theme.palette.background.default,
p: 4,
}}
>
<Box
sx={{
backgroundColor: theme.palette.background.paper,
borderRadius: 4,
boxShadow: 6,
p: 4,
width: "100%",
maxWidth: 800,
display: "flex",
flexDirection: "column",
gap: "1rem",
}}
>
<Typography variant="h3" sx={{ textAlign: "center", fontWeight: 600 }}>
Agent Mode
</Typography>
<Typography
variant="body1"
sx={{ textAlign: "center", color: "text.secondary" }}
>
Use AI to scrape a website
</Typography>
<Divider />
<Typography variant="body1" sx={{ fontWeight: 500 }}>
Website URL
</Typography>
<TextField
value={url}
onChange={(e) => setUrl(e.target.value)}
error={!!urlError}
helperText={urlError}
autoComplete="agent-url"
fullWidth
placeholder="https://www.example.com"
variant="outlined"
size="small"
/>
<Typography variant="body1" sx={{ fontWeight: 500, marginBottom: 0 }}>
Prompt
</Typography>
<TextField
value={prompt}
onChange={(e) => setPrompt(e.target.value)}
autoComplete="agent-prompt"
fullWidth
placeholder="Collect all the links on the page"
variant="outlined"
size="small"
/>
<Box
sx={{
display: "flex",
gap: 2,
alignItems: "center",
justifyContent: "space-between",
flexWrap: "wrap",
}}
>
<AdvancedJobOptions
jobOptions={jobOptions}
setJobOptions={setJobOptions}
multiPageScrapeEnabled={false}
/>
<Button
variant="contained"
color="primary"
onClick={handleSubmit}
sx={{ minWidth: 120 }}
>
Submit
</Button>
</Box>
{snackbarSeverity === "info" ? <NotifySnackbar /> : <ErrorSnackbar />}
</Box>
</Box>
);
};

View File

@@ -0,0 +1 @@
export * from "./agent";

View File

@@ -11,7 +11,7 @@ import {
import { JobSelector } from "../../ai";
import { Job, Message } from "../../../types";
import { useSearchParams } from "next/navigation";
- import { checkAI, fetchJob, fetchJobs, updateJob } from "../../../lib";
+ import { fetchJob, fetchJobs, updateJob, checkAI } from "../../../lib";
import SendIcon from "@mui/icons-material/Send";
import EditNoteIcon from "@mui/icons-material/EditNote";

View File

@@ -0,0 +1,392 @@
import { JobSelector } from "@/components/ai";
import { fetchJobs } from "@/lib";
import { Job } from "@/types";
import {
Box,
useTheme,
Typography,
CircularProgress,
Alert,
Paper,
Tabs,
Tab,
} from "@mui/material";
import { useRouter, useSearchParams } from "next/navigation";
import { useState, useEffect } from "react";
import { TileGridView } from "@/components/common/media-viewer/tile-grid-view";
import { MediaViewer } from "@/components/common/media-viewer";
export interface MediaFiles {
audio: string[];
documents: string[];
images: string[];
pdfs: string[];
presentations: string[];
spreadsheets: string[];
videos: string[];
[key: string]: string[];
}
export const MediaId = () => {
const searchParams = useSearchParams();
const theme = useTheme();
const router = useRouter();
const [error, setError] = useState<string | null>(null);
const [loading, setLoading] = useState(true);
const [jobs, setJobs] = useState<Job[]>([]);
const [selectedJob, setSelectedJob] = useState<Job | null>(null);
const [mediaFiles, setMediaFiles] = useState<MediaFiles | null>(null);
const [activeTab, setActiveTab] = useState<string>("images");
const [selectedMedia, setSelectedMedia] = useState<string | null>(null);
const currentId = searchParams.get("id");
const mediaType = searchParams.get("type") || "images";
const mediaName = searchParams.get("file");
const handleSelectJob = (job: Job | null) => {
if (job) {
router.push(`/media?id=${job.id}`);
}
};
const handleTabChange = (_event: React.SyntheticEvent, newValue: string) => {
setActiveTab(newValue);
router.push(`/media?id=${currentId}&type=${newValue}`);
};
const handleMediaSelect = (fileName: string) => {
setSelectedMedia(fileName);
router.push(`/media?id=${currentId}&type=${activeTab}&file=${fileName}`);
};
// Fetch jobs on mount
useEffect(() => {
fetchJobs(setJobs);
}, []);
// Set selected job when currentId changes
useEffect(() => {
if (!currentId) {
setSelectedJob(null);
return;
}
const job = jobs.find((j) => j.id === currentId);
setSelectedJob(job || null);
}, [currentId, jobs]);
// Fetch media files when selected job changes
useEffect(() => {
if (!selectedJob?.id) {
setError("No job ID provided");
setLoading(false);
return;
}
const fetchMediaFiles = async () => {
setLoading(true);
setError(null);
try {
const res = await fetch(`/api/media/get-media?id=${selectedJob.id}`);
if (!res.ok) {
throw new Error(`Media not found (status: ${res.status})`);
}
const data = await res.json();
setMediaFiles(data.files);
const hasMediaType = mediaType && data.files[mediaType]?.length > 0;
if (hasMediaType && activeTab !== mediaType) {
setActiveTab(mediaType);
} else if (!hasMediaType && !activeTab) {
// Only set a default tab if activeTab is not set
const firstNonEmpty = Object.entries(data.files).find(
([_, files]) => Array.isArray(files) && files.length > 0
);
if (firstNonEmpty) {
setActiveTab(firstNonEmpty[0]);
}
}
} catch (err) {
setError(
err instanceof Error ? err.message : "Failed to fetch media files"
);
} finally {
setLoading(false);
}
};
fetchMediaFiles();
}, [selectedJob?.id]);
// Set selected media when mediaName changes
useEffect(() => {
if (mediaName && mediaName !== selectedMedia) {
setSelectedMedia(mediaName);
}
}, [mediaName, selectedMedia]);
// Get media file URL
const getMediaUrl = (fileName: string) => {
if (!currentId || !activeTab) return "";
return `/api/media?id=${currentId}&type=${activeTab}&file=${fileName}`;
};
const renderMediaThumbnails = () => {
if (
!mediaFiles ||
!mediaFiles[activeTab] ||
mediaFiles[activeTab].length === 0
) {
return (
<Box
sx={{
display: "flex",
justifyContent: "center",
alignItems: "center",
height: "100%",
p: 3,
}}
>
<Typography variant="body2" color="textSecondary">
No {activeTab} files available
</Typography>
</Box>
);
}
return (
<TileGridView
mediaFiles={mediaFiles}
activeTab={activeTab}
selectedMedia={selectedMedia || ""}
handleMediaSelect={handleMediaSelect}
getMediaUrl={getMediaUrl}
/>
);
};
return (
<Box
sx={{
height: "100%",
width: "100%",
display: "flex",
flexDirection: "column",
position: "relative",
borderRadius: 2,
overflow: "hidden",
border: `1px solid ${theme.palette.divider}`,
backgroundColor: theme.palette.background.paper,
}}
>
<Box
sx={{
display: "flex",
justifyContent: "flex-end",
p: 1,
borderBottom: `1px solid ${theme.palette.divider}`,
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[50]
: theme.palette.grey[900],
zIndex: 10,
}}
>
<Box sx={{ width: "300px" }}>
<JobSelector
setSelectedJob={handleSelectJob}
selectedJob={selectedJob}
setJobs={setJobs}
jobs={jobs}
/>
</Box>
</Box>
{loading ? (
<Box
display="flex"
flexDirection="column"
alignItems="center"
justifyContent="center"
sx={{ flex: 1 }}
gap={2}
>
<CircularProgress />
<Typography variant="body2" color="textSecondary">
Loading media...
</Typography>
</Box>
) : error ? (
<Box
sx={{
flex: 1,
display: "flex",
justifyContent: "center",
alignItems: "center",
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[100]
: theme.palette.grey[900],
p: 2,
}}
>
<Paper
elevation={3}
sx={{
p: 3,
maxWidth: "500px",
width: "100%",
backgroundColor: theme.palette.background.paper,
borderRadius: 2,
}}
>
<Alert
severity="error"
variant="filled"
sx={{
mb: 2,
backgroundColor: theme.palette.error.main,
}}
>
{error}
</Alert>
<Typography variant="body2" color="textSecondary" sx={{ mt: 2 }}>
Please select a different job from the dropdown menu above or
check if media browsing is enabled.
</Typography>
</Paper>
</Box>
) : (
<>
<Box
sx={{
borderBottom: 1,
borderColor: "divider",
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[50]
: theme.palette.grey[900],
}}
>
<Tabs
value={activeTab}
onChange={handleTabChange}
variant="scrollable"
scrollButtons="auto"
aria-label="media type tabs"
>
{mediaFiles &&
Object.entries(mediaFiles).map(([type, files]) => (
<Tab
key={type}
value={type}
label={`${type.charAt(0).toUpperCase() + type.slice(1)} (${
files.length
})`}
disabled={!files.length}
/>
))}
</Tabs>
</Box>
<Box
sx={{
display: "flex",
flexDirection: "column",
flex: 1,
height: "calc(100% - 48px)",
overflow: "hidden",
}}
>
{selectedMedia && mediaType && mediaName ? (
<Box
sx={{
display: "flex",
flexDirection: "column",
height: "100%",
}}
>
<Box
sx={{
display: "flex",
justifyContent: "space-between",
alignItems: "center",
p: 1,
borderBottom: `1px solid ${theme.palette.divider}`,
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[50]
: theme.palette.grey[900],
}}
>
<Typography variant="subtitle1" noWrap>
{selectedMedia}
</Typography>
<Box>
<Typography
variant="body2"
sx={{
cursor: "pointer",
color: theme.palette.primary.main,
"&:hover": {
textDecoration: "underline",
},
}}
onClick={async () => {
setSelectedMedia(null);
await router.push(
`/media?id=${currentId}&type=${mediaType}`
);
}}
>
Back to Gallery
</Typography>
</Box>
</Box>
<Box
sx={{
flex: 1,
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[100]
: theme.palette.grey[900],
overflow: "hidden",
display: "flex",
justifyContent: "center",
alignItems: "center",
p: 2,
}}
>
<MediaViewer
selectedMedia={selectedMedia}
activeTab={activeTab}
getMediaUrl={getMediaUrl}
onError={() => setError("Error loading media")}
/>
</Box>
</Box>
) : (
<Box
sx={{
flex: 1,
overflow: "auto",
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[100]
: theme.palette.grey[900],
}}
>
{renderMediaThumbnails()}
</Box>
)}
</Box>
</>
)}
</Box>
);
};

View File

@@ -0,0 +1 @@
export { MediaId } from "./id";

View File

@@ -0,0 +1,204 @@
import { JobSelector } from "@/components/ai";
import { fetchJobs } from "@/lib";
import { useUserSettings } from "@/store/hooks";
import { Job } from "@/types";
import {
Box,
useTheme,
Typography,
CircularProgress,
Alert,
Paper,
} from "@mui/material";
import { useRouter, useSearchParams } from "next/navigation";
import { useState, useEffect } from "react";
export const RecordingId = () => {
const searchParams = useSearchParams();
const theme = useTheme();
const { userSettings } = useUserSettings();
const router = useRouter();
const [error, setError] = useState<string | null>(null);
const [videoUrl, setVideoUrl] = useState<string | null>(null);
const [loading, setLoading] = useState(true);
const [jobs, setJobs] = useState<Job[]>([]);
const [selectedJob, setSelectedJob] = useState<Job | null>(null);
const currentId = searchParams.get("id");
const handleSelectJob = (job: Job | null) => {
if (job) {
router.push(`/recordings?id=${job.id}`);
}
};
useEffect(() => {
fetchJobs(setJobs);
}, []);
useEffect(() => {
if (!userSettings.recordingsEnabled) {
setError("Recordings are disabled");
setLoading(false);
return;
}
if (!currentId) {
setError("No recording ID provided");
setLoading(false);
return;
}
setLoading(true);
setError(null);
const url = `/api/recordings/${currentId}`;
fetch(url, { method: "HEAD" })
.then((res) => {
if (!res.ok) {
throw new Error(`Video not found (status: ${res.status})`);
}
setVideoUrl(url);
})
.catch(() => {
setError("404 recording not found");
})
.finally(() => {
setLoading(false);
});
}, [currentId, userSettings.recordingsEnabled]);
useEffect(() => {
if (!currentId) {
setSelectedJob(null);
return;
}
const job = jobs.find((j) => j.id === currentId);
setSelectedJob(job || null);
}, [currentId, jobs]);
return (
<Box
sx={{
height: "100%",
width: "100%",
display: "flex",
flexDirection: "column",
position: "relative",
borderRadius: 2,
overflow: "hidden",
border: `1px solid ${theme.palette.divider}`,
backgroundColor: theme.palette.background.paper,
}}
>
<Box
sx={{
display: "flex",
justifyContent: "flex-end",
p: 1,
borderBottom: `1px solid ${theme.palette.divider}`,
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[50]
: theme.palette.grey[900],
zIndex: 10,
}}
>
<Box sx={{ width: "300px" }}>
<JobSelector
setSelectedJob={handleSelectJob}
selectedJob={selectedJob}
setJobs={setJobs}
jobs={jobs}
sxProps={{}}
/>
</Box>
</Box>
<Box
sx={{
flex: 1,
display: "flex",
justifyContent: "center",
alignItems: "center",
position: "relative",
backgroundColor:
theme.palette.mode === "light"
? theme.palette.grey[100]
: theme.palette.grey[900],
p: 2,
overflow: "hidden",
}}
>
{loading ? (
<Box
display="flex"
flexDirection="column"
alignItems="center"
gap={2}
>
<CircularProgress />
<Typography variant="body2" color="textSecondary">
Loading recording...
</Typography>
</Box>
) : error ? (
<Paper
elevation={3}
sx={{
p: 3,
maxWidth: "500px",
width: "100%",
backgroundColor: theme.palette.background.paper,
borderRadius: 2,
}}
>
<Alert
severity="error"
variant="filled"
sx={{
mb: 2,
backgroundColor: theme.palette.error.main,
}}
>
{error}
</Alert>
<Typography variant="body2" color="textSecondary" sx={{ mt: 2 }}>
Please select a different recording from the dropdown menu above
or check if recordings are enabled.
</Typography>
</Paper>
) : (
<Box
sx={{
width: "100%",
height: "100%",
display: "flex",
justifyContent: "center",
alignItems: "center",
overflow: "hidden",
borderRadius: 1,
}}
>
<video
className="h-full w-full object-contain"
controls
onError={() => setError("Error loading video")}
style={{
maxHeight: "100%",
maxWidth: "100%",
borderRadius: "4px",
boxShadow: theme.shadows[4],
}}
>
<source src={videoUrl ?? undefined} type="video/mp4" />
Your browser does not support the video tag.
</video>
</Box>
)}
</Box>
</Box>
);
};

View File

@@ -0,0 +1 @@
export { RecordingId } from "./id";

View File

@@ -41,8 +41,6 @@ export const JobSubmitter = () => {
const [jobOptions, setJobOptions] =
useState<RawJobOptions>(initialJobOptions);

- console.log(jobOptions);

const handleSubmit = async () => {
if (!validateURL(submittedURL)) {
setIsValidUrl(false);

View File

@@ -5,7 +5,7 @@ import { RawJobOptions, SiteMap } from "@/types";
export const parseJobOptions = (
job_options: string,
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>,
- setSiteMap: Dispatch<SetStateAction<SiteMap | null>>
+ setSiteMap?: Dispatch<SetStateAction<SiteMap | null>>
) => {
if (job_options) {
const jsonOptions = JSON.parse(job_options as string);
@@ -17,6 +17,10 @@ export const parseJobOptions = (
custom_cookies: null,
};
+ if (jsonOptions.collect_media) {
+ newJobOptions.collect_media = true;
+ }
if (
jsonOptions.custom_headers &&
Object.keys(jsonOptions.custom_headers).length
@@ -34,7 +38,7 @@ export const parseJobOptions = (
newJobOptions.proxies = jsonOptions.proxies.join(",");
}
- if (jsonOptions.site_map) {
+ if (jsonOptions.site_map && setSiteMap) {
setSiteMap(jsonOptions.site_map);
}

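Worth spelling out, since parseJobOptions round-trips through a URL query: the input is a JSON string, and a proxies array is flattened into a comma-separated string. A minimal sketch, assuming setJobOptions comes from a useState<RawJobOptions> hook; the payload values are invented for illustration:

// Hypothetical payload; only the field shapes match RawJobOptions.
const raw = JSON.stringify({
collect_media: true,
proxies: ["http://proxy-a:8080", "http://proxy-b:8080"],
});
parseJobOptions(raw, setJobOptions); // setSiteMap is now optional and omitted here
// setJobOptions receives collect_media: true and
// proxies: "http://proxy-a:8080,http://proxy-b:8080"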
View File

@@ -0,0 +1 @@
export * from "./use-advanced-job-options";

View File

@@ -0,0 +1,29 @@
import { useEffect, useState } from "react";
import { RawJobOptions } from "@/types";
import { parseJobOptions } from "@/lib/helpers/parse-job-options";
import { useRouter } from "next/router";
export const useAdvancedJobOptions = () => {
const initialJobOptions: RawJobOptions = {
multi_page_scrape: false,
custom_headers: null,
proxies: null,
collect_media: false,
custom_cookies: null,
};
const router = useRouter();
const { job_options } = router.query;
const [jobOptions, setJobOptions] =
useState<RawJobOptions>(initialJobOptions);
useEffect(() => {
if (job_options) {
parseJobOptions(job_options as string, setJobOptions);
}
}, [job_options]);
return { jobOptions, setJobOptions };
};

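This hook keeps URL-driven option parsing out of individual pages; a consumer just wires the returned state into the options dialog. A minimal sketch (the page component is hypothetical; the props mirror the Agent page above):

import { AdvancedJobOptions } from "@/components/common/advanced-job-options";
import { useAdvancedJobOptions } from "@/lib/hooks/use-advanced-job-options/use-advanced-job-options";

// Hypothetical page: job_options from the router query is parsed automatically.
export const ExamplePage = () => {
const { jobOptions, setJobOptions } = useAdvancedJobOptions();
return (
<AdvancedJobOptions jobOptions={jobOptions} setJobOptions={setJobOptions} />
);
};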
View File

@@ -80,3 +80,22 @@ export const updateJob = async (ids: string[], field: string, value: any) => {
console.error("Error fetching jobs:", error);
});
};
export const getUserSettings = async () => {
const token = Cookies.get("token");
try {
const response = await fetch("/api/check", {
headers: {
"content-type": "application/json",
Authorization: `Bearer ${token}`,
},
});
const data = await response.json();
return data;
} catch (error) {
console.error("Error fetching jobs:", error);
throw error;
}
};

View File

@@ -8,6 +8,9 @@ import { ThemeProvider, CssBaseline, Box } from "@mui/material";
import { NavDrawer } from "../components/common";
import { darkTheme, lightTheme } from "../styles/themes";
import { AuthProvider } from "../contexts/AuthContext";
+ import { Provider } from "react-redux";
+ import { PersistGate } from "redux-persist/integration/react";
+ import { store, persistor } from "@/store/store";

const App: React.FC<AppProps> = ({ Component, pageProps }) => {
const [isDarkMode, setIsDarkMode] = useState(false);
@@ -35,26 +38,30 @@ const App: React.FC<AppProps> = ({ Component, pageProps }) => {
<Head>
<title>Scraperr</title>
</Head>
- <AuthProvider>
- <ThemeProvider theme={isDarkMode ? darkTheme : lightTheme}>
- <CssBaseline />
- <Box sx={{ height: "100%", display: "flex" }}>
- <NavDrawer isDarkMode={isDarkMode} toggleTheme={toggleTheme} />
- <Box
- component="main"
- sx={{
- p: 3,
- bgcolor: "background.default",
- overflow: "hidden",
- height: "100%",
- width: "100%",
- }}
- >
- <Component {...pageProps} />
- </Box>
- </Box>
- </ThemeProvider>
- </AuthProvider>
+ <Provider store={store}>
+ <PersistGate loading={null} persistor={persistor}>
+ <AuthProvider>
+ <ThemeProvider theme={isDarkMode ? darkTheme : lightTheme}>
+ <CssBaseline />
+ <Box sx={{ height: "100%", display: "flex" }}>
+ <NavDrawer isDarkMode={isDarkMode} toggleTheme={toggleTheme} />
+ <Box
+ component="main"
+ sx={{
+ p: 3,
+ bgcolor: "background.default",
+ overflow: "hidden",
+ height: "100%",
+ width: "100%",
+ }}
+ >
+ <Component {...pageProps} />
+ </Box>
+ </Box>
+ </ThemeProvider>
+ </AuthProvider>
+ </PersistGate>
+ </Provider>
</>
);
};

src/pages/agent.tsx Normal file (1 line)
View File

@@ -0,0 +1 @@
export { Agent as default } from "@/components/pages/agent";

View File

@@ -0,0 +1,24 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
const { id } = req.query;
try {
const response = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/get-media?id=${id}`
);
if (!response.ok) {
throw new Error(`Error: ${response.statusText}`);
}
const data = await response.json();
res.status(200).json(data);
} catch (error) {
console.error("Error streaming video:", error);
res.status(404).json({ error: "Error streaming video" });
}
}

View File

@@ -0,0 +1,33 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
const { id, type, file } = req.query;
if (!id || !type || !file) {
return res.status(400).json({ error: "Missing required parameters" });
}
try {
const response = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/media?id=${id}&type=${type}&file=${file}`
);
if (!response.ok) {
throw new Error(`Error: ${response.statusText}`);
}
const contentType =
response.headers.get("content-type") || "application/octet-stream";
res.setHeader("Content-Type", contentType);
const arrayBuffer = await response.arrayBuffer();
res.status(200).send(Buffer.from(arrayBuffer));
} catch (error) {
console.error("Error streaming media:", error);
res.status(404).json({ error: "Error retrieving media file" });
}
}

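Because this route buffers the upstream body and forwards its Content-Type, the browser can consume any collected asset directly. A hedged usage sketch, to be run inside an async function; the id and file values are placeholders:

// Illustrative client-side fetch; query values are placeholders.
const res = await fetch("/api/media?id=<job-id>&type=images&file=logo.png");
const blob = await res.blob(); // typed via the forwarded Content-Type header
const objectUrl = URL.createObjectURL(blob); // usable as an <img> src

Buffering the whole file in memory is the simple choice here; it is fine for typical scraped media but would not suit very large downloads.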
View File

@@ -0,0 +1,39 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
const { id } = req.query;
try {
const response = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/recordings/${id}`
);
if (!response.ok) {
throw new Error(`Error: ${response.statusText}`);
}
res.setHeader("Content-Type", "video/mp4");
res.setHeader("Accept-Ranges", "bytes");
const reader = response.body?.getReader();
if (!reader) {
res.status(404).json({ error: "Recording not found" });
return;
}
while (true) {
const { done, value } = await reader.read();
if (done) break;
res.write(value);
}
res.end();
} catch (error) {
console.error("Error streaming video:", error);
res.status(404).json({ error: "Error streaming video" });
}
}

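One caveat: the handler advertises Accept-Ranges: bytes but never reads the incoming Range header, so seeking in the player restarts the stream from the beginning. A minimal sketch of forwarding the range upstream — a hypothetical refinement, not part of this PR, and it assumes the backend honors Range requests:

// Hypothetical refinement: forward the browser's Range header upstream.
const range = req.headers.range;
const upstream = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/recordings/${id}`,
range ? { headers: { Range: range } } : undefined
);
res.status(upstream.status); // 206 when the backend returns partial content
const contentRange = upstream.headers.get("content-range");
if (contentRange) res.setHeader("Content-Range", contentRange);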
View File

@@ -6,7 +6,8 @@ import { Button, TextField, Typography, Box } from "@mui/material";
import { useTheme } from "@mui/material/styles";
import { useRouter } from "next/router";
import { useAuth } from "../contexts/AuthContext";
- import { Constants } from "../lib";
+ import { Constants, getUserSettings } from "../lib";
+ import { useUserSettings } from "@/store/hooks";

type Mode = "login" | "signup";
@@ -19,6 +20,7 @@ const AuthForm: React.FC = () => {
const router = useRouter();
const { login } = useAuth();
const [registrationEnabled, setRegistrationEnabled] = useState<boolean>(true);
+ const { setUserSettings } = useUserSettings();

const checkRegistrationEnabled = async () => {
const response = await axios.get(`/api/check`);
@@ -28,12 +30,17 @@ const AuthForm: React.FC = () => {
useEffect(() => {
checkRegistrationEnabled();
}, []);

const handleSubmit = async (event: React.FormEvent) => {
event.preventDefault();
try {
if (mode === "login") {
await login(email, password);
alert("Login successful");
+ const userSettings = await getUserSettings();
+ setUserSettings(userSettings);
router.push("/");
} else {
await axios.post(`/api/signup`, {

View File

@@ -0,0 +1 @@
export { MediaId as default } from "@/components/pages/media/id";

View File

@@ -0,0 +1 @@
export { RecordingId as default } from "@/components/pages/recordings/id";

View File

@@ -7,7 +7,9 @@ export const submitJob = async (
jobOptions: any,
customHeaders: any,
customCookies: any,
- siteMap: SiteMap | null
+ siteMap: SiteMap | null,
+ agentMode: boolean = false,
+ prompt?: string
) => {
return await fetch(`/api/submit-scrape-job`, {
method: "POST",
@@ -26,6 +28,8 @@ export const submitJob = async (
site_map: siteMap,
custom_cookies: customCookies || [],
},
+ agent_mode: agentMode,
+ prompt: prompt || "",
},
}),
});

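Since the two new parameters are trailing and defaulted, existing call sites compile unchanged. A sketch of an agent-mode submission, mirroring the Agent page earlier in this diff; the URL and prompt are examples:

// Example agent-mode call; argument order matches the signature above.
await submitJob(
"https://www.example.com",
[], // no element selectors in agent mode
"",
{ collect_media: false, multi_page_scrape: false },
null, // custom headers
null, // custom cookies
null, // site map
true, // agent_mode
"Collect all the links on the page"
);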
src/store/hooks.ts Normal file (23 lines)
View File

@@ -0,0 +1,23 @@
import { TypedUseSelectorHook, useDispatch, useSelector } from "react-redux";
import type { RootState, AppDispatch } from "./store";
import { setAiEnabled, setRecordingsEnabled } from "./slices/settingsSlice";
export const useAppDispatch = () => useDispatch<AppDispatch>();
export const useAppSelector: TypedUseSelectorHook<RootState> = useSelector;
export const useUserSettings = () => {
const userSettings = useAppSelector((state) => state.settings);
const dispatch = useAppDispatch();
const setUserSettings = (userSettings: any) => {
dispatch(setAiEnabled(userSettings.ai_enabled));
dispatch(setRecordingsEnabled(userSettings.recordings_enabled));
return userSettings;
};
return { userSettings, setUserSettings };
};

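The hook pairs a typed selector with a dispatcher that maps the backend's snake_case payload onto the slice, so components never touch the raw response. A small illustrative consumer (the component name is hypothetical; the gating mirrors the recordings page):

import { ReactNode } from "react";
import { useUserSettings } from "@/store/hooks";

// Hypothetical wrapper: render children only when recordings are enabled.
export const RecordingsGate = ({ children }: { children: ReactNode }) => {
const { userSettings } = useUserSettings();
if (!userSettings.recordingsEnabled) return null;
return <>{children}</>;
};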
View File

@@ -0,0 +1,28 @@
import { createSlice, PayloadAction } from "@reduxjs/toolkit";
export interface SettingsState {
aiEnabled: boolean;
recordingsEnabled: boolean;
}
const initialState: SettingsState = {
aiEnabled: false,
recordingsEnabled: false,
};
const settingsSlice = createSlice({
name: "settings",
initialState,
reducers: {
setAiEnabled: (state, action: PayloadAction<boolean>) => {
state.aiEnabled = action.payload;
},
setRecordingsEnabled: (state, action: PayloadAction<boolean>) => {
state.recordingsEnabled = action.payload;
},
},
});
export const { setAiEnabled, setRecordingsEnabled } = settingsSlice.actions;
export default settingsSlice.reducer;

src/store/store.ts Normal file (32 lines)
View File

@@ -0,0 +1,32 @@
import { configureStore } from "@reduxjs/toolkit";
import { persistStore, persistReducer } from "redux-persist";
import storage from "redux-persist/lib/storage";
import { combineReducers } from "@reduxjs/toolkit";
import settingsReducer from "./slices/settingsSlice";
const persistConfig = {
key: "root",
storage,
whitelist: ["settings"], // only settings will be persisted
};
const rootReducer = combineReducers({
settings: settingsReducer,
});
const persistedReducer = persistReducer(persistConfig, rootReducer);
export const store = configureStore({
reducer: persistedReducer,
middleware: (getDefaultMiddleware) =>
getDefaultMiddleware({
serializableCheck: {
ignoredActions: ["persist/PERSIST", "persist/REHYDRATE"],
},
}),
});
export const persistor = persistStore(store);
export type RootState = ReturnType<typeof store.getState>;
export type AppDispatch = typeof store.dispatch;

View File

@@ -7,9 +7,11 @@ export interface Job {
result: Object;
time_created: Date;
status: string;
- job_options: Object;
+ job_options: RawJobOptions;
favorite: boolean;
chat?: Message[];
+ agent_mode?: boolean;
+ prompt?: string;
}

export type JobOptions = {

start.sh Executable file (14 lines)
View File

@@ -0,0 +1,14 @@
#!/bin/bash

# Default to recordings enabled unless explicitly turned off.
RECORDINGS_ENABLED=${RECORDINGS_ENABLED:-true}

if [ "$RECORDINGS_ENABLED" == "false" ]; then
# Headless: run the worker directly; no virtual display is needed.
pdm run python -m api.backend.worker.job_worker
else
# Start a virtual X display for the browser, then expose it over VNC (port 5900).
Xvfb :99 -screen 0 1280x1024x24 &
XVFB_PID=$!

# Give Xvfb a moment to come up before attaching to it.
sleep 2

x11vnc -display :99 -rfbport 5900 -forever -nopw &
VNC_PID=$!

# Run the worker against the virtual display so sessions can be watched live.
DISPLAY=:99 pdm run python -m api.backend.worker.job_worker
fi

View File

@@ -12,7 +12,7 @@ stdout_logfile_maxbytes=0
stderr_logfile_maxbytes=0

[program:worker]
- command=pdm run python -m api.backend.worker.job_worker
+ command=/project/app/start.sh
directory=/project/app
autostart=true
autorestart=true

yarn.lock Normal file (5797 lines)

File diff suppressed because it is too large