40 Commits

Author SHA1 Message Date
Jayden Pyles  d2bc9c53ff  fix: cypress test  2025-05-31 23:11:18 -05:00
Jayden Pyles  734974df83  fix: cypress test  2025-05-31 23:01:39 -05:00
Jayden Pyles  b6dbd0dc82  fix: cypress test  2025-05-31 22:17:57 -05:00
Jayden Pyles  d57dd0af1a  fix: cypress test  2025-05-31 21:55:47 -05:00
Jayden Pyles  23fccd7afb  fix: cypress test  2025-05-31 21:39:38 -05:00
Jayden Pyles  f89a460206  fix: cypress test  2025-05-31 21:13:52 -05:00
Jayden Pyles  1169d48992  fix: cypress test  2025-05-31 21:05:26 -05:00
Jayden Pyles  ff809d7833  fix: cypress test  2025-05-31 19:29:48 -05:00
Jayden Pyles  41c7f6795c  fix: build  2025-05-31 18:01:13 -05:00
Jayden Pyles  21c2155786  chore: refactor wip  2025-05-31 17:22:29 -05:00
Jayden Pyles  6d45bd129c  chore: refactor wip  2025-05-31 14:26:50 -05:00
Jayden Pyles  3ab31bd186  chore: refactor wip  2025-05-31 14:21:51 -05:00
Jayden Pyles  e00c187e68  chore: work in progress  2025-05-30 22:18:26 -05:00
Jayden Pyles  39c0d17e1e  chore: refactor wip  2025-05-28 20:42:06 -05:00
Jayden Pyles  b66963ed33  chore: work in progress  2025-05-25 19:00:16 -05:00
Jayden Pyles  813550e07c  chore: work in progress  2025-05-25 18:52:17 -05:00
Jayden Pyles  dcb4afe01b  chore: work in progress  2025-05-25 18:50:58 -05:00
Jayden Pyles  344d9036c3  chore: work in progress  2025-05-25 18:39:27 -05:00
Jayden Pyles  96552cd1d1  chore: work in progress  2025-05-25 18:17:20 -05:00
Jayden Pyles  d4ac85d206  chore: work in progress  2025-05-25 18:01:49 -05:00
Jayden Pyles  a805b98ce3  chore: work in progress  2025-05-25 17:54:01 -05:00
Jayden Pyles  eaf047ecd8  chore: work in progress  2025-05-25 17:36:42 -05:00
Jayden Pyles  d43040fe08  chore: work in progress  2025-05-25 17:16:42 -05:00
Jayden Pyles  f0813323f0  chore: refactor wip  2025-05-23 18:12:37 -05:00
Jayden Pyles  fb7986bccf  chore: refactor wip  2025-05-22 18:36:17 -05:00
Jayden Pyles  a1664856a6  chore: refactor wip  2025-05-21 22:13:40 -05:00
Jayden Pyles  467244b7f8  chore: refactor wip  2025-05-20 19:49:38 -05:00
Jayden Pyles  b91a133b4d  chore: refactor wip  2025-05-20 19:34:30 -05:00
Jayden Pyles  aeed81a6df  chore: refactor wip  2025-05-20 19:05:48 -05:00
Jayden Pyles  d4edb9d93e  chore: update chart version [skip ci]  2025-05-19 20:46:19 -05:00
Jayden Pyles  5ebd96b62b  feat: add agent mode (#81)  2025-05-19 20:44:41 -05:00
  * chore: wip agent mode
  * wip: add agent mode frontend
  * wip: add agent mode frontend
  * chore: cleanup code
  * chore: cleanup code
  * chore: cleanup code
Jayden Pyles  d602d3330a  fix: site map  2025-05-17 17:05:37 -05:00
Jayden Pyles  6639e8b48f  chore: update chart version [skip ci]  2025-05-17 16:33:18 -05:00
Jayden Pyles  263e46ba4d  feat: add media viewer + other fixes (#79)  2025-05-17 16:31:34 -05:00
  * feat: add media viewer + other fixes
  * chore: remove logging [skip ci]
  * chore: remove logging [skip ci]
  * feat: add unit test for media
  * feat: add unit test for media
  * feat: add unit test for media [skip ci]
  * feat: add unit test for media [skip ci]
  * feat: add unit test for media [skip ci]
  * feat: add unit test for media [skip ci]
  * chore: update docs [skip ci]
Jayden Pyles  f815a58efc  chore: update docker version [skip ci]  2025-05-16 22:04:46 -05:00
Jayden Pyles  50ec5df657  chore: update chart version [skip ci]  2025-05-16 21:39:04 -05:00
Jayden Pyles  28de0f362c  feat: add recording viewer and vnc (#78)  2025-05-16 21:37:09 -05:00
  * feat: add recording viewer and vnc
  * feat: add recording viewer and vnc
  * feat: add recording viewer and vnc
  * feat: add recording viewer and vnc
  * chore: update gitignore [skip ci]
  * chore: update dev compose [skip ci]
  * fix: only run manually
Jayden Pyles  6b33723cac  feat: update version  2025-05-16 14:15:53 -05:00
Jayden Pyles  5c89e4d7d2  feat: allow custom cookies (#77)  2025-05-16 14:13:58 -05:00
  * feat: working new advanced job options
  * feat: working new advanced job options
  * feat: add tests for adding custom cookies/headers
Jayden Pyles  ed0828a585  fix: deployment  2025-05-13 21:03:21 -05:00
179 changed files with 10954 additions and 12836 deletions

.dockerignore (new file, +4)

@@ -0,0 +1,4 @@
node_modules
npm-debug.log
Dockerfile
.dockerignore


@@ -2,6 +2,13 @@ name: Run Cypress Tests
description: Run Cypress tests
inputs:
openai_key:
description: "OpenAI API key"
required: true
default: ""
runs:
using: "composite"
steps:
@@ -15,11 +22,13 @@ runs:
- name: Setup Docker project
shell: bash
run: make build up-dev
env:
OPENAI_KEY: ${{ inputs.openai_key }}
run: make build-ci up-ci
- name: Install dependencies
shell: bash
run: npm install
run: yarn install
- name: Wait for frontend to be ready
shell: bash


@@ -1,14 +1,9 @@
name: Docker Image
on:
workflow_run:
workflows: ["Unit Tests"]
types:
- completed
workflow_dispatch:
jobs:
build:
if: ${{ github.event.workflow_run.conclusion == 'success' && github.ref == 'refs/heads/master' }}
runs-on: ubuntu-latest
steps:
- name: Checkout
@@ -36,8 +31,8 @@ jobs:
file: ./docker/frontend/Dockerfile
push: true
tags: |
${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:latest
${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:${{ env.VERSION }}
${{ secrets.DOCKERHUB_USERNAME }}/scraperr:latest
${{ secrets.DOCKERHUB_USERNAME }}/scraperr:${{ env.VERSION }}
- name: Build and push api
uses: docker/build-push-action@v5
@@ -46,8 +41,8 @@ jobs:
file: ./docker/api/Dockerfile
push: true
tags: |
${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:latest
${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:${{ env.VERSION }}
${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest
${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:${{ env.VERSION }}
push-helm-chart:
runs-on: ubuntu-latest


@@ -30,13 +30,15 @@ jobs:
run: pdm run playwright install
- name: Run tests
run: PYTHONPATH=. pdm run pytest api/backend/tests
run: PYTHONPATH=. pdm run pytest -v -ra api/backend/tests
cypress-tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/run-cypress-tests
with:
openai_key: ${{ secrets.OPENAI_KEY }}
success-message:
runs-on: ubuntu-latest

.gitignore (+16)

@@ -188,4 +188,18 @@ postgres_data
.vscode
ollama
data
media
media/images
media/videos
media/audio
media/pdfs
media/spreadsheets
media/presentations
media/documents
media/recordings
media/download_summary.txt
cypress/screenshots
cypress/videos
docker-compose.dev.local.yml


@@ -1,6 +1,6 @@
.DEFAULT_GOAL := help
COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.yml
COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.local.yml
COMPOSE_PROD = docker compose -f docker-compose.yml
.PHONY: help deps build pull up up-dev down setup deploy
@@ -17,6 +17,7 @@ help:
@echo " make down - Stop and remove containers, networks, images, and volumes"
@echo " make setup - Setup server with dependencies and clone repo"
@echo " make deploy - Deploy site onto server"
@echo " make cypress-start - Start Cypress"
@echo ""
logs:
@@ -51,3 +52,15 @@ setup:
deploy:
ansible-playbook -i ./ansible/inventory.yaml ./ansible/deploy_site.yaml -v
build-ci:
docker compose -f docker-compose.yml -f docker-compose.dev.yml build
up-ci:
docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d --force-recreate
cypress-start:
DISPLAY=:0 npx cypress open
cypress-run:
npx cypress run


@@ -13,7 +13,7 @@
## 📋 Overview
Scraperr enables you to extract data from websites with precision using XPath selectors. This self-hosted application provides a clean interface to manage scraping jobs, view results, and export data.
Scrape websites without writing a single line of code.
> 📚 **[Check out the docs](https://scraperr-docs.pages.dev)** for a comprehensive quickstart guide and detailed information.
@@ -29,7 +29,7 @@ Scraperr enables you to extract data from websites with precision using XPath se
- **Custom Headers**: Add JSON headers to your scraping requests
- **Media Downloads**: Automatically download images, videos, and other media
- **Results Visualization**: View scraped data in a structured table format
- **Data Export**: Export your results in various formats
- **Data Export**: Export your results in markdown and csv formats
- **Notification Channels**: Send completion notifications through various channels
## 🚀 Getting Started


@@ -0,0 +1,6 @@
from typing_extensions import TypedDict
class Action(TypedDict):
type: str
url: str


@@ -0,0 +1,91 @@
# STL
import random
from typing import Any
# PDM
from camoufox import AsyncCamoufox
from playwright.async_api import Page
# LOCAL
from api.backend.ai.clients import ask_ollama, ask_open_ai, open_ai_key
from api.backend.job.models import CapturedElement
from api.backend.worker.logger import LOG
from api.backend.ai.agent.utils import (
parse_response,
capture_elements,
convert_to_markdown,
)
from api.backend.ai.agent.prompts import (
EXTRACT_ELEMENTS_PROMPT,
ELEMENT_EXTRACTION_PROMPT,
)
from api.backend.job.scraping.add_custom import add_custom_items
from api.backend.job.scraping.collect_media import collect_media
ask_ai = ask_open_ai if open_ai_key else ask_ollama
async def scrape_with_agent(agent_job: dict[str, Any]):
LOG.info(f"Starting work for agent job: {agent_job}")
pages = set()
if agent_job["job_options"]["proxies"]:
proxy = random.choice(agent_job["job_options"]["proxies"])
LOG.info(f"Using proxy: {proxy}")
async with AsyncCamoufox(headless=True) as browser:
page: Page = await browser.new_page()
await add_custom_items(
agent_job["url"],
page,
agent_job["job_options"]["custom_cookies"],
agent_job["job_options"]["custom_headers"],
)
try:
await page.set_viewport_size({"width": 1920, "height": 1080})
await page.goto(agent_job["url"], timeout=60000)
if agent_job["job_options"]["collect_media"]:
await collect_media(agent_job["id"], page)
html_content = await page.content()
markdown_content = convert_to_markdown(html_content)
response = await ask_ai(
ELEMENT_EXTRACTION_PROMPT.format(
extraction_prompt=EXTRACT_ELEMENTS_PROMPT,
webpage=markdown_content,
prompt=agent_job["prompt"],
)
)
xpaths = parse_response(response)
captured_elements = await capture_elements(page, xpaths)
final_url = page.url
pages.add((html_content, final_url))
finally:
await page.close()
await browser.close()
name_to_elements = {}
for page in pages:
for element in captured_elements:
if element.name not in name_to_elements:
name_to_elements[element.name] = []
name_to_elements[element.name].append(element)
scraped_elements: list[dict[str, dict[str, list[CapturedElement]]]] = [
{
page[1]: name_to_elements,
}
for page in pages
]
return scraped_elements
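
As a quick orientation for the new agent module, here is a minimal, hypothetical invocation sketch; the job id, URL, and prompt are placeholders, and the `job_options` keys mirror exactly what `scrape_with_agent` reads above.

```python
# Hypothetical agent job; keys mirror what scrape_with_agent reads above.
import asyncio

agent_job = {
    "id": "job-123",                      # placeholder id
    "url": "https://example.com",         # placeholder target
    "prompt": "Extract all article titles",
    "job_options": {
        "proxies": [],                    # empty -> proxy branch is skipped
        "custom_cookies": [],
        "custom_headers": {},
        "collect_media": False,
    },
}

# asyncio.run(scrape_with_agent(agent_job))  # needs the api package importable
```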


@@ -0,0 +1,58 @@
EXTRACT_ELEMENTS_PROMPT = """
You are an assistant that extracts XPath expressions from webpages.
You will receive HTML content in markdown format.
Each element in the markdown has its XPath shown above it in a comment like:
<!-- //div -->
Respond only with a list of general XPath expressions inside `<xpaths>...</xpaths>` tags.
You will also decide what to do next. If there is no decision to make, return nothing for that section.
"""
ELEMENT_EXTRACTION_PROMPT = """
{extraction_prompt}
**Guidelines:**
- Prefer shorter, more general XPaths like `//div[...]` or `//span[...]`.
- Avoid overly specific or deep paths like `//div[3]/ul/li[2]/a`.
- Do **not** chain multiple elements deeply (e.g., `//div/span/a`).
- Use XPaths further down the tree when possible.
- Do not include any extra explanation or text.
- One XPath is acceptable if that's all that's needed.
- Try to limit it to 1-3 xpaths.
- Include a name for each xpath.
<important>
- USE THE MOST SIMPLE XPATHS POSSIBLE.
- USE THE MOST GENERAL XPATHS POSSIBLE.
</important>
**Example Format:**
```xml
<xpaths>
- <name: insert_name_here>: <xpath: //div>
- <name: insert_name_here>: <xpath: //span>
- <name: insert_name_here>: <xpath: //span[contains(@text, 'example')]>
- <name: insert_name_here>: <xpath: //div[contains(@text, 'example')]>
- <name: insert_name_here>: <xpath: //a[@href]>
- etc
</xpaths>
<decision>
<next_page>
- //a[@href='next_page_url']
</next_page>
</decision>
```
**Input webpage:**
{webpage}
**Target content:**
{prompt}
"""


@@ -0,0 +1,255 @@
# STL
import re
# PDM
from lxml import html, etree
from playwright.async_api import Page
# LOCAL
from api.backend.job.models import CapturedElement
from api.backend.job.utils.text_utils import clean_text
def convert_to_markdown(html_str: str):
parser = html.HTMLParser()
tree = html.fromstring(html_str, parser=parser)
root = tree.getroottree()
def format_attributes(el: etree._Element) -> str:
"""Convert element attributes into a string."""
return " ".join(f'{k}="{v}"' for k, v in el.attrib.items())
def is_visible(el: etree._Element) -> bool:
style = el.attrib.get("style", "").lower()
class_ = el.attrib.get("class", "").lower()
# Check for visibility styles
if "display: none" in style or "visibility: hidden" in style:
return False
if "opacity: 0" in style or "opacity:0" in style:
return False
if "height: 0" in style or "width: 0" in style:
return False
# Check for common hidden classes
if any(
hidden in class_
for hidden in ["hidden", "invisible", "truncate", "collapse"]
):
return False
# Check for hidden attributes
if el.attrib.get("hidden") is not None:
return False
if el.attrib.get("aria-hidden") == "true":
return False
# Check for empty or whitespace-only content
if not el.text and len(el) == 0:
return False
return True
def is_layout_or_decorative(el: etree._Element) -> bool:
tag = el.tag.lower()
# Layout elements
if tag in {"nav", "footer", "header", "aside", "main", "section"}:
return True
# Decorative elements
if tag in {"svg", "path", "circle", "rect", "line", "polygon", "polyline"}:
return True
# Check id and class for layout/decorative keywords
id_class = " ".join(
[el.attrib.get("id", ""), el.attrib.get("class", "")]
).lower()
layout_keywords = {
"sidebar",
"nav",
"header",
"footer",
"menu",
"advert",
"ads",
"breadcrumb",
"container",
"wrapper",
"layout",
"grid",
"flex",
"row",
"column",
"section",
"banner",
"hero",
"card",
"modal",
"popup",
"tooltip",
"dropdown",
"overlay",
}
return any(keyword in id_class for keyword in layout_keywords)
# Tags to ignore in the final markdown output
included_tags = {
"div",
"span",
"a",
"p",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"img",
"button",
"input",
"textarea",
"ul",
"ol",
"li",
"table",
"tr",
"td",
"th",
"input",
"textarea",
"select",
"option",
"optgroup",
"fieldset",
"legend",
}
special_elements = []
normal_elements = []
for el in tree.iter():
if el.tag is etree.Comment:
continue
tag = el.tag.lower()
if tag not in included_tags:
continue
if not is_visible(el):
continue
if is_layout_or_decorative(el):
continue
path = root.getpath(el)
attrs = format_attributes(el)
attrs_str = f" {attrs}" if attrs else ""
text = el.text.strip() if el.text else ""
if not text and not attrs:
continue
# input elements
if tag == "button":
prefix = "🔘 **<button>**"
special_elements.append(f"<!-- {path} -->\n{prefix} {text}")
elif tag == "a":
href = el.attrib.get("href", "")
prefix = f"🔗 **<a href='{href}'>**"
special_elements.append(f"<!-- {path} -->\n{prefix} {text}")
elif tag == "input":
input_type = el.attrib.get("type", "text")
prefix = f"📝 **<input type='{input_type}'>**"
special_elements.append(f"<!-- {path} -->\n{prefix}")
else:
prefix = f"**<{tag}{attrs_str}>**"
if text:
normal_elements.append(f"<!-- {path} -->\n{prefix} {text}")
return "\n\n".join(normal_elements + special_elements) # type: ignore
def parse_response(text: str) -> list[dict[str, str]]:
xpaths = re.findall(r"<xpaths>(.*?)</xpaths>", text, re.DOTALL)
results = []
if xpaths:
lines = xpaths[0].strip().splitlines()
for line in lines:
if line.strip().startswith("-"):
name = re.findall(r"<name: (.*?)>", line)[0]
xpath = re.findall(r"<xpath: (.*?)>", line)[0]
results.append({"name": name, "xpath": xpath})
else:
results.append({"name": "", "xpath": line.strip()})
return results
def parse_next_page(text: str) -> str | None:
next_page = re.findall(r"<next_page>(.*?)</next_page>", text, re.DOTALL)
if next_page:
lines = next_page[0].strip().splitlines()
next_page = [
line.strip().lstrip("-").strip()
for line in lines
if line.strip().startswith("-")
]
return next_page[0] if next_page else None
async def capture_elements(
page: Page, xpaths: list[dict[str, str]]
) -> list[CapturedElement]:
captured_elements = []
seen_texts = set()
for xpath in xpaths:
try:
locator = page.locator(f"xpath={xpath['xpath']}")
count = await locator.count()
for i in range(count):
element_text = ""
element_handle = await locator.nth(i).element_handle()
if not element_handle:
continue
link = await element_handle.get_attribute("href") or ""
text = await element_handle.text_content()
if text:
element_text += text
if link:
element_text += f" ({link})"
cleaned = clean_text(element_text)
if cleaned in seen_texts:
continue
seen_texts.add(cleaned)
captured_elements.append(
CapturedElement(
name=xpath["name"],
text=cleaned,
xpath=xpath["xpath"],
)
)
except Exception as e:
print(f"Error processing xpath {xpath}: {e}")
return captured_elements
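
To make the parsing contract concrete, a small sketch feeding a response in the `<xpaths>`/`<decision>` shape requested by the prompts into `parse_response` and `parse_next_page`; the element names and paths are invented.

```python
# Invented model response in the format ELEMENT_EXTRACTION_PROMPT asks for.
sample = """
<xpaths>
- <name: title>: <xpath: //h1>
- <name: links>: <xpath: //a[@href]>
</xpaths>
<decision>
<next_page>
- //a[@href='/page/2']
</next_page>
</decision>
"""

print(parse_response(sample))
# [{'name': 'title', 'xpath': '//h1'}, {'name': 'links', 'xpath': '//a[@href]'}]
print(parse_next_page(sample))
# //a[@href='/page/2']
```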


@@ -1,32 +1,28 @@
# STL
import os
import logging
from collections.abc import Iterable, AsyncGenerator
# PDM
from openai import OpenAI
from ollama import Message
from fastapi import APIRouter
from fastapi.responses import JSONResponse, StreamingResponse
from openai.types.chat import ChatCompletionMessageParam
# LOCAL
from ollama import Message, AsyncClient
from api.backend.models import AI
from api.backend.ai.clients import (
llama_model,
open_ai_key,
llama_client,
open_ai_model,
openai_client,
)
from api.backend.ai.schemas import AI
from api.backend.routers.handle_exceptions import handle_exceptions
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("AI")
ai_router = APIRouter()
# Load environment variables
open_ai_key = os.getenv("OPENAI_KEY")
open_ai_model = os.getenv("OPENAI_MODEL")
llama_url = os.getenv("OLLAMA_URL")
llama_model = os.getenv("OLLAMA_MODEL")
# Initialize clients
openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None
llama_client = AsyncClient(host=llama_url) if llama_url else None
async def llama_chat(chat_messages: list[Message]) -> AsyncGenerator[str, None]:
if llama_client and llama_model:
@@ -67,6 +63,7 @@ chat_function = llama_chat if llama_client else openai_chat
@ai_router.post("/ai")
@handle_exceptions(logger=LOG)
async def ai(c: AI):
return StreamingResponse(
chat_function(chat_messages=c.messages), media_type="text/plain"
@@ -74,5 +71,6 @@ async def ai(c: AI):
@ai_router.get("/ai/check")
@handle_exceptions(logger=LOG)
async def check():
return JSONResponse(content={"ai_enabled": bool(open_ai_key or llama_model)})

api/backend/ai/clients.py (new file, +39)

@@ -0,0 +1,39 @@
# STL
import os
# PDM
from ollama import AsyncClient
from openai import OpenAI
# Load environment variables
open_ai_key = os.getenv("OPENAI_KEY")
open_ai_model = os.getenv("OPENAI_MODEL")
llama_url = os.getenv("OLLAMA_URL")
llama_model = os.getenv("OLLAMA_MODEL")
# Initialize clients
openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None
llama_client = AsyncClient(host=llama_url) if llama_url else None
async def ask_open_ai(prompt: str) -> str:
if not openai_client:
raise ValueError("OpenAI client not initialized")
response = openai_client.chat.completions.create(
model=open_ai_model or "gpt-4.1-mini",
messages=[{"role": "user", "content": prompt}],
)
return response.choices[0].message.content or ""
async def ask_ollama(prompt: str) -> str:
if not llama_client:
raise ValueError("Ollama client not initialized")
response = await llama_client.chat(
model=llama_model or "", messages=[{"role": "user", "content": prompt}]
)
return response.message.content or ""
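
A hedged usage sketch for these clients: with `OPENAI_KEY` unset and `OLLAMA_URL`/`OLLAMA_MODEL` exported before import, the same `ask_ai` selection the agent module performs falls back to Ollama.

```python
import asyncio

# Same selection the agent module performs with these imports.
ask_ai = ask_open_ai if open_ai_key else ask_ollama

async def main() -> None:
    print(await ask_ai("Say hello in one word."))

# asyncio.run(main())  # requires a reachable Ollama server or an OpenAI key
```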


@@ -0,0 +1,4 @@
# LOCAL
from .ai import AI
__all__ = ["AI"]


@@ -0,0 +1,9 @@
# STL
from typing import Any
# PDM
import pydantic
class AI(pydantic.BaseModel):
messages: list[Any]


@@ -1,39 +1,60 @@
# STL
import os
import logging
import apscheduler # type: ignore
from contextlib import asynccontextmanager
# PDM
import apscheduler.schedulers
import apscheduler.schedulers.background
from fastapi import FastAPI, Request, status
from fastapi.responses import JSONResponse
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
# LOCAL
from api.backend.ai.ai_router import ai_router
from api.backend.auth.auth_router import auth_router
from api.backend.utils import get_log_level
from api.backend.routers.job_router import job_router
from api.backend.routers.stats_router import stats_router
from api.backend.database.startup import init_database
from fastapi.responses import JSONResponse
from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler
from api.backend.scheduler import scheduler
from api.backend.ai.ai_router import ai_router
from api.backend.job.job_router import job_router
from api.backend.auth.auth_router import auth_router
from api.backend.database.startup import init_database
from api.backend.stats.stats_router import stats_router
from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler
log_level = os.getenv("LOG_LEVEL")
LOG_LEVEL = get_log_level(log_level)
logging.basicConfig(
level=LOG_LEVEL,
format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
format="%(levelname)s: %(asctime)s - [%(name)s] - %(message)s",
handlers=[logging.StreamHandler()],
)
LOG = logging.getLogger(__name__)
app = FastAPI(title="api", root_path="/api")
@asynccontextmanager
async def lifespan(_: FastAPI):
# Startup
LOG.info("Starting application...")
init_database()
LOG.info("Starting cron scheduler...")
start_cron_scheduler(scheduler)
scheduler.start()
LOG.info("Cron scheduler started successfully")
yield
# Shutdown
LOG.info("Shutting down application...")
LOG.info("Stopping cron scheduler...")
scheduler.shutdown(wait=False) # Set wait=False to not block shutdown
LOG.info("Cron scheduler stopped")
LOG.info("Application shutdown complete")
app = FastAPI(title="api", root_path="/api", lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
@@ -43,28 +64,12 @@ app.add_middleware(
allow_headers=["*"],
)
app.include_router(auth_router)
app.include_router(ai_router)
app.include_router(job_router)
app.include_router(stats_router)
@app.on_event("startup")
async def startup_event():
start_cron_scheduler(scheduler)
scheduler.start()
if os.getenv("ENV") != "test":
init_database()
LOG.info("Starting up...")
@app.on_event("shutdown")
def shutdown_scheduler():
scheduler.shutdown(wait=False) # Set wait=False to not block shutdown
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
exc_str = f"{exc}".replace("\n", " ").replace(" ", " ")


@@ -1,13 +1,14 @@
# STL
from datetime import timedelta
import os
import logging
from datetime import timedelta
# PDM
from fastapi import Depends, APIRouter, HTTPException, status
from fastapi.security import OAuth2PasswordRequestForm
# LOCAL
from api.backend.schemas import User, Token, UserCreate
from api.backend.auth.schemas import User, Token, UserCreate
from api.backend.auth.auth_utils import (
ACCESS_TOKEN_EXPIRE_MINUTES,
get_current_user,
@@ -15,18 +16,19 @@ from api.backend.auth.auth_utils import (
get_password_hash,
create_access_token,
)
import logging
from api.backend.database.common import update
from api.backend.routers.handle_exceptions import handle_exceptions
auth_router = APIRouter()
LOG = logging.getLogger("auth_router")
LOG = logging.getLogger("Auth")
@auth_router.post("/auth/token", response_model=Token)
@handle_exceptions(logger=LOG)
async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
user = await authenticate_user(form_data.username, form_data.password)
if not user:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
@@ -47,6 +49,7 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(
@auth_router.post("/auth/signup", response_model=User)
@handle_exceptions(logger=LOG)
async def create_user(user: UserCreate):
hashed_password = get_password_hash(user.password)
user_dict = user.model_dump()
@@ -60,10 +63,16 @@ async def create_user(user: UserCreate):
@auth_router.get("/auth/users/me", response_model=User)
@handle_exceptions(logger=LOG)
async def read_users_me(current_user: User = Depends(get_current_user)):
return current_user
@auth_router.get("/auth/check")
@handle_exceptions(logger=LOG)
async def check_auth():
return {"registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True"}
return {
"registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True",
"recordings_enabled": os.environ.get("RECORDINGS_ENABLED", "true").lower()
== "true",
}


@@ -1,8 +1,8 @@
# STL
import os
import logging
from typing import Any, Optional
from datetime import datetime, timedelta
import logging
# PDM
from jose import JWTError, jwt
@@ -12,11 +12,10 @@ from passlib.context import CryptContext
from fastapi.security import OAuth2PasswordBearer
# LOCAL
from api.backend.schemas import User, UserInDB, TokenData
from api.backend.auth.schemas import User, UserInDB, TokenData
from api.backend.database.common import query
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Auth")
_ = load_dotenv()
@@ -118,7 +117,8 @@ async def get_current_user(token: str = Depends(oauth2_scheme)):
LOG.error(f"Exception occurred: {e}")
return EMPTY_USER
user = await get_user(email=token_data.email)
user = await get_user(email=token_data.email or "")
if user is None:
return EMPTY_USER
@@ -136,6 +136,7 @@ async def require_user(token: str = Depends(oauth2_scheme)):
payload: Optional[dict[str, Any]] = jwt.decode(
token, SECRET_KEY, algorithms=[ALGORITHM]
)
if not payload:
raise credentials_exception
@@ -149,7 +150,7 @@ async def require_user(token: str = Depends(oauth2_scheme)):
except JWTError:
raise credentials_exception
user = await get_user(email=token_data.email)
user = await get_user(email=token_data.email or "")
if user is None:
raise credentials_exception


@@ -0,0 +1,4 @@
# LOCAL
from .auth import User, Token, UserInDB, TokenData, UserCreate
__all__ = ["User", "Token", "UserInDB", "TokenData", "UserCreate"]


@@ -1 +1,24 @@
# STL
import os
from pathlib import Path
DATABASE_PATH = "data/database.db"
RECORDINGS_DIR = Path("media/recordings")
RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
MEDIA_DIR = Path("media")
MEDIA_TYPES = [
"audio",
"documents",
"images",
"pdfs",
"presentations",
"spreadsheets",
"videos",
]
REGISTRATION_ENABLED = os.getenv("REGISTRATION_ENABLED", "true").lower() == "true"
DEFAULT_USER_EMAIL = os.getenv("DEFAULT_USER_EMAIL")
DEFAULT_USER_PASSWORD = os.getenv("DEFAULT_USER_PASSWORD")
DEFAULT_USER_FULL_NAME = os.getenv("DEFAULT_USER_FULL_NAME")
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")


@@ -1,3 +1,5 @@
from .common import insert, QUERIES, update
# LOCAL
from .common import insert, update, connect
from .schema import INIT_QUERY
__all__ = ["insert", "QUERIES", "update"]
__all__ = ["insert", "update", "INIT_QUERY", "connect"]


@@ -1,12 +1,13 @@
# STL
import logging
import sqlite3
from typing import Any, Optional
from api.backend.constants import DATABASE_PATH
from api.backend.utils import format_json, format_sql_row_to_python
from api.backend.database.schema import INIT_QUERY
from api.backend.database.queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
import logging
LOG = logging.getLogger(__name__)
# LOCAL
from api.backend.constants import DATABASE_PATH
from api.backend.database.utils import format_json, format_sql_row_to_python
LOG = logging.getLogger("Database")
def connect():
@@ -25,8 +26,10 @@ def insert(query: str, values: tuple[Any, ...]):
try:
_ = cursor.execute(query, copy)
connection.commit()
except sqlite3.Error as e:
LOG.error(f"An error occurred: {e}")
finally:
cursor.close()
connection.close()
@@ -78,15 +81,9 @@ def update(query: str, values: Optional[tuple[Any, ...]] = None):
return res.rowcount
except sqlite3.Error as e:
LOG.error(f"An error occurred: {e}")
finally:
cursor.close()
connection.close()
return 0
QUERIES = {
"init": INIT_QUERY,
"insert_job": JOB_INSERT_QUERY,
"delete_job": DELETE_JOB_QUERY,
}


@@ -1,3 +1,4 @@
from .queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
# LOCAL
from .job.job_queries import DELETE_JOB_QUERY, JOB_INSERT_QUERY
__all__ = ["JOB_INSERT_QUERY", "DELETE_JOB_QUERY"]


@@ -0,0 +1,68 @@
# STL
import logging
from typing import Any
# LOCAL
from api.backend.database.utils import format_list_for_query
from api.backend.database.common import query, insert, update
JOB_INSERT_QUERY = """
INSERT INTO jobs
(id, url, elements, user, time_created, result, status, chat, job_options, agent_mode, prompt)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
DELETE_JOB_QUERY = """
DELETE FROM jobs WHERE id IN ()
"""
LOG = logging.getLogger("Database")
def insert_job(item: dict[str, Any]) -> None:
insert(
JOB_INSERT_QUERY,
(
item["id"],
item["url"],
item["elements"],
item["user"],
item["time_created"],
item["result"],
item["status"],
item["chat"],
item["job_options"],
item["agent_mode"],
item["prompt"],
),
)
LOG.info(f"Inserted item: {item}")
async def get_queued_job():
queued_job_query = (
"SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
)
res = query(queued_job_query)
LOG.info(f"Got queued job: {res}")
return res[0] if res else None
async def update_job(ids: list[str], field: str, value: Any):
query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
res = update(query, tuple([value] + ids))
LOG.info(f"Updated job: {res}")
async def delete_jobs(jobs: list[str]):
if not jobs:
LOG.info("No jobs to delete.")
return False
query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}"
res = update(query, tuple(jobs))
LOG.info(f"Deleted jobs: {res}")
return res
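
The worker's actual loop is not part of this diff, but a hypothetical polling pass shows how these queries compose; the "Scraping" status value is a placeholder.

```python
import asyncio

async def poll_once() -> None:
    job = await get_queued_job()   # newest 'Queued' row, or None
    if job is None:
        return
    # Placeholder status transition; run the scrape, then store results.
    await update_job([job["id"]], field="status", value="Scraping")

# asyncio.run(poll_once())
```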


@@ -1,9 +0,0 @@
JOB_INSERT_QUERY = """
INSERT INTO jobs
(id, url, elements, user, time_created, result, status, chat, job_options)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
DELETE_JOB_QUERY = """
DELETE FROM jobs WHERE id IN ()
"""


@@ -0,0 +1,41 @@
# LOCAL
from api.backend.database.common import query
async def average_elements_per_link(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
AVG(json_array_length(elements)) AS average_elements,
COUNT(*) AS count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = query(job_query, (user,))
return results
async def get_jobs_per_day(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
COUNT(*) AS job_count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = query(job_query, (user,))
return results
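
Both stats queries group completed jobs by `DATE(time_created)`, so each row carries a `date` plus its aggregate. A hedged reading sketch (the account email is a placeholder):

```python
import asyncio

async def show_jobs_per_day(user_email: str) -> None:
    for row in await get_jobs_per_day(user_email):
        print(row["date"], row["job_count"])   # e.g. 2025-05-16 3

# asyncio.run(show_jobs_per_day("user@example.com"))
```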


@@ -27,4 +27,8 @@ CREATE TABLE IF NOT EXISTS cron_jobs (
time_updated DATETIME NOT NULL,
FOREIGN KEY (job_id) REFERENCES jobs(id)
);
ALTER TABLE jobs ADD COLUMN agent_mode BOOLEAN NOT NULL DEFAULT FALSE;
ALTER TABLE jobs ADD COLUMN prompt STRING;
ALTER TABLE jobs ADD COLUMN favorite BOOLEAN NOT NULL DEFAULT FALSE;
"""


@@ -1,24 +1,52 @@
import os
from api.backend.database.common import connect, QUERIES, insert
# STL
import logging
import sqlite3
# LOCAL
from api.backend.constants import (
DEFAULT_USER_EMAIL,
REGISTRATION_ENABLED,
DEFAULT_USER_PASSWORD,
DEFAULT_USER_FULL_NAME,
)
from api.backend.auth.auth_utils import get_password_hash
from api.backend.database.common import insert, connect
from api.backend.database.schema import INIT_QUERY
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Database")
def init_database():
def execute_startup_query():
cursor = connect()
for query in QUERIES["init"].strip().split(";"):
if query.strip():
for query in INIT_QUERY.strip().split(";"):
query = query.strip()
if not query:
continue
try:
LOG.info(f"Executing query: {query}")
_ = cursor.execute(query)
if os.environ.get("REGISTRATION_ENABLED", "True") == "False":
default_user_email = os.environ.get("DEFAULT_USER_EMAIL")
default_user_password = os.environ.get("DEFAULT_USER_PASSWORD")
default_user_full_name = os.environ.get("DEFAULT_USER_FULL_NAME")
except sqlite3.OperationalError as e:
if "duplicate column name" in str(e).lower():
LOG.warning(f"Skipping duplicate column error: {e}")
continue
else:
LOG.error(f"Error executing query: {query}")
raise
cursor.close()
def init_database():
execute_startup_query()
if not REGISTRATION_ENABLED:
default_user_email = DEFAULT_USER_EMAIL
default_user_password = DEFAULT_USER_PASSWORD
default_user_full_name = DEFAULT_USER_FULL_NAME
if (
not default_user_email
@@ -39,5 +67,3 @@ def init_database():
default_user_full_name,
),
)
cursor.close()


@@ -0,0 +1,30 @@
# STL
import json
from typing import Any
def format_list_for_query(ids: list[str]):
return (
f"({','.join(['?' for _ in ids])})" # Returns placeholders, e.g., "(?, ?, ?)"
)
def format_sql_row_to_python(row: dict[str, Any]):
new_row: dict[str, Any] = {}
for key, value in row.items():
if isinstance(value, str):
try:
new_row[key] = json.loads(value)
except json.JSONDecodeError:
new_row[key] = value
else:
new_row[key] = value
return new_row
def format_json(items: list[Any]):
for idx, item in enumerate(items):
if isinstance(item, (dict, list)):
formatted_item = json.dumps(item)
items[idx] = formatted_item
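
For reference, how these helpers compose; the same placeholder/IN pattern appears in `update_job` and `delete_jobs` in this diff, and the row values here are invented.

```python
ids = ["a1", "b2", "c3"]
sql = f"DELETE FROM jobs WHERE id IN {format_list_for_query(ids)}"
print(sql)  # DELETE FROM jobs WHERE id IN (?,?,?)

row = {"result": '[{"ok": true}]', "status": "Completed"}
print(format_sql_row_to_python(row))
# {'result': [{'ok': True}], 'status': 'Completed'}
```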


@@ -1,17 +1,9 @@
from .job import (
insert,
update_job,
delete_jobs,
get_jobs_per_day,
get_queued_job,
average_elements_per_link,
)
# LOCAL
from .job import insert, update_job, delete_jobs, get_queued_job
__all__ = [
"insert",
"update_job",
"delete_jobs",
"get_jobs_per_day",
"get_queued_job",
"average_elements_per_link",
]


@@ -1,15 +1,19 @@
# STL
import uuid
import logging
import datetime
from typing import Any
import uuid
from api.backend.database.common import insert, query
from api.backend.models import CronJob
from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
from apscheduler.triggers.cron import CronTrigger # type: ignore
# PDM
from apscheduler.triggers.cron import CronTrigger
from apscheduler.schedulers.background import BackgroundScheduler
# LOCAL
from api.backend.job import insert as insert_job
import logging
from api.backend.schemas.cron import CronJob
from api.backend.database.common import query, insert
LOG = logging.getLogger("Cron Scheduler")
LOG = logging.getLogger("Cron")
def insert_cron_job(cron_job: CronJob):
@@ -17,6 +21,7 @@ def insert_cron_job(cron_job: CronJob):
INSERT INTO cron_jobs (id, user_email, job_id, cron_expression, time_created, time_updated)
VALUES (?, ?, ?, ?, ?, ?)
"""
values = (
cron_job.id,
cron_job.user_email,
@@ -36,6 +41,7 @@ def delete_cron_job(id: str, user_email: str):
DELETE FROM cron_jobs
WHERE id = ? AND user_email = ?
"""
values = (id, user_email)
insert(query, values)


@@ -3,20 +3,18 @@ import logging
from typing import Any
# LOCAL
from api.backend.utils import format_list_for_query
from api.backend.database.common import (
insert as common_insert,
query as common_query,
QUERIES,
update as common_update,
)
from api.backend.database.utils import format_list_for_query
from api.backend.database.common import query as common_query
from api.backend.database.common import insert as common_insert
from api.backend.database.common import update as common_update
from api.backend.database.queries.job.job_queries import JOB_INSERT_QUERY
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Job")
def insert(item: dict[str, Any]) -> None:
common_insert(
QUERIES["insert_job"],
JOB_INSERT_QUERY,
(
item["id"],
item["url"],
@@ -27,9 +25,12 @@ def insert(item: dict[str, Any]) -> None:
item["status"],
item["chat"],
item["job_options"],
item["agent_mode"],
item["prompt"],
),
)
LOG.info(f"Inserted item: {item}")
LOG.debug(f"Inserted item: {item}")
async def get_queued_job():
@@ -37,61 +38,22 @@ async def get_queued_job():
"SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
)
res = common_query(query)
LOG.info(f"Got queued job: {res}")
LOG.debug(f"Got queued job: {res}")
return res[0] if res else None
async def update_job(ids: list[str], field: str, value: Any):
query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
res = common_update(query, tuple([value] + ids))
LOG.info(f"Updated job: {res}")
LOG.debug(f"Updated job: {res}")
async def delete_jobs(jobs: list[str]):
if not jobs:
LOG.info("No jobs to delete.")
LOG.debug("No jobs to delete.")
return False
query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}"
res = common_update(query, tuple(jobs))
return res > 0
async def average_elements_per_link(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
AVG(json_array_length(elements)) AS average_elements,
COUNT(*) AS count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = common_query(job_query, (user,))
return results
async def get_jobs_per_day(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
COUNT(*) AS job_count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = common_query(job_query, (user,))
return results


@@ -0,0 +1,248 @@
# STL
import csv
import uuid
import random
import logging
import datetime
from io import StringIO
# PDM
from fastapi import Depends, APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from apscheduler.triggers.cron import CronTrigger # type: ignore
# LOCAL
from api.backend.job import insert, update_job, delete_jobs
from api.backend.constants import MEDIA_DIR, MEDIA_TYPES, RECORDINGS_DIR
from api.backend.scheduler import scheduler
from api.backend.schemas.job import Job, UpdateJobs, DownloadJob, DeleteScrapeJobs
from api.backend.auth.schemas import User
from api.backend.schemas.cron import CronJob, DeleteCronJob
from api.backend.database.utils import format_list_for_query
from api.backend.auth.auth_utils import get_current_user
from api.backend.database.common import query
from api.backend.job.utils.text_utils import clean_text
from api.backend.job.models.job_options import FetchOptions
from api.backend.routers.handle_exceptions import handle_exceptions
from api.backend.job.utils.clean_job_format import clean_job_format
from api.backend.job.cron_scheduling.cron_scheduling import (
get_cron_jobs,
delete_cron_job,
insert_cron_job,
get_cron_job_trigger,
insert_job_from_cron_job,
)
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results
LOG = logging.getLogger("Job")
job_router = APIRouter()
@job_router.post("/update")
@handle_exceptions(logger=LOG)
async def update(update_jobs: UpdateJobs, _: User = Depends(get_current_user)):
"""Used to update jobs"""
await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)
return JSONResponse(content={"message": "Jobs updated successfully."})
@job_router.post("/submit-scrape-job")
@handle_exceptions(logger=LOG)
async def submit_scrape_job(job: Job):
LOG.info(f"Recieved job: {job}")
job.id = uuid.uuid4().hex
job_dict = job.model_dump()
insert(job_dict)
return JSONResponse(
content={"id": job.id, "message": "Job submitted successfully."}
)
@job_router.post("/retrieve-scrape-jobs")
@handle_exceptions(logger=LOG)
async def retrieve_scrape_jobs(
fetch_options: FetchOptions, user: User = Depends(get_current_user)
):
LOG.info(f"Retrieving jobs for account: {user.email}")
ATTRIBUTES = "chat" if fetch_options.chat else "*"
job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?"
results = query(job_query, (user.email,))
return JSONResponse(content=jsonable_encoder(results[::-1]))
@job_router.get("/job/{id}")
@handle_exceptions(logger=LOG)
async def job(id: str, user: User = Depends(get_current_user)):
LOG.info(f"Retrieving jobs for account: {user.email}")
job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?"
results = query(job_query, (user.email, id))
return JSONResponse(content=jsonable_encoder(results))
@job_router.post("/download")
@handle_exceptions(logger=LOG)
async def download(download_job: DownloadJob):
LOG.info(f"Downloading job with ids: {download_job.ids}")
job_query = (
f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}"
)
results = query(job_query, tuple(download_job.ids))
if download_job.job_format == "csv":
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
headers = [
"id",
"url",
"element_name",
"xpath",
"text",
"user",
"time_created",
]
csv_writer.writerow(headers)
for result in results:
for res in result["result"]:
for url, elements in res.items():
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", "")).strip()
if text:
csv_writer.writerow(
[
result.get("id", "")
+ "-"
+ str(random.randint(0, 1000000)),
url,
element_name,
value.get("xpath", ""),
text,
result.get("user", ""),
result.get("time_created", ""),
]
)
_ = csv_buffer.seek(0)
response = StreamingResponse(
csv_buffer,
media_type="text/csv",
)
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response
elif download_job.job_format == "md":
response = StreamingResponse(
stream_md_from_job_results(results),
media_type="text/markdown",
)
response.headers["Content-Disposition"] = "attachment; filename=export.md"
return response
@job_router.get("/job/{id}/convert-to-csv")
@handle_exceptions(logger=LOG)
async def convert_to_csv(id: str):
job_query = f"SELECT * FROM jobs WHERE id = ?"
results = query(job_query, (id,))
return JSONResponse(content=clean_job_format(results))
@job_router.post("/delete-scrape-jobs")
@handle_exceptions(logger=LOG)
async def delete(delete_scrape_jobs: DeleteScrapeJobs):
result = await delete_jobs(delete_scrape_jobs.ids)
return (
JSONResponse(content={"message": "Jobs successfully deleted."})
if result
else JSONResponse(content={"error": "Jobs not deleted."})
)
@job_router.post("/schedule-cron-job")
@handle_exceptions(logger=LOG)
async def schedule_cron_job(cron_job: CronJob):
if not cron_job.id:
cron_job.id = uuid.uuid4().hex
if not cron_job.time_created:
cron_job.time_created = datetime.datetime.now()
if not cron_job.time_updated:
cron_job.time_updated = datetime.datetime.now()
insert_cron_job(cron_job)
queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,))
scheduler.add_job(
insert_job_from_cron_job,
get_cron_job_trigger(cron_job.cron_expression),
id=cron_job.id,
args=[queried_job[0]],
)
return JSONResponse(content={"message": "Cron job scheduled successfully."})
@job_router.post("/delete-cron-job")
@handle_exceptions(logger=LOG)
async def delete_cron_job_request(request: DeleteCronJob):
if not request.id:
return JSONResponse(
content={"error": "Cron job id is required."}, status_code=400
)
delete_cron_job(request.id, request.user_email)
scheduler.remove_job(request.id)
return JSONResponse(content={"message": "Cron job deleted successfully."})
@job_router.get("/cron-jobs")
@handle_exceptions(logger=LOG)
async def get_cron_jobs_request(user: User = Depends(get_current_user)):
cron_jobs = get_cron_jobs(user.email)
return JSONResponse(content=jsonable_encoder(cron_jobs))
@job_router.get("/recordings/{id}")
@handle_exceptions(logger=LOG)
async def get_recording(id: str):
path = RECORDINGS_DIR / f"{id}.mp4"
if not path.exists():
return JSONResponse(content={"error": "Recording not found."}, status_code=404)
return FileResponse(
path, headers={"Content-Type": "video/mp4", "Accept-Ranges": "bytes"}
)
@job_router.get("/get-media")
@handle_exceptions(logger=LOG)
async def get_media(id: str):
files: dict[str, list[str]] = {}
for media_type in MEDIA_TYPES:
path = MEDIA_DIR / media_type / f"{id}"
files[media_type] = [file.name for file in path.glob("*")]
return JSONResponse(content={"files": files})
@job_router.get("/media")
@handle_exceptions(logger=LOG)
async def get_media_file(id: str, type: str, file: str):
path = MEDIA_DIR / type / f"{id}" / file
if not path.exists():
return JSONResponse(content={"error": "Media file not found."}, status_code=404)
return FileResponse(path)


@@ -1,3 +1,5 @@
from .job_options import JobOptions
# LOCAL
from .job import Element, CapturedElement
from .job_options import Proxy, JobOptions
__all__ = ["JobOptions"]
__all__ = ["JobOptions", "CapturedElement", "Element", "Proxy"]


@@ -0,0 +1,15 @@
from typing import Optional
import pydantic
class Element(pydantic.BaseModel):
name: str
xpath: str
url: Optional[str] = None
class CapturedElement(pydantic.BaseModel):
xpath: str
text: str
name: str


@@ -1,8 +1,19 @@
from pydantic import BaseModel
# STL
from typing import Any, Optional
# PDM
from pydantic import BaseModel
# LOCAL
from api.backend.job.models.site_map import SiteMap
class Proxy(BaseModel):
server: str
username: Optional[str] = None
password: Optional[str] = None
class FetchOptions(BaseModel):
chat: Optional[bool] = None
@@ -10,6 +21,7 @@ class FetchOptions(BaseModel):
class JobOptions(BaseModel):
multi_page_scrape: bool = False
custom_headers: dict[str, Any] = {}
proxies: list[str] = []
proxies: list[Proxy] = []
site_map: Optional[SiteMap] = None
collect_media: bool = False
custom_cookies: list[dict[str, Any]] = []


@@ -0,0 +1,49 @@
# STL
import logging
from typing import Any, Optional
from urllib.parse import urlparse
# PDM
from playwright.async_api import Page, BrowserContext
LOG = logging.getLogger("Job")
async def add_custom_cookies(
custom_cookies: list[dict[str, Any]],
url: str,
context: BrowserContext,
) -> None:
parsed_url = urlparse(url)
domain = parsed_url.netloc
for cookie in custom_cookies:
cookie_dict = {
"name": cookie.get("name", ""),
"value": cookie.get("value", ""),
"domain": domain,
"path": "/",
}
LOG.info(f"Adding cookie: {cookie_dict}")
await context.add_cookies([cookie_dict]) # type: ignore
async def add_custom_headers(
custom_headers: dict[str, Any],
page: Page,
) -> None:
await page.set_extra_http_headers(custom_headers)
async def add_custom_items(
url: str,
page: Page,
cookies: Optional[list[dict[str, Any]]] = None,
headers: Optional[dict[str, Any]] = None,
) -> None:
if cookies:
await add_custom_cookies(cookies, url, page.context)
if headers:
await add_custom_headers(headers, page)
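
A minimal sketch of a call site for `add_custom_items`; in the real flow the page comes from Camoufox, and these cookie/header values are placeholders.

```python
from typing import Any

from playwright.async_api import Page

async def apply_items(page: Page) -> None:
    cookies: list[dict[str, Any]] = [{"name": "session", "value": "abc123"}]
    headers: dict[str, Any] = {"Accept-Language": "en-US"}
    await add_custom_items("https://example.com", page, cookies, headers)
```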


@@ -1,20 +1,24 @@
# STL
import os
from pathlib import Path
from urllib.parse import urlparse
import re
import logging
from typing import Dict, List
from pathlib import Path
from urllib.parse import urljoin, urlparse
# PDM
import aiohttp
from playwright.async_api import Page
from api.backend.utils import LOG
LOG = logging.getLogger("Job")
async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
async def collect_media(id: str, page: Page) -> dict[str, list[dict[str, str]]]:
media_types = {
"images": "img",
"videos": "video",
"audio": "audio",
"pdfs": 'a[href$=".pdf"]',
"pdfs": 'a[href$=".pdf"], a[href*=".pdf#page="]',
"documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]',
"presentations": 'a[href$=".ppt"], a[href$=".pptx"]',
"spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]',
@@ -48,6 +52,11 @@ async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
root_domain = f"{root_url.scheme}://{root_url.netloc}"
url = f"{root_domain}{url}"
if url and re.match(r"^[\w\-]+/", url):
root_url = urlparse(page.url)
root_domain = f"{root_url.scheme}://{root_url.netloc}"
url = urljoin(root_domain + "/", url)
if url and url.startswith(("http://", "https://")):
try:
parsed = urlparse(url)
@@ -67,15 +76,20 @@ async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
}.get(media_type, "")
filename += ext
file_path = media_dir / filename
if not os.path.exists(media_dir / id):
os.makedirs(media_dir / id, exist_ok=True)
file_path = media_dir / id / f"{filename}"
async with session.get(url) as response:
response.raise_for_status()
with open(file_path, "wb") as f:
while True:
chunk = await response.content.read(8192)
if not chunk:
break
f.write(chunk)
urls.append({"url": url, "local_path": str(file_path)})
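
The new relative-path branch leans on standard-library `urljoin` semantics; a quick check of what it produces:

```python
from urllib.parse import urljoin

# Relative media src like "assets/img.png" resolved against the page origin.
print(urljoin("https://example.com" + "/", "assets/img.png"))
# https://example.com/assets/img.png
```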


@@ -1,83 +1,80 @@
import logging
# STL
import random
from typing import Any, Optional, cast
import logging
from typing import Any, cast
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup, Tag
# PDM
from bs4 import Tag, BeautifulSoup
from lxml import etree
from camoufox import AsyncCamoufox
from playwright.async_api import Page
from urllib.parse import urlparse, urljoin
from api.backend.models import Element, CapturedElement
from api.backend.job.scraping.scraping_utils import scrape_content
# LOCAL
from api.backend.constants import RECORDINGS_ENABLED
from api.backend.job.models import Element, CapturedElement
from api.backend.job.utils.text_utils import clean_text
from api.backend.job.scraping.add_custom import add_custom_items
from api.backend.job.scraping.scraping_utils import (
sxpath,
is_same_domain,
scrape_content,
)
from api.backend.job.site_mapping.site_mapping import handle_site_mapping
LOG = logging.getLogger(__name__)
def is_same_domain(url: str, original_url: str) -> bool:
parsed_url = urlparse(url)
parsed_original_url = urlparse(original_url)
return parsed_url.netloc == parsed_original_url.netloc or parsed_url.netloc == ""
def clean_xpath(xpath: str) -> str:
parts = xpath.split("/")
clean_parts = ["/" if part == "" else part for part in parts]
clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'")
LOG.info(f"Cleaned xpath: {clean_xpath}")
return clean_xpath
def sxpath(context: etree._Element, xpath: str):
return context.xpath(xpath)
LOG = logging.getLogger("Job")
async def make_site_request(
id: str,
url: str,
headers: Optional[dict[str, Any]],
multi_page_scrape: bool = False,
job_options: dict[str, Any],
visited_urls: set[str] = set(),
pages: set[tuple[str, str]] = set(),
original_url: str = "",
proxies: Optional[list[str]] = None,
site_map: Optional[dict[str, Any]] = None,
collect_media: bool = False,
):
headers = job_options["custom_headers"]
multi_page_scrape = job_options["multi_page_scrape"]
proxies = job_options["proxies"]
site_map = job_options["site_map"]
collect_media = job_options["collect_media"]
custom_cookies = job_options["custom_cookies"]
if url in visited_urls:
return
proxy = None
if proxies:
proxy = random.choice(proxies)
LOG.info(f"Using proxy: {proxy}")
async with AsyncCamoufox(headless=True, proxy=proxy) as browser:
async with AsyncCamoufox(headless=not RECORDINGS_ENABLED, proxy=proxy) as browser:
page: Page = await browser.new_page()
await page.set_viewport_size({"width": 1920, "height": 1080})
if headers:
await page.set_extra_http_headers(headers)
# Add cookies and headers
await add_custom_items(url, page, custom_cookies, headers)
LOG.info(f"Visiting URL: {url}")
try:
await page.goto(url, timeout=60000)
await page.wait_for_load_state("networkidle", timeout=10000)
await page.wait_for_load_state("networkidle")
final_url = page.url
visited_urls.add(url)
visited_urls.add(final_url)
html_content = await scrape_content(page, pages, collect_media)
html_content = await scrape_content(id, page, pages, collect_media)
html_content = await page.content()
pages.add((html_content, final_url))
if site_map:
await handle_site_mapping(
site_map, page, pages, collect_media=collect_media
id, site_map, page, pages, collect_media=collect_media
)
finally:
@@ -104,15 +101,12 @@ async def make_site_request(
if link not in visited_urls and is_same_domain(link, original_url):
await make_site_request(
id,
link,
headers=headers,
multi_page_scrape=multi_page_scrape,
job_options=job_options,
visited_urls=visited_urls,
pages=pages,
original_url=original_url,
proxies=proxies,
site_map=site_map,
collect_media=collect_media,
)
@@ -127,11 +121,13 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
for e in el: # type: ignore
text = (
"\t".join(str(t) for t in e.itertext())
" ".join(str(t) for t in e.itertext())
if isinstance(e, etree._Element)
else str(e) # type: ignore
)
text = clean_text(text)
captured_element = CapturedElement(
xpath=elem.xpath, text=text, name=elem.name
)
@@ -145,27 +141,21 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
async def scrape(
id: str,
url: str,
xpaths: list[Element],
headers: Optional[dict[str, Any]] = None,
multi_page_scrape: bool = False,
proxies: Optional[list[str]] = None,
site_map: Optional[dict[str, Any]] = None,
collect_media: bool = False,
job_options: dict[str, Any],
):
visited_urls: set[str] = set()
pages: set[tuple[str, str]] = set()
await make_site_request(
id,
url,
headers=headers,
multi_page_scrape=multi_page_scrape,
job_options=job_options,
visited_urls=visited_urls,
pages=pages,
original_url=url,
proxies=proxies,
site_map=site_map,
collect_media=collect_media,
)
elements: list[dict[str, dict[str, list[CapturedElement]]]] = []


@@ -1,14 +1,21 @@
# STL
import asyncio
import logging
from typing import Set, Tuple
from urllib.parse import urlparse
# PDM
from lxml import etree
from playwright.async_api import Page
from api.backend.utils import LOG
# LOCAL
from api.backend.job.scraping.collect_media import collect_media as collect_media_utils
LOG = logging.getLogger("Job")
async def scrape_content(
page: Page, pages: Set[Tuple[str, str]], collect_media: bool
id: str, page: Page, pages: Set[Tuple[str, str]], collect_media: bool
) -> str:
last_height = await page.evaluate("document.body.scrollHeight")
@@ -27,6 +34,25 @@ async def scrape_content(
if collect_media:
LOG.info("Collecting media")
await collect_media_utils(page)
await collect_media_utils(id, page)
return html
def is_same_domain(url: str, original_url: str) -> bool:
parsed_url = urlparse(url)
parsed_original_url = urlparse(original_url)
return parsed_url.netloc == parsed_original_url.netloc or parsed_url.netloc == ""
def clean_xpath(xpath: str) -> str:
parts = xpath.split("/")
clean_parts = ["/" if part == "" else part for part in parts]
clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'")
LOG.info(f"Cleaned xpath: {clean_xpath}")
return clean_xpath
def sxpath(context: etree._Element, xpath: str):
return context.xpath(xpath)


@@ -1,14 +1,17 @@
import logging
# STL
import asyncio
import logging
from copy import deepcopy
from typing import Any
# PDM
from playwright.async_api import Page
# LOCAL
from api.backend.job.models.site_map import Action, SiteMap
from api.backend.job.scraping.scraping_utils import scrape_content
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Job")
def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]:
@@ -24,7 +27,6 @@ def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]:
async def handle_input(action: Action, page: Page) -> bool:
try:
element = page.locator(f"xpath={action.xpath}")
await element.wait_for(state="visible", timeout=10000)
LOG.info(f"Sending keys: {action.input} to element: {action.xpath}")
await element.fill(action.input)
return True
@@ -36,7 +38,6 @@ async def handle_input(action: Action, page: Page) -> bool:
async def handle_click(action: Action, page: Page) -> bool:
try:
element = page.locator(f"xpath={action.xpath}")
await element.wait_for(state="visible", timeout=10000)
LOG.info(f"Clicking element: {action.xpath}")
await element.click()
return True
@@ -52,6 +53,7 @@ ACTION_MAP = {
async def handle_site_mapping(
id: str,
site_map_dict: dict[str, Any],
page: Page,
pages: set[tuple[str, str]],
@@ -68,11 +70,11 @@ async def handle_site_mapping(
await asyncio.sleep(2)
await scrape_content(page, pages, collect_media=collect_media)
await scrape_content(id, page, pages, collect_media=collect_media)
cleared_site_map_dict = clear_done_actions(site_map_dict)
if cleared_site_map_dict["actions"]:
await handle_site_mapping(
cleared_site_map_dict, page, pages, collect_media=collect_media
id, cleared_site_map_dict, page, pages, collect_media=collect_media
)


@@ -1,6 +1,8 @@
# STL
from typing import Any
from api.backend.utils import clean_text
# LOCAL
from api.backend.job.utils.text_utils import clean_text
def clean_job_format(jobs: list[dict[str, Any]]) -> dict[str, Any]:

View File

@@ -1,6 +1,8 @@
# STL
from typing import Any
from api.backend.utils import clean_text
# LOCAL
from api.backend.job.utils.text_utils import clean_text
def stream_md_from_job_results(jobs: list[dict[str, Any]]):

View File

@@ -0,0 +1,10 @@
def clean_text(text: str):
text = text.strip()
text = text.replace("\n", " ")
text = text.replace("\t", " ")
text = text.replace("\r", " ")
text = text.replace("\f", " ")
text = text.replace("\v", " ")
text = text.replace("\b", " ")
text = text.replace("\a", " ")
return text
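For reference, a minimal illustration of the new clean_text helper: the string is stripped, then each control/whitespace character is replaced with a space:
assert clean_text(" a\tb\nc ") == "a b c"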

View File

@@ -0,0 +1,31 @@
# STL
import logging
import traceback
from typing import Any, Union, Callable, Awaitable
from functools import wraps
# PDM
from fastapi.responses import JSONResponse
def handle_exceptions(
logger: logging.Logger,
) -> Callable[
[Callable[..., Awaitable[Any]]], Callable[..., Awaitable[Union[Any, JSONResponse]]]
]:
def decorator(
func: Callable[..., Awaitable[Any]],
) -> Callable[..., Awaitable[Union[Any, JSONResponse]]]:
@wraps(func)
async def wrapper(*args: Any, **kwargs: Any) -> Union[Any, JSONResponse]:
try:
return await func(*args, **kwargs)
except Exception as e:
logger.error(f"Exception occurred: {e}")
traceback.print_exc()
return JSONResponse(content={"error": str(e)}, status_code=500)
return wrapper
return decorator
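Usage mirrors the statistics router shown later in this diff: stack the decorator under the route decorator, and any uncaught exception is logged and converted into a 500 JSONResponse. A minimal sketch (the route path and query helper are hypothetical):
@stats_router.get("/statistics/example")  # hypothetical route
@handle_exceptions(logger=LOG)
async def example(user: User = Depends(get_current_user)):
    return await some_query(user.email)  # hypothetical helper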

View File

@@ -1,233 +0,0 @@
# STL
import datetime
import uuid
import traceback
from io import StringIO
import csv
import logging
import random
# PDM
from fastapi import Depends, APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse, StreamingResponse
from api.backend.scheduler import scheduler
from apscheduler.triggers.cron import CronTrigger # type: ignore
# LOCAL
from api.backend.job import insert, update_job, delete_jobs
from api.backend.models import (
DeleteCronJob,
UpdateJobs,
DownloadJob,
DeleteScrapeJobs,
Job,
CronJob,
)
from api.backend.schemas import User
from api.backend.auth.auth_utils import get_current_user
from api.backend.utils import clean_text, format_list_for_query
from api.backend.job.models.job_options import FetchOptions
from api.backend.database.common import query
from api.backend.job.cron_scheduling.cron_scheduling import (
delete_cron_job,
get_cron_job_trigger,
insert_cron_job,
get_cron_jobs,
insert_job_from_cron_job,
)
from api.backend.job.utils.clean_job_format import clean_job_format
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results
LOG = logging.getLogger(__name__)
job_router = APIRouter()
@job_router.post("/update")
async def update(update_jobs: UpdateJobs, _: User = Depends(get_current_user)):
"""Used to update jobs"""
await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)
@job_router.post("/submit-scrape-job")
async def submit_scrape_job(job: Job):
LOG.info(f"Recieved job: {job}")
try:
job.id = uuid.uuid4().hex
job_dict = job.model_dump()
insert(job_dict)
return JSONResponse(content={"id": job.id})
except Exception as e:
LOG.error(f"Exception occurred: {traceback.format_exc()}")
return JSONResponse(content={"error": str(e)}, status_code=500)
@job_router.post("/retrieve-scrape-jobs")
async def retrieve_scrape_jobs(
fetch_options: FetchOptions, user: User = Depends(get_current_user)
):
LOG.info(f"Retrieving jobs for account: {user.email}")
ATTRIBUTES = "chat" if fetch_options.chat else "*"
try:
job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?"
results = query(job_query, (user.email,))
return JSONResponse(content=jsonable_encoder(results[::-1]))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
return JSONResponse(content=[], status_code=500)
@job_router.get("/job/{id}")
async def job(id: str, user: User = Depends(get_current_user)):
LOG.info(f"Retrieving jobs for account: {user.email}")
try:
job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?"
results = query(job_query, (user.email, id))
return JSONResponse(content=jsonable_encoder(results))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
return JSONResponse(content={"error": str(e)}, status_code=500)
@job_router.post("/download")
async def download(download_job: DownloadJob):
LOG.info(f"Downloading job with ids: {download_job.ids}")
try:
job_query = (
f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}"
)
results = query(job_query, tuple(download_job.ids))
if download_job.job_format == "csv":
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
headers = [
"id",
"url",
"element_name",
"xpath",
"text",
"user",
"time_created",
]
csv_writer.writerow(headers)
for result in results:
for res in result["result"]:
for url, elements in res.items():
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", "")).strip()
if text:
csv_writer.writerow(
[
result.get("id", "")
+ "-"
+ str(random.randint(0, 1000000)),
url,
element_name,
value.get("xpath", ""),
text,
result.get("user", ""),
result.get("time_created", ""),
]
)
_ = csv_buffer.seek(0)
response = StreamingResponse(
csv_buffer,
media_type="text/csv",
)
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response
elif download_job.job_format == "md":
response = StreamingResponse(
stream_md_from_job_results(results),
media_type="text/markdown",
)
response.headers["Content-Disposition"] = "attachment; filename=export.md"
return response
except Exception as e:
LOG.error(f"Exception occurred: {e}")
traceback.print_exc()
return {"error": str(e)}
@job_router.get("/job/{id}/convert-to-csv")
async def convert_to_csv(id: str):
try:
job_query = f"SELECT * FROM jobs WHERE id = ?"
results = query(job_query, (id,))
return JSONResponse(content=clean_job_format(results))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
traceback.print_exc()
return {"error": str(e)}
@job_router.post("/delete-scrape-jobs")
async def delete(delete_scrape_jobs: DeleteScrapeJobs):
result = await delete_jobs(delete_scrape_jobs.ids)
return (
JSONResponse(content={"message": "Jobs successfully deleted."})
if result
else JSONResponse({"error": "Jobs not deleted."})
)
@job_router.post("/schedule-cron-job")
async def schedule_cron_job(cron_job: CronJob):
if not cron_job.id:
cron_job.id = uuid.uuid4().hex
if not cron_job.time_created:
cron_job.time_created = datetime.datetime.now()
if not cron_job.time_updated:
cron_job.time_updated = datetime.datetime.now()
insert_cron_job(cron_job)
queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,))
scheduler.add_job(
insert_job_from_cron_job,
get_cron_job_trigger(cron_job.cron_expression),
id=cron_job.id,
args=[queried_job[0]],
)
return JSONResponse(content={"message": "Cron job scheduled successfully."})
@job_router.post("/delete-cron-job")
async def delete_cron_job_request(request: DeleteCronJob):
if not request.id:
return JSONResponse(
content={"error": "Cron job id is required."}, status_code=400
)
delete_cron_job(request.id, request.user_email)
scheduler.remove_job(request.id)
return JSONResponse(content={"message": "Cron job deleted successfully."})
@job_router.get("/cron-jobs")
async def get_cron_jobs_request(user: User = Depends(get_current_user)):
cron_jobs = get_cron_jobs(user.email)
return JSONResponse(content=jsonable_encoder(cron_jobs))

View File

@@ -1,3 +1,4 @@
from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
# PDM
from apscheduler.schedulers.background import BackgroundScheduler
scheduler = BackgroundScheduler()

View File

@@ -0,0 +1,17 @@
from typing import Optional, Union
from datetime import datetime
import pydantic
class CronJob(pydantic.BaseModel):
id: Optional[str] = None
user_email: str
job_id: str
cron_expression: str
time_created: Optional[Union[datetime, str]] = None
time_updated: Optional[Union[datetime, str]] = None
class DeleteCronJob(pydantic.BaseModel):
id: str
user_email: str
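For context, a minimal sketch of the payload /schedule-cron-job expects under this schema (values illustrative; the handler above fills in id and the timestamps when they are omitted):
CronJob(user_email="user@example.com", job_id="abc123", cron_expression="0 6 * * *")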

View File

@@ -1,24 +1,24 @@
# STL
from typing import Any, Literal, Optional, Union
from datetime import datetime
# LOCAL
from api.backend.job.models.job_options import JobOptions
# PDM
import pydantic
class Element(pydantic.BaseModel):
name: str
xpath: str
url: Optional[str] = None
from api.backend.job.models import Element, CapturedElement
class CapturedElement(pydantic.BaseModel):
xpath: str
text: str
name: str
class Job(pydantic.BaseModel):
id: Optional[str] = None
url: str
elements: list[Element]
user: str = ""
time_created: Optional[Union[datetime, str]] = None
result: list[dict[str, dict[str, list[CapturedElement]]]] = []
job_options: JobOptions
status: str = "Queued"
chat: Optional[str] = None
agent_mode: bool = False
prompt: Optional[str] = None
favorite: bool = False
class RetrieveScrapeJobs(pydantic.BaseModel):
@@ -34,41 +34,7 @@ class DeleteScrapeJobs(pydantic.BaseModel):
ids: list[str]
class GetStatistics(pydantic.BaseModel):
user: str
class UpdateJobs(pydantic.BaseModel):
ids: list[str]
field: str
value: Any
class AI(pydantic.BaseModel):
messages: list[Any]
class Job(pydantic.BaseModel):
id: Optional[str] = None
url: str
elements: list[Element]
user: str = ""
time_created: Optional[Union[datetime, str]] = None
result: list[dict[str, dict[str, list[CapturedElement]]]] = []
job_options: JobOptions
status: str = "Queued"
chat: Optional[str] = None
class CronJob(pydantic.BaseModel):
id: Optional[str] = None
user_email: str
job_id: str
cron_expression: str
time_created: Optional[Union[datetime, str]] = None
time_updated: Optional[Union[datetime, str]] = None
class DeleteCronJob(pydantic.BaseModel):
id: str
user_email: str

View File

@@ -2,28 +2,30 @@
import logging
# PDM
from fastapi import APIRouter, Depends
from fastapi import Depends, APIRouter
# LOCAL
from api.backend.job import (
from api.backend.auth.schemas import User
from api.backend.auth.auth_utils import get_current_user
from api.backend.routers.handle_exceptions import handle_exceptions
from api.backend.database.queries.statistics.statistic_queries import (
get_jobs_per_day,
average_elements_per_link,
)
from api.backend.auth.auth_utils import get_current_user
from api.backend.schemas import User
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Statistics")
stats_router = APIRouter()
@stats_router.get("/statistics/get-average-element-per-link")
@handle_exceptions(logger=LOG)
async def get_average_element_per_link(user: User = Depends(get_current_user)):
return await average_elements_per_link(user.email)
@stats_router.get("/statistics/get-average-jobs-per-day")
@handle_exceptions(logger=LOG)
async def average_jobs_per_day(user: User = Depends(get_current_user)):
data = await get_jobs_per_day(user.email)
return data

View File

@@ -0,0 +1,63 @@
# STL
import os
import sqlite3
from typing import Generator
from unittest.mock import patch
# PDM
import pytest
from proxy import Proxy
# LOCAL
from api.backend.database.schema import INIT_QUERY
from api.backend.tests.constants import TEST_DB_PATH
@pytest.fixture(scope="session", autouse=True)
def running_proxy():
proxy = Proxy(["--hostname", "127.0.0.1", "--port", "8080"])
proxy.setup()
yield proxy
proxy.shutdown()
@pytest.fixture(scope="session", autouse=True)
def patch_database_path():
with patch("api.backend.database.common.DATABASE_PATH", TEST_DB_PATH):
yield
@pytest.fixture(scope="session", autouse=True)
def patch_recordings_enabled():
with patch("api.backend.job.scraping.scraping.RECORDINGS_ENABLED", False):
yield
@pytest.fixture(scope="session")
def test_db_path() -> str:
return TEST_DB_PATH
@pytest.fixture(scope="session", autouse=True)
def test_db(test_db_path: str) -> Generator[str, None, None]:
"""Create a fresh test database for each test function."""
os.makedirs(os.path.dirname(test_db_path), exist_ok=True)
if os.path.exists(test_db_path):
os.remove(test_db_path)
conn = sqlite3.connect(test_db_path)
cursor = conn.cursor()
for query in INIT_QUERY.strip().split(";"):
query = query.strip()
if query:
cursor.execute(query)
conn.commit()
conn.close()
yield test_db_path
if os.path.exists(test_db_path):
os.remove(test_db_path)

View File

@@ -0,0 +1 @@
TEST_DB_PATH = "tests/test_db.sqlite"

View File

@@ -1,7 +1,13 @@
from api.backend.models import Element, Job, JobOptions, CapturedElement
# STL
import uuid
# PDM
from faker import Faker
# LOCAL
from api.backend.job.models import Element, JobOptions, CapturedElement
from api.backend.schemas.job import Job
fake = Faker()

View File

@@ -1,8 +1,13 @@
# STL
from unittest.mock import AsyncMock, patch
# PDM
import pytest
from fastapi.testclient import TestClient
from unittest.mock import AsyncMock, patch
# LOCAL
from api.backend.app import app
from api.backend.models import DownloadJob
from api.backend.schemas.job import DownloadJob
from api.backend.tests.factories.job_factory import create_completed_job
client = TestClient(app)
@@ -13,8 +18,8 @@ mocked_random_int = 123456
@pytest.mark.asyncio
@patch("api.backend.routers.job_router.query")
@patch("api.backend.routers.job_router.random.randint")
@patch("api.backend.job.job_router.query")
@patch("api.backend.job.job_router.random.randint")
async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
# Ensure the mock returns immediately
mock_query.return_value = mock_results

View File

@@ -1,25 +1,110 @@
import pytest
# STL
import logging
from playwright.async_api import async_playwright, Error
from typing import Dict
from datetime import datetime
# PDM
import pytest
from fastapi.testclient import TestClient
from playwright.async_api import Route, Cookie, async_playwright
# LOCAL
from api.backend.app import app
from api.backend.job.models import Proxy, Element, JobOptions
from api.backend.schemas.job import Job
from api.backend.database.common import query
from api.backend.job.scraping.scraping import scrape
from api.backend.job.scraping.add_custom import add_custom_items
logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger(__name__)
client = TestClient(app)
@pytest.mark.asyncio
async def test_proxy():
proxy = "127.0.0.1:8080"
async def test_add_custom_items():
test_cookies = [{"name": "big", "value": "cookie"}]
test_headers = {"User-Agent": "test-agent", "Accept": "application/json"}
async with async_playwright() as p:
browser = await p.firefox.launch(
headless=True, proxy={"server": f"http://{proxy}"}
)
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
with pytest.raises(Error) as excinfo:
await page.goto("http://example.com")
# Set up request interception
captured_headers: Dict[str, str] = {}
assert "NS_ERROR_PROXY_CONNECTION_REFUSED" in str(excinfo.value)
async def handle_route(route: Route) -> None:
nonlocal captured_headers
captured_headers = route.request.headers
await route.continue_()
await page.route("**/*", handle_route)
await add_custom_items(
url="http://example.com",
page=page,
cookies=test_cookies,
headers=test_headers,
)
# Navigate to example.com
await page.goto("http://example.com")
# Verify cookies were added
cookies: list[Cookie] = await page.context.cookies()
test_cookie = next((c for c in cookies if c.get("name") == "big"), None)
assert test_cookie is not None
assert test_cookie.get("value") == "cookie"
assert test_cookie.get("path") == "/" # Default path should be set
assert test_cookie.get("sameSite") == "Lax" # Default sameSite should be set
# Verify headers were added
assert captured_headers.get("user-agent") == "test-agent"
await browser.close()
@pytest.mark.asyncio
async def test_proxies():
job = Job(
url="https://example.com",
elements=[Element(xpath="//div", name="test")],
job_options=JobOptions(
proxies=[
Proxy(
server="127.0.0.1:8080",
username="user",
password="pass",
)
],
),
time_created=datetime.now().isoformat(),
)
response = client.post("/submit-scrape-job", json=job.model_dump())
assert response.status_code == 200
jobs = query("SELECT * FROM jobs")
job = jobs[0]
assert job is not None
assert job["job_options"]["proxies"] == [
{
"server": "127.0.0.1:8080",
"username": "user",
"password": "pass",
}
]
response = await scrape(
id=job["id"],
url=job["url"],
xpaths=[Element(**e) for e in job["elements"]],
job_options=job["job_options"],
)
example_response = response[0]["https://example.com/"]
assert example_response != {}

View File

@@ -0,0 +1,17 @@
# STL
import sqlite3
# LOCAL
from api.backend.database.schema import INIT_QUERY
from api.backend.tests.constants import TEST_DB_PATH
def connect_to_db():
conn = sqlite3.connect(TEST_DB_PATH)
cur = conn.cursor()
for query in INIT_QUERY.split(";"):
cur.execute(query)
conn.commit()
return conn, cur
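A sketch of how a test might use this helper (assuming INIT_QUERY creates the jobs table referenced throughout the suite):
conn, cur = connect_to_db()
cur.execute("SELECT COUNT(*) FROM jobs")  # illustrative query
print(cur.fetchone())
conn.close()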

View File

@@ -1,17 +1,10 @@
from typing import Any, Optional
# STL
import logging
import json
from typing import Optional
LOG = logging.getLogger(__name__)
def clean_text(text: str):
text = text.replace("\r\n", "\n") # Normalize newlines
text = text.replace("\n", "\\n") # Escape newlines
text = text.replace('"', '\\"') # Escape double quotes
return text
def get_log_level(level_name: Optional[str]) -> int:
level = logging.INFO
@@ -20,30 +13,3 @@ def get_log_level(level_name: Optional[str]) -> int:
level = getattr(logging, level_name, logging.INFO)
return level
def format_list_for_query(ids: list[str]):
return (
f"({','.join(['?' for _ in ids])})" # Returns placeholders, e.g., "(?, ?, ?)"
)
def format_sql_row_to_python(row: dict[str, Any]):
new_row: dict[str, Any] = {}
for key, value in row.items():
if isinstance(value, str):
try:
new_row[key] = json.loads(value)
except json.JSONDecodeError:
new_row[key] = value
else:
new_row[key] = value
return new_row
def format_json(items: list[Any]):
for idx, item in enumerate(items):
if isinstance(item, (dict, list)):
formatted_item = json.dumps(item)
items[idx] = formatted_item

View File

@@ -0,0 +1,17 @@
# STL
import os
from pathlib import Path
NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "")
NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "")
SCRAPERR_FRONTEND_URL = os.getenv("SCRAPERR_FRONTEND_URL", "")
EMAIL = os.getenv("EMAIL", "")
TO = os.getenv("TO", "")
SMTP_HOST = os.getenv("SMTP_HOST", "")
SMTP_PORT = int(os.getenv("SMTP_PORT", 587))
SMTP_USER = os.getenv("SMTP_USER", "")
SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
USE_TLS = os.getenv("USE_TLS", "false").lower() == "true"
RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
RECORDINGS_DIR = Path("/project/app/media/recordings")

View File

@@ -1,48 +1,89 @@
import os
from api.backend.job import get_queued_job, update_job
from api.backend.scraping import scrape
from api.backend.models import Element
from fastapi.encoders import jsonable_encoder
# STL
import json
import asyncio
import traceback
import subprocess
from api.backend.database.startup import init_database
# PDM
from fastapi.encoders import jsonable_encoder
from api.backend.worker.post_job_complete.post_job_complete import post_job_complete
# LOCAL
from api.backend.job import update_job, get_queued_job
from api.backend.job.models import Element
from api.backend.worker.logger import LOG
NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "")
NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "")
SCRAPERR_FRONTEND_URL = os.getenv("SCRAPERR_FRONTEND_URL", "")
EMAIL = os.getenv("EMAIL", "")
TO = os.getenv("TO", "")
SMTP_HOST = os.getenv("SMTP_HOST", "")
SMTP_PORT = int(os.getenv("SMTP_PORT", 587))
SMTP_USER = os.getenv("SMTP_USER", "")
SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
USE_TLS = os.getenv("USE_TLS", "false").lower() == "true"
from api.backend.ai.agent.agent import scrape_with_agent
from api.backend.database.startup import init_database
from api.backend.worker.constants import (
TO,
EMAIL,
USE_TLS,
SMTP_HOST,
SMTP_PORT,
SMTP_USER,
SMTP_PASSWORD,
RECORDINGS_DIR,
RECORDINGS_ENABLED,
NOTIFICATION_CHANNEL,
SCRAPERR_FRONTEND_URL,
NOTIFICATION_WEBHOOK_URL,
)
from api.backend.job.scraping.scraping import scrape
from api.backend.worker.post_job_complete.post_job_complete import post_job_complete
async def process_job():
job = await get_queued_job()
ffmpeg_proc = None
status = "Queued"
if job:
LOG.info(f"Beginning processing job: {job}.")
try:
output_path = RECORDINGS_DIR / f"{job['id']}.mp4"
if RECORDINGS_ENABLED:
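# Record the Xvfb virtual display (:99) with ffmpeg; the MP4 is written to RECORDINGS_DIR/<job id>.mp4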
ffmpeg_proc = subprocess.Popen(
[
"ffmpeg",
"-y",
"-video_size",
"1280x1024",
"-framerate",
"15",
"-f",
"x11grab",
"-i",
":99",
"-codec:v",
"libx264",
"-preset",
"ultrafast",
output_path,
]
)
_ = await update_job([job["id"]], field="status", value="Scraping")
scraped = await scrape(
job["url"],
[Element(**j) for j in job["elements"]],
job["job_options"]["custom_headers"],
job["job_options"]["multi_page_scrape"],
job["job_options"]["proxies"],
job["job_options"]["site_map"],
job["job_options"]["collect_media"],
)
proxies = job["job_options"]["proxies"]
if proxies and isinstance(proxies[0], str) and proxies[0].startswith("{"):
try:
proxies = [json.loads(p) for p in proxies]
except json.JSONDecodeError:
LOG.error(f"Failed to parse proxy JSON: {proxies}")
proxies = []
if job["agent_mode"]:
scraped = await scrape_with_agent(job)
else:
scraped = await scrape(
job["id"],
job["url"],
[Element(**j) for j in job["elements"]],
{**job["job_options"], "proxies": proxies},
)
LOG.info(
f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
)
@@ -75,12 +116,18 @@ async def process_job():
},
)
if ffmpeg_proc:
ffmpeg_proc.terminate()
ffmpeg_proc.wait()
async def main():
LOG.info("Starting job worker...")
init_database()
RECORDINGS_DIR.mkdir(parents=True, exist_ok=True)
while True:
await process_job()
await asyncio.sleep(5)

View File

@@ -1,12 +1,13 @@
# STL
import logging
import os
from api.backend.utils import get_log_level
# LOCAL
from api.backend.app import LOG_LEVEL
logging.basicConfig(
level=get_log_level(os.getenv("LOG_LEVEL")),
format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
level=LOG_LEVEL,
format="%(levelname)s: %(asctime)s - [%(name)s] - %(message)s",
handlers=[logging.StreamHandler()],
)
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Job Worker")

View File

@@ -1,5 +1,7 @@
# STL
from typing import Any
# LOCAL
from api.backend.worker.post_job_complete.models import PostJobCompleteOptions
from api.backend.worker.post_job_complete.email_notifcation import (
send_job_complete_email,
@@ -16,9 +18,10 @@ async def post_job_complete(job: dict[str, Any], options: PostJobCompleteOptions
if not options.values():
return
if options["channel"] == "discord":
discord_notification(job, options)
elif options["channel"] == "email":
send_job_complete_email(job, options)
else:
raise ValueError(f"Invalid channel: {options['channel']}")
match options["channel"]:
case "discord":
discord_notification(job, options)
case "email":
send_job_complete_email(job, options)
case _:
raise ValueError(f"Invalid channel: {options['channel']}")

View File

@@ -0,0 +1,23 @@
describe("Global setup", () => {
it("signs up user once", () => {
cy.request({
method: "POST",
url: "/api/signup",
body: JSON.stringify({
data: {
email: "test@test.com",
password: "password",
full_name: "John Doe",
},
}),
headers: {
"Content-Type": "application/json",
},
failOnStatusCode: false,
}).then((response) => {
if (response.status !== 200 && response.status !== 201) {
console.warn("Signup failed:", response.status, response.body);
}
});
});
});

View File

@@ -0,0 +1,101 @@
import { login } from "../utilities/authentication.utils";
import {
addCustomHeaders,
addElement,
addMedia,
addSiteMapAction,
checkForMedia,
cleanUpJobs,
enterJobUrl,
openAdvancedJobOptions,
submitBasicJob,
submitJob,
waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockSubmitJob } from "../utilities/mocks";
describe.only("Advanced Job Options", () => {
beforeEach(() => {
mockSubmitJob();
login();
cy.visit("/");
});
afterEach(() => {
cleanUpJobs();
});
it.only("should handle custom headers", () => {
const customHeaders = {
"User-Agent": "Test Agent",
"Accept-Language": "en-US",
};
addCustomHeaders(customHeaders);
submitBasicJob("https://httpbin.org/headers", "headers", "//pre");
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
expect(
interception.request?.body.data.job_options.custom_headers
).to.deep.equal(customHeaders);
});
waitForJobCompletion("https://httpbin.org/headers");
});
it("should handle site map actions", () => {
addSiteMapAction("click", "//button[contains(text(), 'Load More')]");
addSiteMapAction("input", "//input[@type='search']", "test search");
submitBasicJob("https://example.com", "content", "//div[@class='content']");
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
const siteMap = interception.request?.body.data.job_options.site_map;
expect(siteMap.actions).to.have.length(2);
expect(siteMap.actions[0].type).to.equal("click");
expect(siteMap.actions[1].type).to.equal("input");
});
waitForJobCompletion("https://example.com");
});
it("should handle multiple elements", () => {
enterJobUrl("https://books.toscrape.com");
addElement("titles", "//h3");
addElement("prices", "//p[@class='price_color']");
submitJob();
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
expect(interception.request?.body.data.elements).to.have.length(2);
});
waitForJobCompletion("https://books.toscrape.com");
});
it.only("should handle collecting media", () => {
enterJobUrl("https://books.toscrape.com");
openAdvancedJobOptions();
addMedia();
cy.get("body").type("{esc}");
addElement("images", "//img");
submitJob();
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
expect(interception.request?.body.data.job_options.collect_media).to.be
.true;
});
waitForJobCompletion("https://books.toscrape.com");
checkForMedia();
});
});

cypress/e2e/agent.cy.ts (new file, 35 lines)
View File

@@ -0,0 +1,35 @@
import { login } from "../utilities/authentication.utils";
import {
buildAgentJob,
cleanUpJobs,
submitJob,
waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockSubmitJob } from "../utilities/mocks";
describe.only("Agent", () => {
beforeEach(() => {
mockSubmitJob();
login();
cy.visit("/agent");
});
afterEach(() => {
cleanUpJobs();
});
it("should be able to scrape some data", () => {
const url = "https://books.toscrape.com";
const prompt = "Collect all the links on the page";
buildAgentJob(url, prompt);
submitJob();
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
expect(interception.request?.body.data.url).to.eq(url);
expect(interception.request?.body.data.prompt).to.eq(prompt);
});
waitForJobCompletion("https://books.toscrape.com");
});
});

View File

@@ -1,60 +1,61 @@
describe("Authentication", () => {
it("should register", () => {
cy.intercept("POST", "/api/signup").as("signup");
import { faker } from "@faker-js/faker";
import { mockLogin, mockSignup } from "../utilities/mocks";
cy.visit("/").then(() => {
cy.get("button").contains("Login").click();
cy.url().should("include", "/login");
const mockEmail = faker.internet.email();
const mockPassword = faker.internet.password();
cy.get("form").should("be.visible");
cy.get("button")
.contains("No Account? Sign up")
.should("be.visible")
.click();
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("input[name='fullName']").type("John Doe");
cy.get("button[type='submit']").contains("Signup").click();
cy.wait("@signup").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("signup request did not return a response");
}
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
expect(interception.response.statusCode).to.eq(200);
});
});
describe.only("Authentication", () => {
beforeEach(() => {
cy.visit("/");
mockSignup();
mockLogin();
});
it("should login", () => {
cy.intercept("POST", "/api/token").as("token");
it("should register", () => {
cy.get("button").contains("Login").click();
cy.url().should("include", "/login");
cy.visit("/").then(() => {
cy.get("button")
.contains("Login")
.click()
.then(() => {
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("button[type='submit']").contains("Login").click();
cy.get("form").should("be.visible");
cy.wait("@token").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("token request did not return a response");
}
cy.get("button")
.contains("No Account? Sign up")
.should("be.visible")
.click();
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
cy.get("input[name='email']").type(mockEmail);
cy.get("input[name='password']").type(mockPassword);
cy.get("input[name='fullName']").type(faker.person.fullName());
cy.get("button[type='submit']").contains("Signup").click();
expect(interception.response.statusCode).to.eq(200);
});
});
cy.wait("@signup").then((interception) => {
if (!interception.response) {
throw new Error("signup request did not return a response");
}
expect(interception.response.statusCode).to.eq(200);
});
});
});
it("should login", () => {
cy.intercept("POST", "/api/token").as("token");
cy.visit("/").then(() => {
cy.get("button")
.contains("Login")
.click()
.then(() => {
cy.get("input[name='email']").type(mockEmail);
cy.get("input[name='password']").type(mockPassword);
cy.get("button[type='submit']").contains("Login").click();
cy.wait("@token").then((interception) => {
if (!interception.response) {
throw new Error("token request did not return a response");
}
expect(interception.response.statusCode).to.eq(200);
});
});
});
});

cypress/e2e/chat.cy.ts (new file, 34 lines)
View File

@@ -0,0 +1,34 @@
import { login } from "../utilities/authentication.utils";
import {
cleanUpJobs,
selectJobFromSelector,
submitBasicJob,
waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockLogin } from "../utilities/mocks";
describe.only("Chat", () => {
beforeEach(() => {
mockLogin();
login();
cy.visit("/");
});
afterEach(() => {
cleanUpJobs();
});
it.only("should be able to chat", () => {
const url = "https://books.toscrape.com";
submitBasicJob(url, "test", "//body");
waitForJobCompletion(url);
cy.visit("/chat");
selectJobFromSelector();
cy.get("[data-cy='message-input']").type("Hello");
cy.get("[data-cy='send-message']").click();
cy.get("[data-cy='ai-message']").should("exist");
});
});

View File

@@ -1,34 +1,37 @@
import { login } from "../utilities/authentication.utils";
import {
addElement,
cleanUpJobs,
enterJobUrl,
submitJob,
waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockSubmitJob } from "../utilities/mocks";
describe.only("Job", () => {
it("should create a job", () => {
cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
beforeEach(() => {
mockSubmitJob();
login();
cy.visit("/");
});
cy.get('[data-cy="url-input"]').type("https://example.com");
cy.get('[data-cy="name-field"]').type("example");
cy.get('[data-cy="xpath-field"]').type("//body");
cy.get('[data-cy="add-button"]').click();
afterEach(() => {
cleanUpJobs();
});
cy.contains("Submit").click();
it("should create a job", () => {
enterJobUrl("https://books.toscrape.com");
addElement("body", "//body");
submitJob();
cy.wait("@submitScrapeJob").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
cy.log("Request body: " + JSON.stringify(interception.request?.body));
throw new Error("submitScrapeJob request did not return a response");
}
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
expect(interception.response.statusCode).to.eq(200);
});
cy.get("li").contains("Jobs").click();
cy.contains("div", "https://example.com", { timeout: 10000 }).should(
"exist"
);
cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
waitForJobCompletion("https://books.toscrape.com");
});
});

View File

@@ -14,7 +14,7 @@
// ***********************************************************
// Import commands.js using ES2015 syntax:
import './commands'
import "./commands";
// Alternatively you can use CommonJS syntax:
// require('./commands')
// require('./commands')

View File

@@ -0,0 +1,68 @@
export const signup = () => {
cy.intercept("POST", "/api/token").as("token");
cy.visit("/").then(() => {
cy.get("button").contains("Login").click();
cy.url().should("include", "/login");
cy.get("form").should("be.visible");
cy.get("button")
.contains("No Account? Sign up")
.should("be.visible")
.click();
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("input[name='fullName']").type("John Doe");
cy.get("button[type='submit']").contains("Signup").click();
cy.wait("@token").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("token request did not return a response");
}
});
});
};
export const login = () => {
cy.intercept("POST", "/api/token").as("token");
cy.intercept("GET", "/api/me").as("me");
cy.intercept("GET", "/api/check").as("check");
cy.visit("/").then(() => {
cy.get("body").then(() => {
cy.get("button")
.contains("Login")
.click()
.then(() => {
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("button[type='submit']").contains("Login").click();
cy.wait("@token").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("token request did not return a response");
}
});
cy.wait("@me").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("me request did not return a response");
}
});
cy.wait("@check").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("check request did not return a response");
}
});
cy.url().should("not.include", "/login");
});
});
});
};

View File

@@ -0,0 +1,151 @@
export const cleanUpJobs = () => {
cy.intercept("POST", "/api/retrieve").as("retrieve");
cy.visit("/jobs");
cy.wait("@retrieve", { timeout: 15000 });
cy.get("tbody tr", { timeout: 10000 }).should("have.length.at.least", 1);
const tryClickSelectAll = (attempt = 1, maxAttempts = 5) => {
cy.log(`Attempt ${attempt} to click Select All`);
cy.get('[data-testid="select-all"]')
.closest("button")
.then(($btn) => {
// Retry if button is disabled
if ($btn.is(":disabled") || $btn.css("pointer-events") === "none") {
if (attempt < maxAttempts) {
cy.wait(1000).then(() =>
tryClickSelectAll(attempt + 1, maxAttempts)
);
} else {
throw new Error(
"Select All button is still disabled after max retries"
);
}
} else {
// Click and then verify if checkbox is checked
cy.wrap($btn)
.click({ force: true })
.then(() => {
cy.get("tbody tr")
.first()
.find("td")
.first()
.find("input[type='checkbox']")
.should("be.checked")
.then(() => {
cy.log("Select All successful");
});
});
// Handle failure case
cy.on("fail", () => {
cy.log("Error clicking Select All");
if (attempt < maxAttempts) {
cy.wait(1000).then(() =>
tryClickSelectAll(attempt + 1, maxAttempts)
);
} else {
throw new Error(
"Checkbox was never checked after clicking Select All"
);
}
return false; // Prevent Cypress from failing the test
});
}
});
};
tryClickSelectAll();
cy.get('[data-testid="DeleteIcon"]', { timeout: 10000 })
.closest("button")
.should("not.be.disabled")
.click();
};
export const submitBasicJob = (url: string, name: string, xpath: string) => {
cy.get('[data-cy="url-input"]').type(url);
cy.get('[data-cy="name-field"]').type(name);
cy.get('[data-cy="xpath-field"]').type(xpath);
cy.get('[data-cy="add-button"]').click();
cy.contains("Submit").click();
};
export const waitForJobCompletion = (url: string) => {
cy.visit("/jobs");
cy.contains("div", url, { timeout: 10000 }).should("exist");
cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
};
export const enableMultiPageScraping = () => {
cy.get("button").contains("Advanced Job Options").click();
cy.get('[data-cy="multi-page-toggle"]').click();
cy.get("body").type("{esc}");
};
export const addCustomHeaders = (headers: Record<string, string>) => {
cy.get("button").contains("Advanced Job Options").click();
cy.get('[name="custom_headers"]').type(JSON.stringify(headers), {
parseSpecialCharSequences: false,
});
cy.get("body").type("{esc}");
};
export const addCustomCookies = (cookies: Record<string, string>) => {
cy.get("button").contains("Advanced Job Options").click();
cy.get('[name="custom_cookies"]').type(JSON.stringify(cookies));
cy.get("body").type("{esc}");
};
export const openAdvancedJobOptions = () => {
cy.get("button").contains("Advanced Job Options").click();
};
export const selectJobFromSelector = () => {
cy.get("div[id='select-job']").click();
cy.get("li[role='option']").click().first();
};
export const addMedia = () => {
cy.get('[data-cy="collect-media-checkbox"]').click();
};
export const checkForMedia = () => {
cy.visit("/media");
selectJobFromSelector();
cy.get("[data-testid='media-grid']", { timeout: 10000 }).should("exist");
};
export const addSiteMapAction = (
type: "click" | "input",
xpath: string,
input?: string
) => {
cy.get("button").contains("Create Site Map").click();
cy.get('[data-cy="site-map-select"]').select(type);
cy.get('[data-cy="site-map-xpath"]').type(xpath);
if (type === "input" && input) {
cy.get('[data-cy="site-map-input"]').type(input);
}
cy.get('[data-cy="add-site-map-action"]').click();
};
export const addElement = (name: string, xpath: string) => {
cy.get('[data-cy="name-field"]').type(name);
cy.get('[data-cy="xpath-field"]').type(xpath);
cy.get('[data-cy="add-button"]').click();
};
export const buildAgentJob = (url: string, prompt: string) => {
enterJobUrl(url);
cy.get("[data-cy='prompt-input']").type(prompt);
};
export const submitJob = () => {
cy.get("button").contains("Submit").click();
};
export const enterJobUrl = (url: string) => {
cy.get('[data-cy="url-input"]').type(url);
};

View File

@@ -0,0 +1,15 @@
export const mockSubmitJob = () => {
cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
};
export const mockToken = () => {
cy.intercept("POST", "/api/token").as("token");
};
export const mockSignup = () => {
cy.intercept("POST", "/api/signup").as("signup");
};
export const mockLogin = () => {
cy.intercept("POST", "/api/token").as("token");
};

View File

@@ -0,0 +1 @@
export * from "./authentication.utils";

View File

@@ -1,6 +1,9 @@
version: "3"
services:
scraperr:
build:
context: .
dockerfile: docker/frontend/Dockerfile
command: ["npm", "run", "dev"]
volumes:
- "$PWD/src:/app/src"
@@ -10,7 +13,12 @@ services:
- "$PWD/package-lock.json:/app/package-lock.json"
- "$PWD/tsconfig.json:/app/tsconfig.json"
scraperr_api:
build:
context: .
dockerfile: docker/api/Dockerfile
environment:
- LOG_LEVEL=INFO
volumes:
- "$PWD/api:/project/app/api"
ports:
- "5900:5900"

View File

@@ -1,11 +1,6 @@
services:
scraperr:
depends_on:
- scraperr_api
image: jpyles0524/scraperr:latest
build:
context: .
dockerfile: docker/frontend/Dockerfile
container_name: scraperr
command: ["npm", "run", "start"]
environment:
@@ -18,11 +13,9 @@ services:
scraperr_api:
init: True
image: jpyles0524/scraperr_api:latest
build:
context: .
dockerfile: docker/api/Dockerfile
environment:
- LOG_LEVEL=INFO
- OPENAI_KEY=${OPENAI_KEY}
container_name: scraperr_api
ports:
- 8000:8000

View File

@@ -3,7 +3,7 @@ FROM python:3.10.12-slim as pybuilder
RUN apt-get update && \
apt-get install -y curl && \
apt-get install -y uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 && \
apt-get install -y x11vnc xvfb uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 ffmpeg && \
curl -LsSf https://astral.sh/uv/install.sh | sh && \
apt-get remove -y curl && \
apt-get autoremove -y && \
@@ -14,7 +14,8 @@ RUN pdm config python.use_venv false
WORKDIR /project/app
COPY pyproject.toml pdm.lock /project/app/
RUN pdm install
RUN pdm install -v --frozen-lockfile
RUN pdm run playwright install --with-deps
@@ -30,7 +31,12 @@ EXPOSE 8000
WORKDIR /project/app
RUN mkdir -p /project/app/media
RUN mkdir -p /project/app/data
RUN touch /project/app/data/database.db
EXPOSE 5900
COPY start.sh /project/app/start.sh
CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ]

View File

@@ -1,10 +1,14 @@
# Build next dependencies
FROM node:23.1
FROM node:23.1-slim
WORKDIR /app
COPY package*.json ./
RUN npm install
# Copy package files first to leverage Docker cache
COPY package.json yarn.lock ./
# Install dependencies in a separate layer
RUN yarn install --frozen-lockfile
# Copy the rest of the application
COPY tsconfig.json /app/tsconfig.json
COPY tailwind.config.js /app/tailwind.config.js
COPY next.config.mjs /app/next.config.mjs
@@ -13,6 +17,7 @@ COPY postcss.config.js /app/postcss.config.js
COPY public /app/public
COPY src /app/src
RUN npm run build
# Build the application
RUN yarn build
EXPOSE 3000

Binary image changed (not shown): 47 KiB → 48 KiB

View File

@@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.0.13
version: 1.1.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to

package-lock.json (generated, 11371 lines): diff suppressed because it is too large.

View File

@@ -12,9 +12,11 @@
"@minchat/react-chat-ui": "^0.16.2",
"@mui/icons-material": "^5.15.3",
"@mui/material": "^5.16.0",
"@reduxjs/toolkit": "^2.8.2",
"@testing-library/jest-dom": "^5.16.5",
"@testing-library/react": "^13.4.0",
"@testing-library/user-event": "^13.5.0",
"@types/react": "^18.3.21",
"axios": "^1.7.2",
"bootstrap": "^5.3.0",
"chart.js": "^4.4.3",
@@ -30,16 +32,19 @@
"react-dom": "^18.3.1",
"react-markdown": "^9.0.0",
"react-modal-image": "^2.6.0",
"react-redux": "^9.2.0",
"react-router": "^6.14.1",
"react-router-dom": "^6.14.1",
"react-spinners": "^0.14.1",
"react-toastify": "^11.0.5",
"redux-persist": "^6.0.0",
"typescript": "^4.9.5",
"web-vitals": "^2.1.4"
},
"scripts": {
"dev": "next dev",
"build": "next build",
"start": "next start",
"dev": "yarn next dev",
"build": "yarn next build",
"start": "yarn next start",
"serve": "serve -s ./dist",
"cy:open": "cypress open",
"cy:run": "cypress run"
@@ -63,6 +68,7 @@
]
},
"devDependencies": {
"@faker-js/faker": "^9.8.0",
"@types/cypress": "^1.1.6",
"@types/js-cookie": "^3.0.6",
"autoprefixer": "^10.4.21",

pdm.lock (generated, 24 lines)
View File

@@ -5,7 +5,7 @@
groups = ["default", "dev"]
strategy = ["inherit_metadata"]
lock_version = "4.5.0"
content_hash = "sha256:cb37fedd6d022515dde14e475588a8da2144ba22e41dfdfacfe3f7a7d14486ca"
content_hash = "sha256:1a65c1e288d2c6827fc6866d3bfe6a9b8707b2ca895d488f4a9b11cd579c4359"
[[metadata.targets]]
requires_python = ">=3.10"
@@ -1174,6 +1174,17 @@ files = [
{file = "hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca"},
]
[[package]]
name = "html2text"
version = "2025.4.15"
requires_python = ">=3.9"
summary = "Turn HTML into equivalent Markdown-structured text."
groups = ["default"]
files = [
{file = "html2text-2025.4.15-py3-none-any.whl", hash = "sha256:00569167ffdab3d7767a4cdf589b7f57e777a5ed28d12907d8c58769ec734acc"},
{file = "html2text-2025.4.15.tar.gz", hash = "sha256:948a645f8f0bc3abe7fd587019a2197a12436cd73d0d4908af95bfc8da337588"},
]
[[package]]
name = "httpcore"
version = "1.0.9"
@@ -2303,6 +2314,17 @@ files = [
{file = "propcache-0.3.1.tar.gz", hash = "sha256:40d980c33765359098837527e18eddefc9a24cea5b45e078a7f3bb5b032c6ecf"},
]
[[package]]
name = "proxy-py"
version = "2.4.10"
requires_python = ">=3.6"
summary = "\\u26a1 Fast \\u2022 \\U0001fab6 Lightweight \\u2022 \\U0001f51f Dependency \\u2022 \\U0001f50c Pluggable \\u2022 \\U0001f608 TLS interception \\u2022 \\U0001f512 DNS-over-HTTPS \\u2022 \\U0001f525 Poor Mans VPN \\u2022 \\u23ea Reverse & \\u23e9 Forward \\u2022 \\U0001f46e\\U0001f3ff Proxy Server framework \\u2022 \\U0001f310 Web Server framework \\u2022 \\u27b5 \\u27b6 \\u27b7 \\u27a0 PubSub framework \\u2022 \\U0001f477 Work acceptor & executor framework."
groups = ["default"]
files = [
{file = "proxy.py-2.4.10-py3-none-any.whl", hash = "sha256:ef3a31f6ef3be6ff78559c0e68198523bfe2fb1e820bb16686750c1bb5baf9e8"},
{file = "proxy_py-2.4.10.tar.gz", hash = "sha256:41b9e9d3aae6f80e2304d3726e8e9c583a510d8de224eada53d115f48a63a9ce"},
]
[[package]]
name = "ptyprocess"
version = "0.7.0"

View File

@@ -41,6 +41,8 @@ dependencies = [
"apscheduler>=3.11.0",
"playwright>=1.52.0",
"camoufox>=0.4.11",
"html2text>=2025.4.15",
"proxy-py>=2.4.10",
]
requires-python = ">=3.10"
readme = "README.md"
@@ -97,9 +99,9 @@ strictSetInference = true
[tool.isort]
length_sort = "1"
length_sort = true
profile = "black"
sections = "STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER"
sections = ["STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"]
import_heading_stdlib = "STL"
import_heading_thirdparty = "PDM"
import_heading_firstparty = "LOCAL"

View File

@@ -1,5 +0,0 @@
import React from "react";
export const Chat = () => {
return <h1>Chat</h1>;
};

View File

@@ -1,133 +0,0 @@
import React, { useState, useEffect, Dispatch, useRef } from "react";
import { Job } from "../../types";
import { fetchJobs } from "../../lib";
import Box from "@mui/material/Box";
import InputLabel from "@mui/material/InputLabel";
import FormControl from "@mui/material/FormControl";
import Select from "@mui/material/Select";
import Popover from "@mui/material/Popover";
import { Typography, MenuItem, useTheme } from "@mui/material";
import { SxProps } from "@mui/material";
interface Props {
sxProps: SxProps;
setSelectedJob: Dispatch<React.SetStateAction<Job | null>>;
selectedJob: Job | null;
setJobs: Dispatch<React.SetStateAction<Job[]>>;
jobs: Job[];
}
export const JobSelector = ({
sxProps,
selectedJob,
setSelectedJob,
setJobs,
jobs,
}: Props) => {
const [anchorEl, setAnchorEl] = useState<HTMLElement | null>(null);
const [popoverJob, setPopoverJob] = useState<Job | null>(null);
const theme = useTheme();
const handlePopoverOpen = (
event: React.MouseEvent<HTMLElement>,
job: Job
) => {
setAnchorEl(event.currentTarget);
setPopoverJob(job);
};
const handlePopoverClose = () => {
setAnchorEl(null);
setPopoverJob(null);
};
const open = Boolean(anchorEl);
return (
<Box sx={sxProps}>
<FormControl fullWidth>
{jobs.length ? (
<>
<InputLabel id="select-job">Job</InputLabel>
<Select
labelId="select-job"
id="select-job"
value={selectedJob?.id || ""}
label="Job"
onChange={(e) => {
setSelectedJob(
jobs.find((job) => job.id === e.target.value) || null
);
}}
>
{jobs.map((job) => (
<MenuItem
key={job.id}
value={job.id}
aria-owns={open ? "mouse-over-popover" : undefined}
aria-haspopup="true"
onMouseEnter={(e) => handlePopoverOpen(e, job)}
onMouseLeave={handlePopoverClose}
onClick={handlePopoverClose}
>
{job.id}
</MenuItem>
))}
</Select>
</>
) : null}
</FormControl>
<Popover
id="mouse-over-popover"
sx={{
pointerEvents: "none",
padding: 0,
}}
open={open}
anchorEl={anchorEl}
anchorOrigin={{
vertical: "bottom",
horizontal: "left",
}}
transformOrigin={{
vertical: "top",
horizontal: "left",
}}
onClose={handlePopoverClose}
>
{popoverJob && (
<Box
sx={{
border:
theme.palette.mode === "light"
? "2px solid black"
: "2px solid white",
}}
>
<Typography
variant="body1"
sx={{ paddingLeft: 1, paddingRight: 1 }}
>
{popoverJob.url}
</Typography>
<div className="flex flex-row w-full justify-end mb-1">
<Typography
variant="body2"
sx={{
paddingLeft: 1,
paddingRight: 1,
color: theme.palette.mode === "dark" ? "#d3d7e6" : "#5b5d63",
fontStyle: "italic",
}}
>
{popoverJob.time_created
? new Date(popoverJob.time_created).toLocaleString()
: "Unknown"}
</Typography>
</div>
</Box>
)}
</Popover>
</Box>
);
};

View File

@@ -1,2 +0,0 @@
export * from "./Chat";
export * from "./JobSelector";

View File

@@ -0,0 +1,50 @@
import { Box, Link, Typography } from "@mui/material";
import { SetStateAction, Dispatch, useState } from "react";
import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog";
import { RawJobOptions } from "@/types";
export type AdvancedJobOptionsProps = {
jobOptions: RawJobOptions;
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
multiPageScrapeEnabled?: boolean;
};
export const AdvancedJobOptions = ({
jobOptions,
setJobOptions,
multiPageScrapeEnabled = true,
}: AdvancedJobOptionsProps) => {
const [open, setOpen] = useState(false);
return (
<Box sx={{ mb: 2 }}>
<Link
component="button"
variant="body2"
onClick={() => setOpen(true)}
sx={{
textDecoration: "none",
color: "primary.main",
"&:hover": {
color: "primary.dark",
textDecoration: "underline",
},
paddingLeft: 1,
display: "inline-flex",
alignItems: "center",
gap: 0.5,
}}
>
<Typography variant="body2">Advanced Job Options</Typography>
</Link>
<AdvancedJobOptionsDialog
open={open}
onClose={() => setOpen(false)}
jobOptions={jobOptions}
setJobOptions={setJobOptions}
multiPageScrapeEnabled={multiPageScrapeEnabled}
/>
</Box>
);
};

View File

@@ -0,0 +1,295 @@
import { ExpandedTableInput } from "@/components/common/expanded-table-input";
import { RawJobOptions } from "@/types";
import {
Code as CodeIcon,
ExpandMore as ExpandMoreIcon,
InfoOutlined,
Settings,
} from "@mui/icons-material";
import {
Accordion,
AccordionDetails,
AccordionSummary,
Box,
Checkbox,
Dialog,
DialogContent,
DialogTitle,
Divider,
FormControl,
FormControlLabel,
FormGroup,
IconButton,
TextField,
Tooltip,
Typography,
useTheme,
} from "@mui/material";
import { Dispatch, SetStateAction, useEffect, useState } from "react";
export type AdvancedJobOptionsDialogProps = {
open: boolean;
onClose: () => void;
jobOptions: RawJobOptions;
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
multiPageScrapeEnabled?: boolean;
};
export const AdvancedJobOptionsDialog = ({
open,
onClose,
jobOptions,
setJobOptions,
multiPageScrapeEnabled = true,
}: AdvancedJobOptionsDialogProps) => {
const theme = useTheme();
const [localJobOptions, setLocalJobOptions] =
useState<RawJobOptions>(jobOptions);
// Update local state when prop changes
useEffect(() => {
setLocalJobOptions(jobOptions);
}, [jobOptions]);
const handleMultiPageScrapeChange = () => {
setLocalJobOptions((prevJobOptions) => ({
...prevJobOptions,
multi_page_scrape: !prevJobOptions.multi_page_scrape,
}));
};
const handleProxiesChange = (e: React.ChangeEvent<HTMLInputElement>) => {
setLocalJobOptions((prevJobOptions) => ({
...prevJobOptions,
proxies: e.target.value,
}));
};
const handleCollectMediaChange = () => {
setLocalJobOptions((prevJobOptions) => ({
...prevJobOptions,
collect_media: !prevJobOptions.collect_media,
}));
};
const handleClose = () => {
// Save the local state back to the parent before closing
setJobOptions(localJobOptions);
onClose();
};
return (
<Dialog
open={open}
onClose={handleClose}
maxWidth="md"
fullWidth
PaperProps={{
sx: {
borderRadius: 2,
boxShadow: "0 8px 32px rgba(0, 0, 0, 0.1)",
},
}}
>
<DialogTitle
sx={{
borderBottom: `1px solid ${theme.palette.divider}`,
backgroundColor: theme.palette.background.default,
color: theme.palette.primary.contrastText,
borderRadius: 2,
display: "flex",
alignItems: "center",
justifyContent: "space-between",
padding: "1rem 2rem",
marginRight: 2,
marginLeft: 2,
}}
>
<Typography variant="h6" component="div">
Advanced Job Options
</Typography>
<Settings
sx={{
color: theme.palette.primary.contrastText,
}}
/>
</DialogTitle>
<DialogContent
sx={{ padding: 3, overflowY: "auto", marginTop: 2, height: "60rem" }}
>
<FormControl fullWidth>
<Box sx={{ mb: 3 }}>
<Typography
variant="subtitle1"
sx={{
mb: 1,
fontWeight: "bold",
color: theme.palette.text.primary,
}}
>
Collection Options
</Typography>
<Divider sx={{ mb: 2, backgroundColor: theme.palette.divider }} />
<FormGroup row sx={{ gap: 4, mb: 1 }}>
<FormControlLabel
control={
<Checkbox
checked={localJobOptions.multi_page_scrape}
onChange={handleMultiPageScrapeChange}
disabled={!multiPageScrapeEnabled}
/>
}
label={
<Box sx={{ display: "flex", alignItems: "center" }}>
<Typography>Multi Page Scrape</Typography>
<Tooltip
title={
multiPageScrapeEnabled
? "Enable crawling through multiple pages"
: "Multi page scrape is disabled"
}
>
<IconButton size="small">
<InfoOutlined fontSize="small" />
</IconButton>
</Tooltip>
</Box>
}
/>
<FormControlLabel
control={
<Checkbox
checked={localJobOptions.collect_media}
onChange={handleCollectMediaChange}
data-cy="collect-media-checkbox"
/>
}
label={
<Box sx={{ display: "flex", alignItems: "center" }}>
<Typography>Collect Media</Typography>
<Tooltip title="Download images and other media">
<IconButton size="small">
<InfoOutlined fontSize="small" />
</IconButton>
</Tooltip>
</Box>
}
/>
</FormGroup>
</Box>
<Box sx={{ mb: 3 }}>
<Typography
variant="subtitle1"
sx={{
mb: 1,
fontWeight: "bold",
color: theme.palette.text.primary,
}}
>
Custom Options
</Typography>
<Divider sx={{ mb: 2, backgroundColor: theme.palette.divider }} />
{/* Proxies Section */}
<Accordion
defaultExpanded
elevation={0}
sx={{
mb: 2,
border: `1px solid ${theme.palette.divider}`,
"&:before": { display: "none" },
borderRadius: 1,
overflow: "hidden",
padding: 1,
}}
>
<AccordionSummary
expandIcon={<ExpandMoreIcon />}
sx={{
backgroundColor: theme.palette.background.paper,
borderBottom: `1px solid ${theme.palette.divider}`,
"&.Mui-expanded": {
borderBottom: `1px solid ${theme.palette.divider}`,
},
}}
>
<Box sx={{ display: "flex", alignItems: "center" }}>
<div
style={{
display: "flex",
alignItems: "center",
gap: "0.5rem",
}}
>
<Typography
sx={{
fontWeight: 500,
color: theme.palette.text.primary,
}}
>
Proxies
</Typography>
<Tooltip title="Comma separated list of proxies that should follow Playwright proxy format">
<InfoOutlined fontSize="small" />
</Tooltip>
</div>
</Box>
</AccordionSummary>
<AccordionDetails
sx={{ p: 2, backgroundColor: theme.palette.background.default }}
>
<TextField
placeholder='Proxies ([{"server": "proxy.example.com:8080", "username": "username", "password": "password"}])'
fullWidth
variant="outlined"
size="small"
value={localJobOptions.proxies}
onChange={handleProxiesChange}
InputProps={{
startAdornment: (
<CodeIcon
sx={{ color: theme.palette.text.secondary, mr: 1 }}
/>
),
}}
/>
</AccordionDetails>
</Accordion>
{/* Custom Headers Section */}
<ExpandedTableInput
label="Custom Headers"
placeholder='{"User-Agent": "CustomAgent", "Accept": "*/*"}'
urlParam="custom_headers"
name="custom_headers"
onChange={(value) => {
setLocalJobOptions((prevJobOptions) => ({
...prevJobOptions,
custom_headers: value,
}));
}}
/>
{/* Custom Cookies Section */}
<ExpandedTableInput
label="Custom Cookies"
placeholder='[{"name": "value", "name2": "value2"}]'
urlParam="custom_cookies"
name="custom_cookies"
onChange={(value) => {
setLocalJobOptions((prevJobOptions) => ({
...prevJobOptions,
custom_cookies: value,
}));
}}
/>
</Box>
</FormControl>
</DialogContent>
</Dialog>
);
};

View File

@@ -0,0 +1 @@
export * from "./advanced-job-options-dialog";

View File

@@ -0,0 +1 @@
export * from "./advanced-job-options";

View File

@@ -1,17 +1,17 @@
import React, { useState } from "react";
import {
alpha,
Box,
Paper,
Table,
TableBody,
TableCell,
TableContainer,
TableHead,
TableRow,
Paper,
Box,
Typography,
useTheme,
alpha,
} from "@mui/material";
import React, { useState } from "react";
export type CsvRow = {
[key: string]: string;
@@ -131,8 +131,9 @@ export const CsvTable: React.FC<CsvTableProps> = ({ csv, className }) => {
<Typography variant="body2" color="text.secondary">
{row.text
? row.text
.replace(/(\r\n|\n|\r)/g, " ")
.replace(/\t/g, " ")
.replace(/[\n\t\r]+/g, " ")
.replace(/\s+/g, " ")
.trim()
: "No text available"}
</Typography>
</Paper>

View File

@@ -0,0 +1,29 @@
import { Box } from "@mui/material";
export type DisabledProps = {
message: string;
};
export const Disabled = ({ message }: DisabledProps) => {
return (
<Box
bgcolor="background.default"
minHeight="100vh"
display="flex"
justifyContent="center"
alignItems="center"
>
<h4
style={{
color: "#fff",
padding: "20px",
borderRadius: "8px",
background: "rgba(0, 0, 0, 0.6)",
boxShadow: "0 4px 8px rgba(0, 0, 0, 0.2)",
}}
>
{message}
</h4>
</Box>
);
};

View File

@@ -0,0 +1 @@
export * from "./disabled";

View File

@@ -0,0 +1,207 @@
import { parseJsonToEntries } from "@/lib/helpers/parse-json-to-entries";
import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
import {
Accordion,
AccordionDetails,
AccordionSummary,
Box,
Paper,
Table,
TableBody,
TableCell,
TableContainer,
TableHead,
TableRow,
TextField,
Typography,
useTheme,
} from "@mui/material";
import React, { useEffect, useState } from "react";
export type ExpandedTableInputProps = {
label: string;
onChange: (value: any) => void;
placeholder: string;
urlParam: string;
name: string;
};
export const ExpandedTableInput = ({
label,
onChange,
placeholder,
urlParam,
name,
}: ExpandedTableInputProps) => {
const theme = useTheme();
const [value, setValue] = useState("");
const [parsedHeaders, setParsedHeaders] = useState<[string, string][] | null>(
null
);
const [jsonError, setJsonError] = useState<string | null>(null);
const urlParams = new URLSearchParams(window.location.search);
const validateAndParse = (val: string) => {
if (val.trim() === "") {
setParsedHeaders(null);
setJsonError(null);
return null;
}
try {
const parsed = JSON.parse(val);
const entries = parseJsonToEntries(val);
if (entries === null) {
setParsedHeaders(null);
setJsonError("Invalid JSON object");
return null;
} else {
setParsedHeaders(entries);
setJsonError(null);
return parsed;
}
} catch (e) {
setParsedHeaders(null);
setJsonError("Invalid JSON format");
return null;
}
};
const handleChange = (e: React.ChangeEvent<HTMLInputElement>) => {
const val = e.target.value;
setValue(val);
const parsed = validateAndParse(val);
onChange(parsed);
};
useEffect(() => {
const jobOptions = urlParams.get("job_options");
if (!jobOptions) {
setParsedHeaders(null);
setJsonError(null);
return;
}
    const jobOptionsObject = JSON.parse(jobOptions || "{}");
    let val = jobOptionsObject[urlParam];

    if (val == null || val.length === 0 || Object.keys(val).length === 0) {
setParsedHeaders(null);
setJsonError(null);
return;
}
    if (typeof val === "string") {
      try {
        val = JSON.parse(val);
      } catch {
        // Keep the raw string; validateAndParse below will surface the error.
      }
    }
const finalVal =
typeof val === "string" ? val : val != null ? JSON.stringify(val) : "";
setValue(finalVal);
const parsed = validateAndParse(finalVal);
onChange(parsed);
}, [urlParam]);
return (
<Accordion
defaultExpanded
elevation={0}
sx={{
mb: 2,
border: `1px solid ${theme.palette.divider}`,
"&:before": { display: "none" },
borderRadius: 1,
overflow: "hidden",
padding: 1,
}}
>
<AccordionSummary
expandIcon={<ExpandMoreIcon />}
sx={{
backgroundColor: theme.palette.background.paper,
borderBottom: `1px solid ${theme.palette.divider}`,
"&.Mui-expanded": {
borderBottom: `1px solid ${theme.palette.divider}`,
},
}}
>
<Box sx={{ display: "flex", alignItems: "center" }}>
<Typography
sx={{ fontWeight: 500, color: theme.palette.text.primary }}
>
{label}
</Typography>
</Box>
</AccordionSummary>
<AccordionDetails
sx={{ p: 2, backgroundColor: theme.palette.background.default }}
>
<TextField
placeholder={placeholder}
value={value}
onChange={handleChange}
fullWidth
variant="outlined"
size="small"
error={jsonError !== null}
helperText={jsonError ?? ""}
name={name}
/>
{parsedHeaders && parsedHeaders.length > 0 && (
<Paper
variant="outlined"
sx={{
marginTop: 1,
border: `1px solid ${theme.palette.divider}`,
borderRadius: 1,
overflow: "hidden",
padding: 0,
}}
>
<TableContainer sx={{ maxHeight: 200 }}>
<Table size="small" stickyHeader>
<TableHead>
<TableRow
sx={{
backgroundColor: theme.palette.background.paper,
}}
>
<TableCell sx={{ fontWeight: "bold" }}>Header</TableCell>
<TableCell sx={{ fontWeight: "bold" }}>Value</TableCell>
</TableRow>
</TableHead>
<TableBody>
{parsedHeaders.map(([key, val]) => (
<TableRow
key={key}
hover
sx={{
"&:nth-of-type(odd)": {
backgroundColor:
theme.palette.mode === "light"
? "rgba(0, 0, 0, 0.02)"
: "rgba(255, 255, 255, 0.02)",
},
}}
>
<TableCell sx={{ fontWeight: 500 }}>{key}</TableCell>
<TableCell>{val}</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</TableContainer>
</Paper>
)}
</AccordionDetails>
</Accordion>
);
};
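
As the Custom Headers and Custom Cookies call sites earlier in this diff show, onChange receives the parsed value, or null while the input is empty or invalid. A usage sketch mirroring those call sites:

// Sketch of typical usage; setLocalJobOptions is the state setter used above.
<ExpandedTableInput
  label="Custom Headers"
  placeholder='{"User-Agent": "CustomAgent", "Accept": "*/*"}'
  urlParam="custom_headers"
  name="custom_headers"
  onChange={(value) =>
    setLocalJobOptions((prev) => ({ ...prev, custom_headers: value }))
  }
/>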


@@ -0,0 +1 @@
export * from "./expanded-table-input";


@@ -1,16 +1,16 @@
+import { useDownloadJob } from "@/hooks/use-download-job";
 import {
-  Dialog,
-  DialogTitle,
-  DialogContent,
-  DialogActions,
-  Button,
-  FormControl,
-  RadioGroup,
-  FormControlLabel,
-  Radio,
-  FormLabel,
-  Typography,
   Box,
+  Button,
+  Dialog,
+  DialogContent,
+  DialogTitle,
+  FormControl,
+  FormControlLabel,
+  FormLabel,
+  Radio,
+  RadioGroup,
+  Typography,
 } from "@mui/material";
 import { useState } from "react";
@@ -26,27 +26,10 @@ export const JobDownloadDialog = ({
   ids,
 }: JobDownloadDialogProps) => {
   const [jobFormat, setJobFormat] = useState<string>("csv");
-  const handleDownload = async () => {
-    const response = await fetch("/api/download", {
-      method: "POST",
-      headers: { "Content-Type": "application/json" },
-      body: JSON.stringify({ data: { ids: ids, job_format: jobFormat } }),
-    });
-
-    if (response.ok) {
-      const blob = await response.blob();
-      const url = window.URL.createObjectURL(blob);
-      const a = document.createElement("a");
-      a.style.display = "none";
-      a.href = url;
-      a.download = `job_${ids[0]}.${jobFormat}`;
-      document.body.appendChild(a);
-      a.click();
-      window.URL.revokeObjectURL(url);
-      document.body.removeChild(a);
-    } else {
-      console.error("Failed to download the file.");
-    }
+  const { downloadJob } = useDownloadJob();
+
+  const handleDownload = () => {
+    downloadJob(ids, jobFormat);
   };
 
   return (

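The inlined fetch-and-anchor logic removed above presumably moved into the useDownloadJob hook. A minimal sketch of such a hook, assuming it wraps the same POST /api/download contract as the deleted code (the hook's real implementation is not shown in this diff):

// Hypothetical implementation of useDownloadJob.
import { useCallback } from "react";

export const useDownloadJob = () => {
  const downloadJob = useCallback(async (ids: string[], jobFormat: string) => {
    const response = await fetch("/api/download", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ data: { ids, job_format: jobFormat } }),
    });

    if (!response.ok) {
      console.error("Failed to download the file.");
      return;
    }

    // Stream the response into a temporary object URL and click a
    // hidden anchor to trigger the browser download.
    const blob = await response.blob();
    const url = window.URL.createObjectURL(blob);
    const a = document.createElement("a");
    a.style.display = "none";
    a.href = url;
    a.download = `job_${ids[0]}.${jobFormat}`;
    document.body.appendChild(a);
    a.click();
    window.URL.revokeObjectURL(url);
    document.body.removeChild(a);
  }, []);

  return { downloadJob };
};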

@@ -0,0 +1 @@
export * from "./job-selector";


@@ -0,0 +1,151 @@
import { Job } from "@/types";
import {
ClickAwayListener,
MenuItem,
SxProps,
Typography,
useTheme,
} from "@mui/material";
import Box from "@mui/material/Box";
import FormControl from "@mui/material/FormControl";
import InputLabel from "@mui/material/InputLabel";
import Popover from "@mui/material/Popover";
import Select from "@mui/material/Select";
import React, { Dispatch, useEffect, useState } from "react";
interface Props {
sxProps?: SxProps;
setSelectedJob:
| Dispatch<React.SetStateAction<Job | null>>
| ((job: Job) => void);
selectedJob: Job | null;
jobs: Job[];
}
export const JobSelector = ({
sxProps,
selectedJob,
setSelectedJob,
jobs,
}: Props) => {
const [anchorEl, setAnchorEl] = useState<HTMLElement | null>(null);
const [popoverJob, setPopoverJob] = useState<Job | null>(null);
const theme = useTheme();
const handlePopoverOpen = (
event: React.MouseEvent<HTMLElement>,
job: Job
) => {
setAnchorEl(event.currentTarget);
setPopoverJob(job);
};
const handlePopoverClose = () => {
setAnchorEl(null);
setPopoverJob(null);
};
const open = Boolean(anchorEl);
useEffect(() => {
if (!open) {
setAnchorEl(null);
}
}, [open]);
return (
<Box sx={sxProps}>
<FormControl fullWidth>
{jobs.length ? (
<>
<InputLabel id="select-job">Job</InputLabel>
<Select
labelId="select-job"
id="select-job"
value={selectedJob?.id || ""}
label="Job"
onChange={(e) => {
const job = jobs.find((job) => job.id === e.target.value);
if (job) {
setSelectedJob(job);
}
}}
>
{jobs.map((job) => (
<MenuItem
key={job.id}
value={job.id}
aria-owns={open ? "mouse-over-popover" : undefined}
aria-haspopup="true"
onMouseEnter={(e) => handlePopoverOpen(e, job)}
onMouseLeave={handlePopoverClose}
onClick={handlePopoverClose}
>
{job.id}
</MenuItem>
))}
</Select>
</>
) : null}
</FormControl>
{open && (
<ClickAwayListener onClickAway={handlePopoverClose}>
<Popover
id="mouse-over-popover"
sx={{
pointerEvents: "none",
padding: 0,
}}
open={open}
anchorEl={anchorEl}
anchorOrigin={{
vertical: "bottom",
horizontal: "left",
}}
transformOrigin={{
vertical: "top",
horizontal: "left",
}}
onClose={handlePopoverClose}
>
{popoverJob && (
<Box
sx={{
border:
theme.palette.mode === "light"
? "2px solid black"
: "2px solid white",
}}
>
<Typography
variant="body1"
sx={{ paddingLeft: 1, paddingRight: 1 }}
>
{popoverJob.url}
</Typography>
<div className="flex flex-row w-full justify-end mb-1">
<Typography
variant="body2"
sx={{
paddingLeft: 1,
paddingRight: 1,
color:
theme.palette.mode === "dark" ? "#d3d7e6" : "#5b5d63",
fontStyle: "italic",
}}
>
{popoverJob.time_created
? new Date(popoverJob.time_created).toLocaleString()
: "Unknown"}
</Typography>
</div>
</Box>
)}
</Popover>
</ClickAwayListener>
)}
</Box>
);
};
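
A usage sketch, assuming the parent owns the selection state (prop names per the Props interface above; the jobs array comes from elsewhere):

// Hypothetical parent wiring for JobSelector.
const [selectedJob, setSelectedJob] = useState<Job | null>(null);

<JobSelector
  jobs={jobs}
  selectedJob={selectedJob}
  setSelectedJob={setSelectedJob}
  sxProps={{ width: 300 }}
/>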


@@ -0,0 +1,40 @@
import { Box, Typography } from "@mui/material";
interface AudioViewerProps {
mediaUrl: string;
selectedMedia: string;
onError: () => void;
}
export const AudioViewer = ({
mediaUrl,
selectedMedia,
onError,
}: AudioViewerProps) => {
return (
<Box
sx={{
display: "flex",
justifyContent: "center",
alignItems: "center",
flexDirection: "column",
height: "100%",
gap: 2,
}}
>
<Typography variant="h6">{selectedMedia}</Typography>
<audio
controls
onError={onError}
style={{
width: "80%",
maxWidth: "500px",
}}
>
<source src={mediaUrl} type="audio/mpeg" />
Your browser does not support the audio element.
</audio>
</Box>
);
};
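
A usage sketch; the media URL and the error handler are hypothetical, and the component itself only renders a titled <audio> element with an audio/mpeg source:

// E.g. letting the parent fall back to a message when playback fails.
<AudioViewer
  mediaUrl="/api/media?file=clip.mp3"
  selectedMedia="clip.mp3"
  onError={() => setPlaybackError(true)}
/>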


@@ -0,0 +1 @@
export * from "./audio-viewer";

Some files were not shown because too many files have changed in this diff.