45 Commits

Author SHA1 Message Date
github-actions[bot]
93b0c83381 chore: bump version to 1.1.2 2025-06-08 23:24:17 +00:00
Jayden Pyles
9381ba9232 chore: update workflow 2025-06-08 18:17:03 -05:00
Jayden Pyles
20dccc5527 feat: edit ui + add return html option (#90)
* fix: restyle the element table

* chore: wip ui

* wip: edit styles

* feat: add html return

* fix: build

* fix: workflow

* fix: workflow

* fix: workflow

* fix: workflow

* fix: workflow

* fix: workflow

* fix: workflow

* fix: cypress test

* chore: update photo [skip ci]
2025-06-08 18:14:02 -05:00
Jayden Pyles
02619eb184 feat: update workflows [no bump] 2025-06-05 22:19:41 -05:00
github-actions[bot]
58c6c09fc9 chore: bump version to 1.1.2 2025-06-06 03:18:03 +00:00
Jayden Pyles
bf896b4c6b feat: update workflows [no bump] 2025-06-05 22:09:49 -05:00
Jayden Pyles
e3b9c11ab7 feat: update workflows [no bump] 2025-06-05 21:59:54 -05:00
github-actions[bot]
32da3375b3 chore: bump version to 1.1.1 2025-06-06 02:56:42 +00:00
Jayden Pyles
b5131cbe4c feat: update workflows [no bump] 2025-06-05 21:47:55 -05:00
github-actions[bot]
47c4c9a7d1 chore: bump version to 1.1.1 2025-06-05 01:48:00 +00:00
Jayden Pyles
4352988666 feat: update workflows 2025-06-04 20:40:16 -05:00
Jayden Pyles
00759151e6 feat: update workflows 2025-06-04 20:35:06 -05:00
github-actions[bot]
bfae00ca72 chore: bump version to 1.1.1 2025-06-04 23:06:51 +00:00
Jayden Pyles
e810700569 chore: remove deprecated output version 2025-06-04 17:58:49 -05:00
Jayden Pyles
9857fa96e0 fix: message [skip ci] 2025-06-03 17:12:51 -05:00
github-actions[bot]
b52fbc538d chore: bump version to 1.1.1 2025-06-03 16:27:54 +00:00
Jayden Pyles
42c0f3ae79 feat: add auto deploy workflow 2025-06-03 11:20:13 -05:00
github-actions[bot]
9aab2f9b4f chore: bump version to 1.1.1 2025-06-03 15:55:42 +00:00
Jayden Pyles
e182d3e4b8 feat: add auto deploy workflow 2025-06-03 10:48:02 -05:00
github-actions[bot]
53f35989f5 chore: bump version to 1.1.1 2025-06-03 03:06:41 +00:00
Jayden Pyles
a67ab34cfa feat: add auto deploy workflow 2025-06-02 21:59:06 -05:00
Jayden Pyles
3bf6657191 Merge branch 'master' of github.com:jaypyles/Scraperr 2025-06-02 21:58:04 -05:00
Jayden Pyles
c38d19a0ca feat: add auto deploy workflow 2025-06-02 21:57:44 -05:00
github-actions[bot]
a53e7e1aa1 chore: bump version to 1.1.1 2025-06-03 02:46:40 +00:00
Jayden Pyles
84368b1f6d feat: add auto deploy workflow 2025-06-02 21:38:25 -05:00
github-actions[bot]
ce4c1ceaa7 chore: bump version to 1.1.1 2025-06-03 02:27:38 +00:00
Jayden Pyles
7e1ce58bb8 feat: add auto deploy workflow 2025-06-02 21:19:43 -05:00
github-actions[bot]
175e7d63bf chore: bump version to 1.1.1 2025-06-03 01:52:57 +00:00
Jayden Pyles
d2c06de247 feat: add auto deploy workflow 2025-06-02 20:45:06 -05:00
github-actions[bot]
e0159bf9d4 chore: bump version to 1.1.1 2025-06-03 01:39:05 +00:00
Jayden Pyles
6d574ddfd2 feat: add auto deploy workflow 2025-06-02 20:31:34 -05:00
github-actions[bot]
b089d72786 chore: bump version to 2025-06-03 01:02:11 +00:00
Jayden Pyles
9ee4d577fd feat: add auto deploy workflow 2025-06-02 19:54:33 -05:00
Jayden Pyles
cddce5164d feat: add auto deploy workflow 2025-06-02 19:45:49 -05:00
Jayden Pyles
bf3163bfba feat: add auto deploy workflow 2025-06-02 19:09:57 -05:00
Jayden Pyles
54b513e92c feat: auto deploy 2025-06-02 19:08:18 -05:00
Jayden Pyles
6c56f2f161 Chore: app refactor (#88)
* chore: refactor wip

* chore: refactor wip

* chore: refactor wip

* chore: refactor wip

* chore: refactor wip

* chore: refactor wip

* chore: work in progress

* chore: work in progress

* chore: work in progress

* chore: work in progress

* chore: work in progress

* chore: work in progress

* chore: work in progress

* chore: work in progress

* chore: work in progress

* chore: refactor wip

* chore: work in progress

* chore: refactor wip

* chore: refactor wip

* chore: refactor wip

* fix: build

* fix: cypress test

* fix: cypress test

* fix: cypress test

* fix: cypress test

* fix: cypress test

* fix: cypress test

* fix: cypress test

* fix: cypress test

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests
2025-06-01 15:56:15 -05:00
Jayden Pyles
d4edb9d93e chore: update chart version [skip ci] 2025-05-19 20:46:19 -05:00
Jayden Pyles
5ebd96b62b feat: add agent mode (#81)
* chore: wip agent mode

* wip: add agent mode frontend

* wip: add agent mode frontend

* chore: cleanup code

* chore: cleanup code

* chore: cleanup code
2025-05-19 20:44:41 -05:00
Jayden Pyles
d602d3330a fix: site map 2025-05-17 17:05:37 -05:00
Jayden Pyles
6639e8b48f chore: update chart version [skip ci] 2025-05-17 16:33:18 -05:00
Jayden Pyles
263e46ba4d feat: add media viewer + other fixes (#79)
* feat: add media viewer + other fixes

* chore: remove logging [skip ci]

* chore: remove logging [skip ci]

* feat: add unit test for media

* feat: add unit test for media

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* chore: update docs [skip ci]
2025-05-17 16:31:34 -05:00
Jayden Pyles
f815a58efc chore: update docker version [skip ci] 2025-05-16 22:04:46 -05:00
Jayden Pyles
50ec5df657 chore: update chart version [skip ci] 2025-05-16 21:39:04 -05:00
Jayden Pyles
28de0f362c feat: add recording viewer and vnc (#78)
* feat: add recording viewer and vnc

* feat: add recording viewer and vnc

* feat: add recording viewer and vnc

* feat: add recording viewer and vnc

* chore: update gitignore [skip ci]

* chore: update dev compose [skip ci]

* fix: only run manually
2025-05-16 21:37:09 -05:00
188 changed files with 11259 additions and 13181 deletions

.dockerignore (new file)

@@ -0,0 +1,4 @@
node_modules
npm-debug.log
Dockerfile
.dockerignore

.github/actions/push-to-helm (composite action)

@@ -5,6 +5,9 @@ inputs:
   app-repo-token:
     required: true
     description: "The token for the target repository"
+  version:
+    required: true
+    description: "The version of the Helm chart"

 runs:
   using: 'composite'
@@ -15,6 +18,11 @@ runs:
     - name: Set up Helm
       uses: azure/setup-helm@v3

+    - name: Update Helm chart version
+      run: |
+        sed -i "s/^version: .*/version: ${{ inputs.version }}/" helm/Chart.yaml
+      shell: bash
+
     - name: Package Helm chart
       run: |
         mkdir -p packaged

.github/actions/run-cypress-tests (composite action)

@@ -2,6 +2,13 @@ name: Run Cypress Tests
 description: Run Cypress tests

+inputs:
+  openai_key:
+    description: "OpenAI API key"
+    required: true
+    default: ""
+
 runs:
   using: "composite"
   steps:
@@ -13,13 +20,25 @@ runs:
       with:
         node-version: 22

+    - name: Setup yarn
+      shell: bash
+      run: npm install -g yarn
+
+    - name: Install xvfb for headless testing
+      shell: bash
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y xvfb libnss3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libasound2t64 libpango-1.0-0 libcairo2 libgtk-3-0 libgdk-pixbuf2.0-0 libx11-6 libx11-xcb1 libxcb1 libxss1 libxtst6 libnspr4
+
     - name: Setup Docker project
       shell: bash
-      run: make build up-dev
+      run: |
+        export OPENAI_KEY="${{ inputs.openai_key }}"
+        make build-ci up-ci

     - name: Install dependencies
       shell: bash
-      run: npm install
+      run: yarn install

     - name: Wait for frontend to be ready
       shell: bash
@@ -54,5 +73,8 @@ runs:
     - name: Run Cypress tests
       shell: bash
-      run: npm run cy:run
+      run: |
+        set -e
+        npm run cy:run

.github/workflows/cypress-tests.yml (new file)

@@ -0,0 +1,31 @@
name: Cypress Tests
on:
workflow_call:
secrets:
openai_key:
required: true
jobs:
cypress-tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Run Cypress Tests
id: run-tests
uses: ./.github/actions/run-cypress-tests
with:
openai_key: ${{ secrets.openai_key }}
- name: Check container logs on failure
if: steps.run-tests.conclusion == 'failure'
run: |
echo "Cypress tests failed. Dumping container logs..."
docker logs scraperr_api || true
- name: Fail job if Cypress failed
if: steps.run-tests.conclusion == 'failure'
run: exit 1

.github/workflows/docker-image.yml

@@ -1,24 +1,30 @@
 name: Docker Image

 on:
-  workflow_run:
-    workflows: ["Unit Tests"]
-    types:
-      - completed
-  workflow_dispatch:
+  workflow_call:
+    inputs:
+      version:
+        required: true
+        type: string
+    secrets:
+      dockerhub_username:
+        required: true
+      dockerhub_token:
+        required: true
+      repo_token:
+        required: true
+      discord_webhook_url:
+        required: true

 jobs:
   build:
-    if: ${{ github.event.workflow_run.conclusion == 'success' && github.ref == 'refs/heads/master' }}
     runs-on: ubuntu-latest

     steps:
       - name: Checkout
         uses: actions/checkout@v4

-      - name: Get version from helm chart
+      - name: Echo version
         run: |
-          VERSION=$(grep "version:" ./helm/Chart.yaml | cut -d: -f2 | tr -d ' ')
-          echo "VERSION=$VERSION" >> $GITHUB_ENV
-          echo "Version is $VERSION"
+          echo "Version is ${{ inputs.version }}"

       - name: Login to Docker Hub
         uses: docker/login-action@v3
@@ -37,7 +43,7 @@ jobs:
           push: true
           tags: |
             ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:latest
-            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:${{ env.VERSION }}
+            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:${{ inputs.version }}

       - name: Build and push api
         uses: docker/build-push-action@v5
@@ -47,7 +53,7 @@ jobs:
           push: true
           tags: |
             ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest
-            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:${{ env.VERSION }}
+            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:${{ inputs.version }}

   push-helm-chart:
     runs-on: ubuntu-latest
@@ -59,7 +65,8 @@ jobs:
       - name: Push Helm Chart
         uses: ./.github/actions/push-to-helm
         with:
-          app-repo-token: ${{ secrets.GPAT_TOKEN }}
+          app-repo-token: ${{ secrets.repo_token }}
+          version: ${{ inputs.version }}

   success-message:
     runs-on: ubuntu-latest
@@ -71,7 +78,7 @@ jobs:
         uses: jaypyles/discord-webhook-action@v1.0.0
         with:
           webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }}
-          content: "Scraperr Successfully Built Docker Images"
+          content: "Scraperr Successfully Built Docker Images (v${{ inputs.version }})"
           username: "Scraperr CI"
           embed-title: "✅ Deployment Status"
           embed-description: "Scraperr successfully built docker images."

.github/workflows/merge.yml (new file)

@@ -0,0 +1,35 @@
name: Merge
on:
push:
branches:
- master
pull_request:
types: [closed]
branches:
- master
jobs:
tests:
uses: ./.github/workflows/tests.yml
secrets:
openai_key: ${{ secrets.OPENAI_KEY }}
discord_webhook_url: ${{ secrets.DISCORD_WEBHOOK_URL }}
version:
needs: tests
uses: ./.github/workflows/version.yml
secrets:
git_token: ${{ secrets.GPAT_TOKEN }}
build-and-deploy:
if: needs.version.outputs.version_bump == 'true'
needs: version
uses: ./.github/workflows/docker-image.yml
secrets:
dockerhub_username: ${{ secrets.DOCKERHUB_USERNAME }}
dockerhub_token: ${{ secrets.DOCKERHUB_TOKEN }}
repo_token: ${{ secrets.GPAT_TOKEN }}
discord_webhook_url: ${{ secrets.DISCORD_WEBHOOK_URL }}
with:
version: ${{ needs.version.outputs.version }}

.github/workflows/pr.yml (new file)

@@ -0,0 +1,20 @@
name: PR
on:
pull_request:
branches:
- master
types: [opened, synchronize, reopened]
workflow_dispatch:
jobs:
checkout:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
tests:
uses: ./.github/workflows/tests.yml
secrets:
openai_key: ${{ secrets.OPENAI_KEY }}
discord_webhook_url: ${{ secrets.DISCORD_WEBHOOK_URL }}

.github/workflows/pytest.yml (new file)

@@ -0,0 +1,27 @@
name: Pytest
on:
workflow_call:
jobs:
unit-tests:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set env
run: echo "ENV=test" >> $GITHUB_ENV
- name: Install pdm
run: pip install pdm
- name: Install project dependencies
run: pdm install
- name: Install playwright
run: pdm run playwright install
- name: Run tests
run: PYTHONPATH=. pdm run pytest -v -ra api/backend/tests

.github/workflows/tests.yml (new file)

@@ -0,0 +1,42 @@
name: Reusable PR Tests
on:
workflow_call:
secrets:
openai_key:
required: true
discord_webhook_url:
required: true
jobs:
pytest:
uses: ./.github/workflows/pytest.yml
cypress-tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Run Cypress Tests
uses: ./.github/actions/run-cypress-tests
with:
openai_key: ${{ secrets.openai_key }}
success-message:
runs-on: ubuntu-latest
needs:
- pytest
- cypress-tests
steps:
- name: Send Discord Message
uses: jaypyles/discord-webhook-action@v1.0.0
with:
webhook-url: ${{ secrets.discord_webhook_url }}
content: "Scraperr Successfully Passed Tests"
username: "Scraperr CI"
embed-title: "✅ Deployment Status"
embed-description: "Scraperr successfully passed all tests."
embed-color: 3066993
embed-footer-text: "Scraperr CI"
embed-timestamp: ${{ github.event.head_commit.timestamp }}

Unit Tests workflow (deleted file)

@@ -1,57 +0,0 @@
name: Unit Tests
on:
push:
branches:
- master
pull_request:
types: [opened, synchronize, reopened]
workflow_dispatch:
jobs:
unit-tests:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set env
run: echo "ENV=test" >> $GITHUB_ENV
- name: Install pdm
run: pip install pdm
- name: Install project dependencies
run: pdm install
- name: Install playwright
run: pdm run playwright install
- name: Run tests
run: PYTHONPATH=. pdm run pytest api/backend/tests
cypress-tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/run-cypress-tests
success-message:
runs-on: ubuntu-latest
needs:
- unit-tests
- cypress-tests
steps:
- name: Send Discord Message
uses: jaypyles/discord-webhook-action@v1.0.0
with:
webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }}
content: "Scraperr Successfully Passed Tests"
username: "Scraperr CI"
embed-title: "✅ Deployment Status"
embed-description: "Scraperr successfully passed all tests."
embed-color: 3066993 # Green
embed-footer-text: "Scraperr CI"
embed-timestamp: ${{ github.event.head_commit.timestamp }}

.github/workflows/version.yml (new file)

@@ -0,0 +1,87 @@
name: Version
on:
workflow_call:
secrets:
git_token:
required: true
outputs:
version:
description: "The new version number"
value: ${{ jobs.version.outputs.version }}
version_bump:
description: "Whether the version was bumped"
value: ${{ jobs.version.outputs.version_bump }}
jobs:
version:
runs-on: ubuntu-latest
outputs:
version: ${{ steps.set_version.outputs.version }}
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Get version bump
id: get_version_type
run: |
COMMIT_MSG=$(git log -1 --pretty=%B)
if [[ $COMMIT_MSG =~ ^feat\(breaking\) ]]; then
VERSION_TYPE="major"
elif [[ $COMMIT_MSG =~ ^feat! ]]; then
VERSION_TYPE="minor"
elif [[ $COMMIT_MSG =~ ^(feat|fix|chore): ]]; then
VERSION_TYPE="patch"
else
VERSION_TYPE="patch"
fi
echo "VERSION_TYPE=$VERSION_TYPE" >> $GITHUB_ENV
- name: Check for version bump
id: check_version_bump
run: |
COMMIT_MSG=$(git log -1 --pretty=%B)
if [[ $COMMIT_MSG =~ ^feat\(breaking\) ]]; then
echo "version_bump=true" >> $GITHUB_OUTPUT
elif [[ $COMMIT_MSG =~ .*\[no\ bump\].* ]]; then
echo "version_bump=false" >> $GITHUB_OUTPUT
fi
- name: Skip version bump
if: steps.check_version_bump.outputs.version_bump == 'false'
run: |
echo "Skipping version bump as requested"
gh run cancel ${{ github.run_id }}
exit 0
env:
GITHUB_TOKEN: ${{ secrets.git_token }}
- name: Set version
if: steps.check_version_bump.outputs.version_bump != 'false'
id: set_version
run: |
VERSION=$(./scripts/version.sh "$VERSION_TYPE")
echo "VERSION=$VERSION" >> $GITHUB_ENV
echo "Version is $VERSION"
echo "version=$VERSION" >> $GITHUB_OUTPUT
env:
VERSION_TYPE: ${{ env.VERSION_TYPE }}
- name: Update chart file
if: steps.check_version_bump.outputs.version_bump != 'false'
run: |
sed -i "s/^version: .*/version: $VERSION/" helm/Chart.yaml
git config --local user.email "github-actions[bot]@users.noreply.github.com"
git config --local user.name "github-actions[bot]"
git add helm/Chart.yaml
git commit -m "chore: bump version to $VERSION"
git push
env:
VERSION: ${{ env.VERSION }}
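
A note on the bump logic above: `feat(breaking)` commits map to a major bump, `feat!` to minor, and everything else (including plain `feat:`/`fix:`/`chore:`) falls through to patch, while `[no bump]` commits cancel the run. The arithmetic itself is delegated to `scripts/version.sh`, which is not part of this diff; the sketch below is a hypothetical Python equivalent, assuming plain `MAJOR.MINOR.PATCH` version strings.

```python
# Hypothetical equivalent of scripts/version.sh (the real script is not shown
# in this diff); assumes plain MAJOR.MINOR.PATCH version strings.
def bump(version: str, version_type: str) -> str:
    major, minor, patch = (int(part) for part in version.split("."))
    if version_type == "major":
        return f"{major + 1}.0.0"
    if version_type == "minor":
        return f"{major}.{minor + 1}.0"
    return f"{major}.{minor}.{patch + 1}"  # "patch" is the fallback type


# Matches the history above: a patch-level commit moves 1.1.1 to 1.1.2.
assert bump("1.1.1", "patch") == "1.1.2"
```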

.gitignore

@@ -188,4 +188,18 @@ postgres_data
 .vscode
 ollama
 data
+media
+media/images
+media/videos
+media/audio
+media/pdfs
+media/spreadsheets
+media/presentations
+media/documents
+media/recordings
+media/download_summary.txt
+
+cypress/screenshots
+cypress/videos
+docker-compose.dev.local.yml

Makefile

@@ -1,6 +1,6 @@
 .DEFAULT_GOAL := help

-COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.yml
+COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.local.yml
 COMPOSE_PROD = docker compose -f docker-compose.yml

 .PHONY: help deps build pull up up-dev down setup deploy
@@ -17,6 +17,7 @@ help:
	@echo " make down - Stop and remove containers, networks, images, and volumes"
	@echo " make setup - Setup server with dependencies and clone repo"
	@echo " make deploy - Deploy site onto server"
+	@echo " make cypress-start - Start Cypress"
	@echo ""

 logs:
@@ -51,3 +52,15 @@
 deploy:
	ansible-playbook -i ./ansible/inventory.yaml ./ansible/deploy_site.yaml -v
+
+build-ci:
+	docker compose -f docker-compose.yml -f docker-compose.dev.yml build
+
+up-ci:
+	docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d --force-recreate
+
+cypress-start:
+	DISPLAY=:0 npx cypress open
+
+cypress-run:
+	npx cypress run

README.md

@@ -13,7 +13,7 @@
 ## 📋 Overview

-Scraperr enables you to extract data from websites with precision using XPath selectors. This self-hosted application provides a clean interface to manage scraping jobs, view results, and export data.
+Scrape websites without writing a single line of code.

 > 📚 **[Check out the docs](https://scraperr-docs.pages.dev)** for a comprehensive quickstart guide and detailed information.
@@ -29,7 +29,7 @@ Scraperr enables you to extract data from websites with precision using XPath se
 - **Custom Headers**: Add JSON headers to your scraping requests
 - **Media Downloads**: Automatically download images, videos, and other media
 - **Results Visualization**: View scraped data in a structured table format
-- **Data Export**: Export your results in various formats
+- **Data Export**: Export your results in markdown and csv formats
 - **Notification Channels**: Send completion notifications through various channels

 ## 🚀 Getting Started

(new file)

@@ -0,0 +1,6 @@
from typing_extensions import TypedDict
class Action(TypedDict):
type: str
url: str

(new file)

@@ -0,0 +1,93 @@
# STL
import random
from typing import Any
# PDM
from camoufox import AsyncCamoufox
from playwright.async_api import Page
# LOCAL
from api.backend.ai.clients import ask_ollama, ask_open_ai, open_ai_key
from api.backend.job.models import CapturedElement
from api.backend.worker.logger import LOG
from api.backend.ai.agent.utils import (
parse_response,
capture_elements,
convert_to_markdown,
)
from api.backend.ai.agent.prompts import (
EXTRACT_ELEMENTS_PROMPT,
ELEMENT_EXTRACTION_PROMPT,
)
from api.backend.job.scraping.add_custom import add_custom_items
from api.backend.job.scraping.collect_media import collect_media
ask_ai = ask_open_ai if open_ai_key else ask_ollama
async def scrape_with_agent(agent_job: dict[str, Any]):
LOG.info(f"Starting work for agent job: {agent_job}")
pages = set()
if agent_job["job_options"]["proxies"]:
proxy = random.choice(agent_job["job_options"]["proxies"])
LOG.info(f"Using proxy: {proxy}")
async with AsyncCamoufox(headless=True) as browser:
page: Page = await browser.new_page()
await add_custom_items(
agent_job["url"],
page,
agent_job["job_options"]["custom_cookies"],
agent_job["job_options"]["custom_headers"],
)
try:
await page.set_viewport_size({"width": 1920, "height": 1080})
await page.goto(agent_job["url"], timeout=60000)
if agent_job["job_options"]["collect_media"]:
await collect_media(agent_job["id"], page)
html_content = await page.content()
markdown_content = convert_to_markdown(html_content)
response = await ask_ai(
ELEMENT_EXTRACTION_PROMPT.format(
extraction_prompt=EXTRACT_ELEMENTS_PROMPT,
webpage=markdown_content,
prompt=agent_job["prompt"],
)
)
xpaths = parse_response(response)
captured_elements = await capture_elements(
page, xpaths, agent_job["job_options"]["return_html"]
)
final_url = page.url
pages.add((html_content, final_url))
finally:
await page.close()
await browser.close()
name_to_elements = {}
for page in pages:
for element in captured_elements:
if element.name not in name_to_elements:
name_to_elements[element.name] = []
name_to_elements[element.name].append(element)
scraped_elements: list[dict[str, dict[str, list[CapturedElement]]]] = [
{
page[1]: name_to_elements,
}
for page in pages
]
return scraped_elements

api/backend/ai/agent/prompts.py (new file)

@@ -0,0 +1,58 @@
EXTRACT_ELEMENTS_PROMPT = """
You are an assistant that extracts XPath expressions from webpages.
You will receive HTML content in markdown format.
Each element in the markdown has its XPath shown above it in a path like:
<!-- //div -->
Respond only with a list of general XPath expressions inside `<xpaths>...</xpaths>` tags.
You will also decide what to do next. If no decision is available, return nothing for that section.
"""
ELEMENT_EXTRACTION_PROMPT = """
{extraction_prompt}
**Guidelines:**
- Prefer shorter, more general XPaths like `//div[...]` or `//span[...]`.
- Avoid overly specific or deep paths like `//div[3]/ul/li[2]/a`.
- Do **not** chain multiple elements deeply (e.g., `//div/span/a`).
- Use XPaths further down the tree when possible.
- Do not include any extra explanation or text.
- One XPath is acceptable if that's all that's needed.
- Try and limit it down to 1 - 3 xpaths.
- Include a name for each xpath.
<important>
- USE THE MOST SIMPLE XPATHS POSSIBLE.
- USE THE MOST GENERAL XPATHS POSSIBLE.
- USE THE MOST SPECIFIC XPATHS POSSIBLE.
- USE THE MOST GENERAL XPATHS POSSIBLE.
</important>
**Example Format:**
```xml
<xpaths>
- <name: insert_name_here>: <xpath: //div>
- <name: insert_name_here>: <xpath: //span>
- <name: insert_name_here>: <xpath: //span[contains(@text, 'example')]>
- <name: insert_name_here>: <xpath: //div[contains(@text, 'example')]>
- <name: insert_name_here>: <xpath: //a[@href]>
- etc
</xpaths>
<decision>
<next_page>
- //a[@href='next_page_url']
</next_page>
</decision>
```
**Input webpage:**
{webpage}
**Target content:**
{prompt}
"""

api/backend/ai/agent/utils.py (new file)

@@ -0,0 +1,272 @@
# STL
import re
# PDM
from lxml import html, etree
from playwright.async_api import Page
# LOCAL
from api.backend.job.models import CapturedElement
from api.backend.job.utils.text_utils import clean_text
def convert_to_markdown(html_str: str):
parser = html.HTMLParser()
tree = html.fromstring(html_str, parser=parser)
root = tree.getroottree()
def format_attributes(el: etree._Element) -> str:
"""Convert element attributes into a string."""
return " ".join(f'{k}="{v}"' for k, v in el.attrib.items())
def is_visible(el: etree._Element) -> bool:
style = el.attrib.get("style", "").lower()
class_ = el.attrib.get("class", "").lower()
# Check for visibility styles
if "display: none" in style or "visibility: hidden" in style:
return False
if "opacity: 0" in style or "opacity:0" in style:
return False
if "height: 0" in style or "width: 0" in style:
return False
# Check for common hidden classes
if any(
hidden in class_
for hidden in ["hidden", "invisible", "truncate", "collapse"]
):
return False
# Check for hidden attributes
if el.attrib.get("hidden") is not None:
return False
if el.attrib.get("aria-hidden") == "true":
return False
# Check for empty or whitespace-only content
if not el.text and len(el) == 0:
return False
return True
def is_layout_or_decorative(el: etree._Element) -> bool:
tag = el.tag.lower()
# Layout elements
if tag in {"nav", "footer", "header", "aside", "main", "section"}:
return True
# Decorative elements
if tag in {"svg", "path", "circle", "rect", "line", "polygon", "polyline"}:
return True
# Check id and class for layout/decorative keywords
id_class = " ".join(
[el.attrib.get("id", ""), el.attrib.get("class", "")]
).lower()
layout_keywords = {
"sidebar",
"nav",
"header",
"footer",
"menu",
"advert",
"ads",
"breadcrumb",
"container",
"wrapper",
"layout",
"grid",
"flex",
"row",
"column",
"section",
"banner",
"hero",
"card",
"modal",
"popup",
"tooltip",
"dropdown",
"overlay",
}
return any(keyword in id_class for keyword in layout_keywords)
# Tags to keep in the final markdown output
included_tags = {
"div",
"span",
"a",
"p",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"img",
"button",
"input",
"textarea",
"ul",
"ol",
"li",
"table",
"tr",
"td",
"th",
"input",
"textarea",
"select",
"option",
"optgroup",
"fieldset",
"legend",
}
special_elements = []
normal_elements = []
for el in tree.iter():
if el.tag is etree.Comment:
continue
tag = el.tag.lower()
if tag not in included_tags:
continue
if not is_visible(el):
continue
if is_layout_or_decorative(el):
continue
path = root.getpath(el)
attrs = format_attributes(el)
attrs_str = f" {attrs}" if attrs else ""
text = el.text.strip() if el.text else ""
if not text and not attrs:
continue
# input elements
if tag == "button":
prefix = "🔘 **<button>**"
special_elements.append(f"<!-- {path} -->\n{prefix} {text}")
elif tag == "a":
href = el.attrib.get("href", "")
prefix = f"🔗 **<a href='{href}'>**"
special_elements.append(f"<!-- {path} -->\n{prefix} {text}")
elif tag == "input":
input_type = el.attrib.get("type", "text")
prefix = f"📝 **<input type='{input_type}'>**"
special_elements.append(f"<!-- {path} -->\n{prefix}")
else:
prefix = f"**<{tag}{attrs_str}>**"
if text:
normal_elements.append(f"<!-- {path} -->\n{prefix} {text}")
return "\n\n".join(normal_elements + special_elements) # type: ignore
def parse_response(text: str) -> list[dict[str, str]]:
xpaths = re.findall(r"<xpaths>(.*?)</xpaths>", text, re.DOTALL)
results = []
if xpaths:
lines = xpaths[0].strip().splitlines()
for line in lines:
if line.strip().startswith("-"):
name = re.findall(r"<name: (.*?)>", line)[0]
xpath = re.findall(r"<xpath: (.*?)>", line)[0]
results.append({"name": name, "xpath": xpath})
else:
results.append({"name": "", "xpath": line.strip()})
return results
def parse_next_page(text: str) -> str | None:
next_page = re.findall(r"<next_page>(.*?)</next_page>", text, re.DOTALL)
if next_page:
lines = next_page[0].strip().splitlines()
next_page = [
line.strip().lstrip("-").strip()
for line in lines
if line.strip().startswith("-")
]
return next_page[0] if next_page else None
async def capture_elements(
page: Page, xpaths: list[dict[str, str]], return_html: bool
) -> list[CapturedElement]:
captured_elements = []
seen_texts = set()
for xpath in xpaths:
try:
locator = page.locator(f"xpath={xpath['xpath']}")
count = await locator.count()
for i in range(count):
if return_html:
element_text = (
await page.locator(f"xpath={xpath['xpath']}")
.nth(i)
.inner_html()
)
seen_texts.add(element_text)
captured_elements.append(
CapturedElement(
name=xpath["name"],
text=element_text,
xpath=xpath["xpath"],
)
)
continue
element_text = ""
element_handle = await locator.nth(i).element_handle()
if not element_handle:
continue
link = await element_handle.get_attribute("href") or ""
text = await element_handle.text_content()
if text:
element_text += text
if link:
element_text += f" ({link})"
cleaned = clean_text(element_text)
if cleaned in seen_texts:
continue
seen_texts.add(cleaned)
captured_elements.append(
CapturedElement(
name=xpath["name"],
text=cleaned,
xpath=xpath["xpath"],
)
)
except Exception as e:
print(f"Error processing xpath {xpath}: {e}")
return captured_elements
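
For reference, `parse_response` above consumes the `<xpaths>` block format defined in prompts.py. A small usage sketch (the sample input is illustrative, built from the example format in the prompt; the import path follows the diff's own imports):

```python
# Usage sketch for parse_response, fed the example format from prompts.py.
from api.backend.ai.agent.utils import parse_response

sample = """<xpaths>
- <name: title>: <xpath: //h1>
- <name: links>: <xpath: //a[@href]>
</xpaths>"""

# Each "- <name: ...>: <xpath: ...>" line becomes one name/xpath dict.
assert parse_response(sample) == [
    {"name": "title", "xpath": "//h1"},
    {"name": "links", "xpath": "//a[@href]"},
]
```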

api/backend/ai/ai_router.py

@@ -1,32 +1,28 @@
 # STL
-import os
 import logging
 from collections.abc import Iterable, AsyncGenerator

 # PDM
-from openai import OpenAI
+from ollama import Message
 from fastapi import APIRouter
 from fastapi.responses import JSONResponse, StreamingResponse
 from openai.types.chat import ChatCompletionMessageParam

 # LOCAL
-from ollama import Message, AsyncClient
-from api.backend.models import AI
+from api.backend.ai.clients import (
+    llama_model,
+    open_ai_key,
+    llama_client,
+    open_ai_model,
+    openai_client,
+)
+from api.backend.ai.schemas import AI
+from api.backend.routers.handle_exceptions import handle_exceptions

-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("AI")

 ai_router = APIRouter()

-# Load environment variables
-open_ai_key = os.getenv("OPENAI_KEY")
-open_ai_model = os.getenv("OPENAI_MODEL")
-llama_url = os.getenv("OLLAMA_URL")
-llama_model = os.getenv("OLLAMA_MODEL")
-
-# Initialize clients
-openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None
-llama_client = AsyncClient(host=llama_url) if llama_url else None
-
 async def llama_chat(chat_messages: list[Message]) -> AsyncGenerator[str, None]:
     if llama_client and llama_model:
@@ -67,6 +63,7 @@ chat_function = llama_chat if llama_client else openai_chat

 @ai_router.post("/ai")
+@handle_exceptions(logger=LOG)
 async def ai(c: AI):
     return StreamingResponse(
         chat_function(chat_messages=c.messages), media_type="text/plain"
@@ -74,5 +71,6 @@ async def ai(c: AI):

 @ai_router.get("/ai/check")
+@handle_exceptions(logger=LOG)
 async def check():
     return JSONResponse(content={"ai_enabled": bool(open_ai_key or llama_model)})

api/backend/ai/clients.py (new file)

@@ -0,0 +1,39 @@
# STL
import os
# PDM
from ollama import AsyncClient
from openai import OpenAI
# Load environment variables
open_ai_key = os.getenv("OPENAI_KEY")
open_ai_model = os.getenv("OPENAI_MODEL")
llama_url = os.getenv("OLLAMA_URL")
llama_model = os.getenv("OLLAMA_MODEL")
# Initialize clients
openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None
llama_client = AsyncClient(host=llama_url) if llama_url else None
async def ask_open_ai(prompt: str) -> str:
if not openai_client:
raise ValueError("OpenAI client not initialized")
response = openai_client.chat.completions.create(
model=open_ai_model or "gpt-4.1-mini",
messages=[{"role": "user", "content": prompt}],
)
return response.choices[0].message.content or ""
async def ask_ollama(prompt: str) -> str:
if not llama_client:
raise ValueError("Ollama client not initialized")
response = await llama_client.chat(
model=llama_model or "", messages=[{"role": "user", "content": prompt}]
)
return response.message.content or ""
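
Elsewhere in this PR the agent module picks a backend with `ask_ai = ask_open_ai if open_ai_key else ask_ollama`. A minimal usage sketch of these clients (the prompt string is illustrative, not from the repository):

```python
# Minimal usage sketch of api/backend/ai/clients.py; mirrors the fallback the
# agent module uses (OpenAI when OPENAI_KEY is set, otherwise Ollama).
import asyncio

from api.backend.ai.clients import ask_ollama, ask_open_ai, open_ai_key

ask_ai = ask_open_ai if open_ai_key else ask_ollama


async def main() -> None:
    # Illustrative prompt; both clients return the model's text or "".
    print(await ask_ai("Summarize this webpage in one sentence."))


asyncio.run(main())
```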

api/backend/ai/schemas/__init__.py (new file)

@@ -0,0 +1,4 @@
# LOCAL
from .ai import AI
__all__ = ["AI"]

api/backend/ai/schemas/ai.py (new file)

@@ -0,0 +1,9 @@
# STL
from typing import Any
# PDM
import pydantic
class AI(pydantic.BaseModel):
messages: list[Any]

api/backend/app.py

@@ -1,39 +1,60 @@
 # STL
 import os
 import logging
-import apscheduler  # type: ignore
+from contextlib import asynccontextmanager

 # PDM
-import apscheduler.schedulers
-import apscheduler.schedulers.background
 from fastapi import FastAPI, Request, status
+from fastapi.responses import JSONResponse
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware

 # LOCAL
-from api.backend.ai.ai_router import ai_router
-from api.backend.auth.auth_router import auth_router
 from api.backend.utils import get_log_level
-from api.backend.routers.job_router import job_router
-from api.backend.routers.stats_router import stats_router
-from api.backend.database.startup import init_database
-from fastapi.responses import JSONResponse
-from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler
 from api.backend.scheduler import scheduler
+from api.backend.ai.ai_router import ai_router
+from api.backend.job.job_router import job_router
+from api.backend.auth.auth_router import auth_router
+from api.backend.database.startup import init_database
+from api.backend.stats.stats_router import stats_router
+from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler

 log_level = os.getenv("LOG_LEVEL")
 LOG_LEVEL = get_log_level(log_level)

 logging.basicConfig(
     level=LOG_LEVEL,
-    format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
+    format="%(levelname)s: %(asctime)s - [%(name)s] - %(message)s",
     handlers=[logging.StreamHandler()],
 )

 LOG = logging.getLogger(__name__)

-app = FastAPI(title="api", root_path="/api")
+
+@asynccontextmanager
+async def lifespan(_: FastAPI):
+    # Startup
+    LOG.info("Starting application...")
+    init_database()
+
+    LOG.info("Starting cron scheduler...")
+    start_cron_scheduler(scheduler)
+    scheduler.start()
+    LOG.info("Cron scheduler started successfully")
+
+    yield
+
+    # Shutdown
+    LOG.info("Shutting down application...")
+    LOG.info("Stopping cron scheduler...")
+    scheduler.shutdown(wait=False)  # Set wait=False to not block shutdown
+    LOG.info("Cron scheduler stopped")
+    LOG.info("Application shutdown complete")
+
+
+app = FastAPI(title="api", root_path="/api", lifespan=lifespan)

 app.add_middleware(
     CORSMiddleware,
@@ -43,28 +64,12 @@ app.add_middleware(
     allow_headers=["*"],
 )

 app.include_router(auth_router)
 app.include_router(ai_router)
 app.include_router(job_router)
 app.include_router(stats_router)

-
-@app.on_event("startup")
-async def startup_event():
-    start_cron_scheduler(scheduler)
-    scheduler.start()
-
-    if os.getenv("ENV") != "test":
-        init_database()
-
-    LOG.info("Starting up...")
-
-
-@app.on_event("shutdown")
-def shutdown_scheduler():
-    scheduler.shutdown(wait=False)  # Set wait=False to not block shutdown
-

 @app.exception_handler(RequestValidationError)
 async def validation_exception_handler(request: Request, exc: RequestValidationError):
     exc_str = f"{exc}".replace("\n", " ").replace("   ", " ")

api/backend/auth/auth_router.py

@@ -1,13 +1,14 @@
 # STL
-from datetime import timedelta
 import os
+import logging
+from datetime import timedelta

 # PDM
 from fastapi import Depends, APIRouter, HTTPException, status
 from fastapi.security import OAuth2PasswordRequestForm

 # LOCAL
-from api.backend.schemas import User, Token, UserCreate
+from api.backend.auth.schemas import User, Token, UserCreate
 from api.backend.auth.auth_utils import (
     ACCESS_TOKEN_EXPIRE_MINUTES,
     get_current_user,
@@ -15,18 +16,19 @@ from api.backend.auth.auth_utils import (
     get_password_hash,
     create_access_token,
 )
-import logging

 from api.backend.database.common import update
+from api.backend.routers.handle_exceptions import handle_exceptions

 auth_router = APIRouter()

-LOG = logging.getLogger("auth_router")
+LOG = logging.getLogger("Auth")


 @auth_router.post("/auth/token", response_model=Token)
+@handle_exceptions(logger=LOG)
 async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
     user = await authenticate_user(form_data.username, form_data.password)

     if not user:
         raise HTTPException(
             status_code=status.HTTP_401_UNAUTHORIZED,
@@ -47,6 +49,7 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(

 @auth_router.post("/auth/signup", response_model=User)
+@handle_exceptions(logger=LOG)
 async def create_user(user: UserCreate):
     hashed_password = get_password_hash(user.password)
     user_dict = user.model_dump()
@@ -60,10 +63,16 @@ async def create_user(user: UserCreate):

 @auth_router.get("/auth/users/me", response_model=User)
+@handle_exceptions(logger=LOG)
 async def read_users_me(current_user: User = Depends(get_current_user)):
     return current_user


 @auth_router.get("/auth/check")
+@handle_exceptions(logger=LOG)
 async def check_auth():
-    return {"registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True"}
+    return {
+        "registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True",
+        "recordings_enabled": os.environ.get("RECORDINGS_ENABLED", "true").lower()
+        == "true",
+    }

api/backend/auth/auth_utils.py

@@ -1,8 +1,8 @@
 # STL
 import os
+import logging
 from typing import Any, Optional
 from datetime import datetime, timedelta
-import logging

 # PDM
 from jose import JWTError, jwt
@@ -12,11 +12,10 @@ from passlib.context import CryptContext
 from fastapi.security import OAuth2PasswordBearer

 # LOCAL
-from api.backend.schemas import User, UserInDB, TokenData
+from api.backend.auth.schemas import User, UserInDB, TokenData
 from api.backend.database.common import query

-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("Auth")

 _ = load_dotenv()
@@ -118,7 +117,8 @@ async def get_current_user(token: str = Depends(oauth2_scheme)):
         LOG.error(f"Exception occurred: {e}")
         return EMPTY_USER

-    user = await get_user(email=token_data.email)
+    user = await get_user(email=token_data.email or "")
+
     if user is None:
         return EMPTY_USER
@@ -136,6 +136,7 @@ async def require_user(token: str = Depends(oauth2_scheme)):
         payload: Optional[dict[str, Any]] = jwt.decode(
             token, SECRET_KEY, algorithms=[ALGORITHM]
         )
+
         if not payload:
             raise credentials_exception
@@ -149,7 +150,7 @@ async def require_user(token: str = Depends(oauth2_scheme)):
     except JWTError:
         raise credentials_exception

-    user = await get_user(email=token_data.email)
+    user = await get_user(email=token_data.email or "")

     if user is None:
         raise credentials_exception

api/backend/auth/schemas/__init__.py (new file)

@@ -0,0 +1,4 @@
# LOCAL
from .auth import User, Token, UserInDB, TokenData, UserCreate
__all__ = ["User", "Token", "UserInDB", "TokenData", "UserCreate"]

api/backend/constants.py

@@ -1 +1,24 @@
+# STL
+import os
+from pathlib import Path
+
 DATABASE_PATH = "data/database.db"
+RECORDINGS_DIR = Path("media/recordings")
+RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
+MEDIA_DIR = Path("media")
+MEDIA_TYPES = [
+    "audio",
+    "documents",
+    "images",
+    "pdfs",
+    "presentations",
+    "spreadsheets",
+    "videos",
+]
+REGISTRATION_ENABLED = os.getenv("REGISTRATION_ENABLED", "true").lower() == "true"
+DEFAULT_USER_EMAIL = os.getenv("DEFAULT_USER_EMAIL")
+DEFAULT_USER_PASSWORD = os.getenv("DEFAULT_USER_PASSWORD")
+DEFAULT_USER_FULL_NAME = os.getenv("DEFAULT_USER_FULL_NAME")
+LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

api/backend/database/__init__.py

@@ -1,3 +1,5 @@
-from .common import insert, QUERIES, update
+# LOCAL
+from .common import insert, update, connect
+from .schema import INIT_QUERY

-__all__ = ["insert", "QUERIES", "update"]
+__all__ = ["insert", "update", "INIT_QUERY", "connect"]

api/backend/database/common.py

@@ -1,12 +1,13 @@
+# STL
+import logging
 import sqlite3
 from typing import Any, Optional

-from api.backend.constants import DATABASE_PATH
-from api.backend.utils import format_json, format_sql_row_to_python
-from api.backend.database.schema import INIT_QUERY
-from api.backend.database.queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
-import logging
-
-LOG = logging.getLogger(__name__)
+# LOCAL
+from api.backend.constants import DATABASE_PATH
+from api.backend.database.utils import format_json, format_sql_row_to_python
+
+LOG = logging.getLogger("Database")


 def connect():
@@ -25,8 +26,10 @@ def insert(query: str, values: tuple[Any, ...]):
     try:
         _ = cursor.execute(query, copy)
         connection.commit()
+
     except sqlite3.Error as e:
         LOG.error(f"An error occurred: {e}")
+
     finally:
         cursor.close()
         connection.close()
@@ -78,15 +81,9 @@ def update(query: str, values: Optional[tuple[Any, ...]] = None):
         return res.rowcount
     except sqlite3.Error as e:
         LOG.error(f"An error occurred: {e}")
+
     finally:
         cursor.close()
         connection.close()

     return 0
-
-
-QUERIES = {
-    "init": INIT_QUERY,
-    "insert_job": JOB_INSERT_QUERY,
-    "delete_job": DELETE_JOB_QUERY,
-}

api/backend/database/queries/__init__.py

@@ -1,3 +1,4 @@
-from .queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
+# LOCAL
+from .job.job_queries import DELETE_JOB_QUERY, JOB_INSERT_QUERY

 __all__ = ["JOB_INSERT_QUERY", "DELETE_JOB_QUERY"]

api/backend/database/queries/job/job_queries.py (new file)

@@ -0,0 +1,68 @@
# STL
import logging
from typing import Any
# LOCAL
from api.backend.database.utils import format_list_for_query
from api.backend.database.common import query, insert, update
JOB_INSERT_QUERY = """
INSERT INTO jobs
(id, url, elements, user, time_created, result, status, chat, job_options, agent_mode, prompt)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
DELETE_JOB_QUERY = """
DELETE FROM jobs WHERE id IN ()
"""
LOG = logging.getLogger("Database")
def insert_job(item: dict[str, Any]) -> None:
insert(
JOB_INSERT_QUERY,
(
item["id"],
item["url"],
item["elements"],
item["user"],
item["time_created"],
item["result"],
item["status"],
item["chat"],
item["job_options"],
item["agent_mode"],
item["prompt"],
),
)
LOG.info(f"Inserted item: {item}")
async def get_queued_job():
queued_job_query = (
"SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
)
res = query(queued_job_query)
LOG.info(f"Got queued job: {res}")
return res[0] if res else None
async def update_job(ids: list[str], field: str, value: Any):
query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
res = update(query, tuple([value] + ids))
LOG.info(f"Updated job: {res}")
async def delete_jobs(jobs: list[str]):
if not jobs:
LOG.info("No jobs to delete.")
return False
query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}"
res = update(query, tuple(jobs))
LOG.info(f"Deleted jobs: {res}")
return res

api/backend/database/queries/queries.py (deleted file)

@@ -1,9 +0,0 @@
JOB_INSERT_QUERY = """
INSERT INTO jobs
(id, url, elements, user, time_created, result, status, chat, job_options)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
DELETE_JOB_QUERY = """
DELETE FROM jobs WHERE id IN ()
"""

(new file)

@@ -0,0 +1,41 @@
# LOCAL
from api.backend.database.common import query
async def average_elements_per_link(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
AVG(json_array_length(elements)) AS average_elements,
COUNT(*) AS count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = query(job_query, (user,))
return results
async def get_jobs_per_day(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
COUNT(*) AS job_count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = query(job_query, (user,))
return results

api/backend/database/schema.py

@@ -27,4 +27,8 @@ CREATE TABLE IF NOT EXISTS cron_jobs (
     time_updated DATETIME NOT NULL,
     FOREIGN KEY (job_id) REFERENCES jobs(id)
 );
+
+ALTER TABLE jobs ADD COLUMN agent_mode BOOLEAN NOT NULL DEFAULT FALSE;
+ALTER TABLE jobs ADD COLUMN prompt STRING;
+ALTER TABLE jobs ADD COLUMN favorite BOOLEAN NOT NULL DEFAULT FALSE;
 """

api/backend/database/startup.py

@@ -1,24 +1,52 @@
-import os
-from api.backend.database.common import connect, QUERIES, insert
+# STL
 import logging
+import sqlite3
+
+# LOCAL
+from api.backend.constants import (
+    DEFAULT_USER_EMAIL,
+    REGISTRATION_ENABLED,
+    DEFAULT_USER_PASSWORD,
+    DEFAULT_USER_FULL_NAME,
+)
 from api.backend.auth.auth_utils import get_password_hash
+from api.backend.database.common import insert, connect
+from api.backend.database.schema import INIT_QUERY

-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("Database")


-def init_database():
+def execute_startup_query():
     cursor = connect()

-    for query in QUERIES["init"].strip().split(";"):
-        if query.strip():
+    for query in INIT_QUERY.strip().split(";"):
+        query = query.strip()
+
+        if not query:
+            continue
+
+        try:
             LOG.info(f"Executing query: {query}")
             _ = cursor.execute(query)
-
-    if os.environ.get("REGISTRATION_ENABLED", "True") == "False":
-        default_user_email = os.environ.get("DEFAULT_USER_EMAIL")
-        default_user_password = os.environ.get("DEFAULT_USER_PASSWORD")
-        default_user_full_name = os.environ.get("DEFAULT_USER_FULL_NAME")
+        except sqlite3.OperationalError as e:
+            if "duplicate column name" in str(e).lower():
+                LOG.warning(f"Skipping duplicate column error: {e}")
+                continue
+            else:
+                LOG.error(f"Error executing query: {query}")
+                raise
+
+    cursor.close()
+
+
+def init_database():
+    execute_startup_query()
+
+    if not REGISTRATION_ENABLED:
+        default_user_email = DEFAULT_USER_EMAIL
+        default_user_password = DEFAULT_USER_PASSWORD
+        default_user_full_name = DEFAULT_USER_FULL_NAME
+
         if (
             not default_user_email
@@ -39,5 +67,3 @@ def init_database():
             default_user_full_name,
         ),
     )
-
-    cursor.close()

api/backend/database/utils.py (new file)

@@ -0,0 +1,30 @@
# STL
import json
from typing import Any
def format_list_for_query(ids: list[str]):
return (
f"({','.join(['?' for _ in ids])})" # Returns placeholders, e.g., "(?, ?, ?)"
)
def format_sql_row_to_python(row: dict[str, Any]):
new_row: dict[str, Any] = {}
for key, value in row.items():
if isinstance(value, str):
try:
new_row[key] = json.loads(value)
except json.JSONDecodeError:
new_row[key] = value
else:
new_row[key] = value
return new_row
def format_json(items: list[Any]):
for idx, item in enumerate(items):
if isinstance(item, (dict, list)):
formatted_item = json.dumps(item)
items[idx] = formatted_item
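
These helpers pair placeholder generation with parameter binding: `format_list_for_query` only ever emits `?` markers, and the values travel separately as bound parameters, which is how the job queries elsewhere in this PR use it. A small sketch:

```python
# How the job queries in this PR combine these helpers: the id list only ever
# becomes "?" placeholders; the ids themselves are passed as bound parameters.
from api.backend.database.utils import format_list_for_query

ids = ["a1", "b2", "c3"]
query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(ids)}"
assert query == "DELETE FROM jobs WHERE id IN (?,?,?)"
# Then executed as: update(query, tuple(ids))
```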

api/backend/job/__init__.py

@@ -1,17 +1,9 @@
-from .job import (
-    insert,
-    update_job,
-    delete_jobs,
-    get_jobs_per_day,
-    get_queued_job,
-    average_elements_per_link,
-)
+# LOCAL
+from .job import insert, update_job, delete_jobs, get_queued_job

 __all__ = [
     "insert",
     "update_job",
     "delete_jobs",
-    "get_jobs_per_day",
     "get_queued_job",
-    "average_elements_per_link",
 ]

api/backend/job/cron_scheduling/cron_scheduling.py

@@ -1,15 +1,19 @@
+# STL
+import uuid
+import logging
 import datetime
 from typing import Any
-import uuid

-from api.backend.database.common import insert, query
-from api.backend.models import CronJob
-from apscheduler.schedulers.background import BackgroundScheduler  # type: ignore
-from apscheduler.triggers.cron import CronTrigger  # type: ignore
+# PDM
+from apscheduler.triggers.cron import CronTrigger
+from apscheduler.schedulers.background import BackgroundScheduler

+# LOCAL
 from api.backend.job import insert as insert_job
-import logging
+from api.backend.schemas.cron import CronJob
+from api.backend.database.common import query, insert

-LOG = logging.getLogger("Cron Scheduler")
+LOG = logging.getLogger("Cron")


 def insert_cron_job(cron_job: CronJob):
@@ -17,6 +21,7 @@ def insert_cron_job(cron_job: CronJob):
     INSERT INTO cron_jobs (id, user_email, job_id, cron_expression, time_created, time_updated)
     VALUES (?, ?, ?, ?, ?, ?)
     """
+
     values = (
         cron_job.id,
         cron_job.user_email,
@@ -36,6 +41,7 @@ def delete_cron_job(id: str, user_email: str):
     DELETE FROM cron_jobs
     WHERE id = ? AND user_email = ?
     """
+
     values = (id, user_email)

     insert(query, values)

api/backend/job/job.py

@@ -3,20 +3,18 @@ import logging
 from typing import Any

 # LOCAL
-from api.backend.utils import format_list_for_query
-from api.backend.database.common import (
-    insert as common_insert,
-    query as common_query,
-    QUERIES,
-    update as common_update,
-)
+from api.backend.database.utils import format_list_for_query
+from api.backend.database.common import query as common_query
+from api.backend.database.common import insert as common_insert
+from api.backend.database.common import update as common_update
+from api.backend.database.queries.job.job_queries import JOB_INSERT_QUERY

-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("Job")


 def insert(item: dict[str, Any]) -> None:
     common_insert(
-        QUERIES["insert_job"],
+        JOB_INSERT_QUERY,
         (
             item["id"],
             item["url"],
@@ -27,9 +25,12 @@ def insert(item: dict[str, Any]) -> None:
             item["status"],
             item["chat"],
             item["job_options"],
+            item["agent_mode"],
+            item["prompt"],
         ),
     )
-    LOG.info(f"Inserted item: {item}")
+
+    LOG.debug(f"Inserted item: {item}")


 async def get_queued_job():
@@ -37,61 +38,22 @@ async def get_queued_job():
         "SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
     )
     res = common_query(query)
-    LOG.info(f"Got queued job: {res}")
+    LOG.debug(f"Got queued job: {res}")
     return res[0] if res else None


 async def update_job(ids: list[str], field: str, value: Any):
     query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
     res = common_update(query, tuple([value] + ids))
-    LOG.info(f"Updated job: {res}")
+    LOG.debug(f"Updated job: {res}")


 async def delete_jobs(jobs: list[str]):
     if not jobs:
-        LOG.info("No jobs to delete.")
+        LOG.debug("No jobs to delete.")
         return False

     query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}"
     res = common_update(query, tuple(jobs))

     return res > 0
-
-
-async def average_elements_per_link(user: str):
-    job_query = """
-    SELECT
-        DATE(time_created) AS date,
-        AVG(json_array_length(elements)) AS average_elements,
-        COUNT(*) AS count
-    FROM
-        jobs
-    WHERE
-        status = 'Completed' AND user = ?
-    GROUP BY
-        DATE(time_created)
-    ORDER BY
-        date ASC;
-    """
-    results = common_query(job_query, (user,))
-
-    return results
-
-
-async def get_jobs_per_day(user: str):
-    job_query = """
-    SELECT
-        DATE(time_created) AS date,
-        COUNT(*) AS job_count
-    FROM
-        jobs
-    WHERE
-        status = 'Completed' AND user = ?
-    GROUP BY
-        DATE(time_created)
-    ORDER BY
-        date ASC;
-    """
-    results = common_query(job_query, (user,))
-
-    return results

api/backend/job/job_router.py (new file)

@@ -0,0 +1,248 @@
# STL
import csv
import uuid
import random
import logging
import datetime
from io import StringIO

# PDM
from fastapi import Depends, APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from apscheduler.triggers.cron import CronTrigger  # type: ignore

# LOCAL
from api.backend.job import insert, update_job, delete_jobs
from api.backend.constants import MEDIA_DIR, MEDIA_TYPES, RECORDINGS_DIR
from api.backend.scheduler import scheduler
from api.backend.schemas.job import Job, UpdateJobs, DownloadJob, DeleteScrapeJobs
from api.backend.auth.schemas import User
from api.backend.schemas.cron import CronJob, DeleteCronJob
from api.backend.database.utils import format_list_for_query
from api.backend.auth.auth_utils import get_current_user
from api.backend.database.common import query
from api.backend.job.utils.text_utils import clean_text
from api.backend.job.models.job_options import FetchOptions
from api.backend.routers.handle_exceptions import handle_exceptions
from api.backend.job.utils.clean_job_format import clean_job_format
from api.backend.job.cron_scheduling.cron_scheduling import (
    get_cron_jobs,
    delete_cron_job,
    insert_cron_job,
    get_cron_job_trigger,
    insert_job_from_cron_job,
)
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results

LOG = logging.getLogger("Job")

job_router = APIRouter()


@job_router.post("/update")
@handle_exceptions(logger=LOG)
async def update(update_jobs: UpdateJobs, _: User = Depends(get_current_user)):
    """Used to update jobs"""
    await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)
    return JSONResponse(content={"message": "Jobs updated successfully."})


@job_router.post("/submit-scrape-job")
@handle_exceptions(logger=LOG)
async def submit_scrape_job(job: Job):
    LOG.info(f"Received job: {job}")

    job.id = uuid.uuid4().hex
    job_dict = job.model_dump()
    insert(job_dict)

    return JSONResponse(
        content={"id": job.id, "message": "Job submitted successfully."}
    )


@job_router.post("/retrieve-scrape-jobs")
@handle_exceptions(logger=LOG)
async def retrieve_scrape_jobs(
    fetch_options: FetchOptions, user: User = Depends(get_current_user)
):
    LOG.info(f"Retrieving jobs for account: {user.email}")

    ATTRIBUTES = "chat" if fetch_options.chat else "*"
    job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?"
    results = query(job_query, (user.email,))

    return JSONResponse(content=jsonable_encoder(results[::-1]))


@job_router.get("/job/{id}")
@handle_exceptions(logger=LOG)
async def job(id: str, user: User = Depends(get_current_user)):
    LOG.info(f"Retrieving jobs for account: {user.email}")

    job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?"
    results = query(job_query, (user.email, id))

    return JSONResponse(content=jsonable_encoder(results))


@job_router.post("/download")
@handle_exceptions(logger=LOG)
async def download(download_job: DownloadJob):
    LOG.info(f"Downloading job with ids: {download_job.ids}")

    job_query = (
        f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}"
    )
    results = query(job_query, tuple(download_job.ids))

    if download_job.job_format == "csv":
        csv_buffer = StringIO()
        csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)

        headers = [
            "id",
            "url",
            "element_name",
            "xpath",
            "text",
            "user",
            "time_created",
        ]
        csv_writer.writerow(headers)

        for result in results:
            for res in result["result"]:
                for url, elements in res.items():
                    for element_name, values in elements.items():
                        for value in values:
                            text = clean_text(value.get("text", "")).strip()
                            if text:
                                csv_writer.writerow(
                                    [
                                        result.get("id", "")
                                        + "-"
                                        + str(random.randint(0, 1000000)),
                                        url,
                                        element_name,
                                        value.get("xpath", ""),
                                        text,
                                        result.get("user", ""),
                                        result.get("time_created", ""),
                                    ]
                                )

        _ = csv_buffer.seek(0)
        response = StreamingResponse(
            csv_buffer,
            media_type="text/csv",
        )
        response.headers["Content-Disposition"] = "attachment; filename=export.csv"
        return response

    elif download_job.job_format == "md":
        response = StreamingResponse(
            stream_md_from_job_results(results),
            media_type="text/markdown",
        )
        response.headers["Content-Disposition"] = "attachment; filename=export.md"
        return response


@job_router.get("/job/{id}/convert-to-csv")
@handle_exceptions(logger=LOG)
async def convert_to_csv(id: str):
    job_query = "SELECT * FROM jobs WHERE id = ?"
    results = query(job_query, (id,))
    return JSONResponse(content=clean_job_format(results))


@job_router.post("/delete-scrape-jobs")
@handle_exceptions(logger=LOG)
async def delete(delete_scrape_jobs: DeleteScrapeJobs):
    result = await delete_jobs(delete_scrape_jobs.ids)
    return (
        JSONResponse(content={"message": "Jobs successfully deleted."})
        if result
        else JSONResponse(content={"error": "Jobs not deleted."})
    )


@job_router.post("/schedule-cron-job")
@handle_exceptions(logger=LOG)
async def schedule_cron_job(cron_job: CronJob):
    if not cron_job.id:
        cron_job.id = uuid.uuid4().hex

    if not cron_job.time_created:
        cron_job.time_created = datetime.datetime.now()

    if not cron_job.time_updated:
        cron_job.time_updated = datetime.datetime.now()

    insert_cron_job(cron_job)

    queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,))

    scheduler.add_job(
        insert_job_from_cron_job,
        get_cron_job_trigger(cron_job.cron_expression),
        id=cron_job.id,
        args=[queried_job[0]],
    )

    return JSONResponse(content={"message": "Cron job scheduled successfully."})


@job_router.post("/delete-cron-job")
@handle_exceptions(logger=LOG)
async def delete_cron_job_request(request: DeleteCronJob):
    if not request.id:
        return JSONResponse(
            content={"error": "Cron job id is required."}, status_code=400
        )

    delete_cron_job(request.id, request.user_email)
    scheduler.remove_job(request.id)

    return JSONResponse(content={"message": "Cron job deleted successfully."})


@job_router.get("/cron-jobs")
@handle_exceptions(logger=LOG)
async def get_cron_jobs_request(user: User = Depends(get_current_user)):
    cron_jobs = get_cron_jobs(user.email)
    return JSONResponse(content=jsonable_encoder(cron_jobs))


@job_router.get("/recordings/{id}")
@handle_exceptions(logger=LOG)
async def get_recording(id: str):
    path = RECORDINGS_DIR / f"{id}.mp4"
    if not path.exists():
        return JSONResponse(content={"error": "Recording not found."}, status_code=404)

    return FileResponse(
        path, headers={"Content-Type": "video/mp4", "Accept-Ranges": "bytes"}
    )


@job_router.get("/get-media")
@handle_exceptions(logger=LOG)
async def get_media(id: str):
    files: dict[str, list[str]] = {}

    for media_type in MEDIA_TYPES:
        path = MEDIA_DIR / media_type / f"{id}"
        files[media_type] = [file.name for file in path.glob("*")]

    return JSONResponse(content={"files": files})


@job_router.get("/media")
@handle_exceptions(logger=LOG)
async def get_media_file(id: str, type: str, file: str):
    path = MEDIA_DIR / type / f"{id}" / file
    if not path.exists():
        return JSONResponse(content={"error": "Media file not found."}, status_code=404)

    return FileResponse(path)

View File

@@ -1,3 +1,5 @@
-from .job_options import JobOptions
+# LOCAL
+from .job import Element, CapturedElement
+from .job_options import Proxy, JobOptions

-__all__ = ["JobOptions"]
+__all__ = ["JobOptions", "CapturedElement", "Element", "Proxy"]

View File

@@ -0,0 +1,15 @@
from typing import Optional

import pydantic


class Element(pydantic.BaseModel):
    name: str
    xpath: str
    url: Optional[str] = None


class CapturedElement(pydantic.BaseModel):
    xpath: str
    text: str
    name: str

View File

@@ -1,8 +1,19 @@
-from pydantic import BaseModel
+# STL
 from typing import Any, Optional

+# PDM
+from pydantic import BaseModel
+
+# LOCAL
 from api.backend.job.models.site_map import SiteMap


+class Proxy(BaseModel):
+    server: str
+    username: Optional[str] = None
+    password: Optional[str] = None
+
+
 class FetchOptions(BaseModel):
     chat: Optional[bool] = None

@@ -10,7 +21,8 @@ class FetchOptions(BaseModel):
 class JobOptions(BaseModel):
     multi_page_scrape: bool = False
     custom_headers: dict[str, Any] = {}
-    proxies: list[str] = []
+    proxies: list[Proxy] = []
     site_map: Optional[SiteMap] = None
     collect_media: bool = False
     custom_cookies: list[dict[str, Any]] = []
+    return_html: bool = False
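The switch from `list[str]` to `list[Proxy]` means job options now carry structured proxy credentials rather than bare server strings. A small sketch of the new shape, assuming only the models above:

# Sketch: constructing the new structured proxy option.
from api.backend.job.models.job_options import JobOptions, Proxy

options = JobOptions(
    proxies=[Proxy(server="127.0.0.1:8080", username="user", password="pass")],
    return_html=True,
)
print(options.model_dump()["proxies"])
# [{'server': '127.0.0.1:8080', 'username': 'user', 'password': 'pass'}]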

View File

@@ -1,11 +1,12 @@
+# STL
+import logging
 from typing import Any, Optional
 from urllib.parse import urlparse

+# PDM
 from playwright.async_api import Page, BrowserContext

-import logging
-
-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("Job")


 async def add_custom_cookies(
@@ -18,8 +19,8 @@ async def add_custom_cookies(
     for cookie in custom_cookies:
         cookie_dict = {
-            "name": cookie.get("name", "default_name"),
-            "value": cookie.get("value", "default_value"),
+            "name": cookie.get("name", ""),
+            "value": cookie.get("value", ""),
             "domain": domain,
             "path": "/",
         }

View File

@@ -1,20 +1,24 @@
+# STL
 import os
-from pathlib import Path
-from urllib.parse import urlparse
+import re
+import logging
 from typing import Dict, List
+from pathlib import Path
+from urllib.parse import urljoin, urlparse

+# PDM
 import aiohttp
 from playwright.async_api import Page

-from api.backend.utils import LOG
+LOG = logging.getLogger("Job")


-async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
+async def collect_media(id: str, page: Page) -> dict[str, list[dict[str, str]]]:
     media_types = {
         "images": "img",
         "videos": "video",
         "audio": "audio",
-        "pdfs": 'a[href$=".pdf"]',
+        "pdfs": 'a[href$=".pdf"], a[href*=".pdf#page="]',
         "documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]',
         "presentations": 'a[href$=".ppt"], a[href$=".pptx"]',
         "spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]',
@@ -48,6 +52,11 @@ async def collect_media(id: str, page: Page) -> dict[str, list[dict[str, str]]]:
                     root_domain = f"{root_url.scheme}://{root_url.netloc}"
                     url = f"{root_domain}{url}"

+                if url and re.match(r"^[\w\-]+/", url):
+                    root_url = urlparse(page.url)
+                    root_domain = f"{root_url.scheme}://{root_url.netloc}"
+                    url = urljoin(root_domain + "/", url)
+
                 if url and url.startswith(("http://", "https://")):
                     try:
                         parsed = urlparse(url)
@@ -67,15 +76,20 @@ async def collect_media(id: str, page: Page) -> dict[str, list[dict[str, str]]]:
                     }.get(media_type, "")
                     filename += ext

-                file_path = media_dir / filename
+                if not os.path.exists(media_dir / id):
+                    os.makedirs(media_dir / id, exist_ok=True)
+
+                file_path = media_dir / id / f"{filename}"

                 async with session.get(url) as response:
                     response.raise_for_status()

                     with open(file_path, "wb") as f:
                         while True:
                             chunk = await response.content.read(8192)
                             if not chunk:
                                 break
                             f.write(chunk)

                     urls.append({"url": url, "local_path": str(file_path)})

View File

@@ -1,53 +1,45 @@
-import logging
+# STL
 import random
-from typing import Any, Optional, cast
+import logging
+from typing import Any, cast
+from urllib.parse import urljoin, urlparse

-from bs4 import BeautifulSoup, Tag
+# PDM
+from bs4 import Tag, BeautifulSoup
 from lxml import etree
 from camoufox import AsyncCamoufox
 from playwright.async_api import Page
-from urllib.parse import urlparse, urljoin

-from api.backend.models import Element, CapturedElement
-from api.backend.job.scraping.scraping_utils import scrape_content
+# LOCAL
+from api.backend.constants import RECORDINGS_ENABLED
+from api.backend.job.models import Element, CapturedElement
+from api.backend.job.utils.text_utils import clean_text
+from api.backend.job.scraping.add_custom import add_custom_items
+from api.backend.job.scraping.scraping_utils import (
+    sxpath,
+    is_same_domain,
+    scrape_content,
+)
 from api.backend.job.site_mapping.site_mapping import handle_site_mapping
-from api.backend.job.scraping.add_custom import add_custom_items

-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("Job")


-def is_same_domain(url: str, original_url: str) -> bool:
-    parsed_url = urlparse(url)
-    parsed_original_url = urlparse(original_url)
-    return parsed_url.netloc == parsed_original_url.netloc or parsed_url.netloc == ""
-
-
-def clean_xpath(xpath: str) -> str:
-    parts = xpath.split("/")
-    clean_parts = ["/" if part == "" else part for part in parts]
-    clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'")
-    LOG.info(f"Cleaned xpath: {clean_xpath}")
-    return clean_xpath
-
-
-def sxpath(context: etree._Element, xpath: str):
-    return context.xpath(xpath)
-
-
 async def make_site_request(
+    id: str,
     url: str,
-    headers: Optional[dict[str, Any]],
-    multi_page_scrape: bool = False,
+    job_options: dict[str, Any],
     visited_urls: set[str] = set(),
     pages: set[tuple[str, str]] = set(),
     original_url: str = "",
-    proxies: Optional[list[str]] = None,
-    site_map: Optional[dict[str, Any]] = None,
-    collect_media: bool = False,
-    custom_cookies: Optional[list[dict[str, Any]]] = None,
 ):
+    headers = job_options["custom_headers"]
+    multi_page_scrape = job_options["multi_page_scrape"]
+    proxies = job_options["proxies"]
+    site_map = job_options["site_map"]
+    collect_media = job_options["collect_media"]
+    custom_cookies = job_options["custom_cookies"]
+
     if url in visited_urls:
         return

@@ -57,8 +49,9 @@ async def make_site_request(
         proxy = random.choice(proxies)
         LOG.info(f"Using proxy: {proxy}")

-    async with AsyncCamoufox(headless=True, proxy=proxy) as browser:
+    async with AsyncCamoufox(headless=not RECORDINGS_ENABLED, proxy=proxy) as browser:
         page: Page = await browser.new_page()
+        await page.set_viewport_size({"width": 1920, "height": 1080})

         # Add cookies and headers
         await add_custom_items(url, page, custom_cookies, headers)

@@ -67,21 +60,21 @@ async def make_site_request(
         try:
             await page.goto(url, timeout=60000)
-            await page.wait_for_load_state("networkidle", timeout=10000)
+            await page.wait_for_load_state("networkidle")

             final_url = page.url
             visited_urls.add(url)
             visited_urls.add(final_url)

-            html_content = await scrape_content(page, pages, collect_media)
+            html_content = await scrape_content(id, page, pages, collect_media)

             html_content = await page.content()
             pages.add((html_content, final_url))

             if site_map:
                 await handle_site_mapping(
-                    site_map, page, pages, collect_media=collect_media
+                    id, site_map, page, pages, collect_media=collect_media
                 )
         finally:

@@ -108,20 +101,18 @@ async def make_site_request(
             if link not in visited_urls and is_same_domain(link, original_url):
                 await make_site_request(
+                    id,
                     link,
-                    headers=headers,
-                    multi_page_scrape=multi_page_scrape,
+                    job_options=job_options,
                     visited_urls=visited_urls,
                     pages=pages,
                     original_url=original_url,
-                    proxies=proxies,
-                    site_map=site_map,
-                    collect_media=collect_media,
-                    custom_cookies=custom_cookies,
                 )


-async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element]):
+async def collect_scraped_elements(
+    page: tuple[str, str], xpaths: list[Element], return_html: bool
+):
     soup = BeautifulSoup(page[0], "lxml")
     root = etree.HTML(str(soup))

@@ -131,12 +122,24 @@ async def collect_scraped_elements(
         el = sxpath(root, elem.xpath)

         for e in el:  # type: ignore
+            if return_html:
+                elements[elem.name] = [
+                    CapturedElement(
+                        xpath=elem.xpath,
+                        text=page[0],
+                        name=elem.name,
+                    )
+                ]
+                continue
+
             text = (
-                "\t".join(str(t) for t in e.itertext())
+                " ".join(str(t) for t in e.itertext())
                 if isinstance(e, etree._Element)
                 else str(e)  # type: ignore
             )
+            text = clean_text(text)
+
             captured_element = CapturedElement(
                 xpath=elem.xpath, text=text, name=elem.name
             )

@@ -150,34 +153,28 @@ async def collect_scraped_elements(
 async def scrape(
+    id: str,
     url: str,
     xpaths: list[Element],
-    headers: Optional[dict[str, Any]] = None,
-    multi_page_scrape: bool = False,
-    proxies: Optional[list[str]] = None,
-    site_map: Optional[dict[str, Any]] = None,
-    collect_media: bool = False,
-    custom_cookies: Optional[list[dict[str, Any]]] = None,
+    job_options: dict[str, Any],
 ):
     visited_urls: set[str] = set()
     pages: set[tuple[str, str]] = set()

     await make_site_request(
+        id,
         url,
-        headers=headers,
-        multi_page_scrape=multi_page_scrape,
+        job_options=job_options,
         visited_urls=visited_urls,
         pages=pages,
        original_url=url,
-        proxies=proxies,
-        site_map=site_map,
-        collect_media=collect_media,
-        custom_cookies=custom_cookies,
     )

     elements: list[dict[str, dict[str, list[CapturedElement]]]] = []

     for page in pages:
-        elements.append(await collect_scraped_elements(page, xpaths))
+        elements.append(
+            await collect_scraped_elements(page, xpaths, job_options["return_html"])
+        )

     return elements
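The refactor collapses eight per-option parameters into a single `job_options` dict whose keys are read back at the top of `make_site_request`. A hedged sketch of a call under the new signature (the id value is hypothetical; running this for real launches a Camoufox browser, so treat it as shape documentation, with values mirroring the JobOptions defaults):

import asyncio

from api.backend.job.models import Element
from api.backend.job.scraping.scraping import scrape

job_options = {
    "custom_headers": {},
    "multi_page_scrape": False,
    "proxies": [],
    "site_map": None,
    "collect_media": False,
    "custom_cookies": [],
    "return_html": False,
}

elements = asyncio.run(
    scrape(
        id="demo-job-id",  # hypothetical id; used for recording/media paths
        url="https://books.toscrape.com",
        xpaths=[Element(name="titles", xpath="//h3")],
        job_options=job_options,
    )
)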

View File

@@ -1,14 +1,21 @@
+# STL
 import asyncio
+import logging
 from typing import Set, Tuple
+from urllib.parse import urlparse

+# PDM
+from lxml import etree
 from playwright.async_api import Page

-from api.backend.utils import LOG
+# LOCAL
 from api.backend.job.scraping.collect_media import collect_media as collect_media_utils

+LOG = logging.getLogger("Job")


 async def scrape_content(
-    page: Page, pages: Set[Tuple[str, str]], collect_media: bool
+    id: str, page: Page, pages: Set[Tuple[str, str]], collect_media: bool
 ) -> str:
     last_height = await page.evaluate("document.body.scrollHeight")

@@ -27,6 +34,25 @@ async def scrape_content(
     if collect_media:
         LOG.info("Collecting media")
-        await collect_media_utils(page)
+        await collect_media_utils(id, page)

     return html
+
+
+def is_same_domain(url: str, original_url: str) -> bool:
+    parsed_url = urlparse(url)
+    parsed_original_url = urlparse(original_url)
+    return parsed_url.netloc == parsed_original_url.netloc or parsed_url.netloc == ""
+
+
+def clean_xpath(xpath: str) -> str:
+    parts = xpath.split("/")
+    clean_parts = ["/" if part == "" else part for part in parts]
+    clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'")
+    LOG.info(f"Cleaned xpath: {clean_xpath}")
+    return clean_xpath
+
+
+def sxpath(context: etree._Element, xpath: str):
+    return context.xpath(xpath)

View File

@@ -1,14 +1,17 @@
-import logging
+# STL
 import asyncio
+import logging
 from copy import deepcopy
 from typing import Any

+# PDM
 from playwright.async_api import Page

+# LOCAL
 from api.backend.job.models.site_map import Action, SiteMap
 from api.backend.job.scraping.scraping_utils import scrape_content

-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("Job")


 def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]:
@@ -24,7 +27,6 @@ def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]:
 async def handle_input(action: Action, page: Page) -> bool:
     try:
         element = page.locator(f"xpath={action.xpath}")
-        await element.wait_for(state="visible", timeout=10000)

         LOG.info(f"Sending keys: {action.input} to element: {action.xpath}")
         await element.fill(action.input)
         return True

@@ -36,7 +38,6 @@ async def handle_click(action: Action, page: Page) -> bool:
     try:
         element = page.locator(f"xpath={action.xpath}")
-        await element.wait_for(state="visible", timeout=10000)

         LOG.info(f"Clicking element: {action.xpath}")
         await element.click()
         return True

@@ -52,6 +53,7 @@ ACTION_MAP = {
 async def handle_site_mapping(
+    id: str,
     site_map_dict: dict[str, Any],
     page: Page,
     pages: set[tuple[str, str]],
@@ -68,11 +70,11 @@ async def handle_site_mapping(
         await asyncio.sleep(2)

-        await scrape_content(page, pages, collect_media=collect_media)
+        await scrape_content(id, page, pages, collect_media=collect_media)

         cleared_site_map_dict = clear_done_actions(site_map_dict)

         if cleared_site_map_dict["actions"]:
             await handle_site_mapping(
-                cleared_site_map_dict, page, pages, collect_media=collect_media
+                id, cleared_site_map_dict, page, pages, collect_media=collect_media
             )

View File

@@ -1,6 +1,8 @@
+# STL
 from typing import Any

-from api.backend.utils import clean_text
+# LOCAL
+from api.backend.job.utils.text_utils import clean_text


 def clean_job_format(jobs: list[dict[str, Any]]) -> dict[str, Any]:

View File

@@ -1,6 +1,8 @@
+# STL
 from typing import Any

-from api.backend.utils import clean_text
+# LOCAL
+from api.backend.job.utils.text_utils import clean_text


 def stream_md_from_job_results(jobs: list[dict[str, Any]]):

View File

@@ -0,0 +1,10 @@
def clean_text(text: str):
    text = text.strip()
    text = text.replace("\n", " ")
    text = text.replace("\t", " ")
    text = text.replace("\r", " ")
    text = text.replace("\f", " ")
    text = text.replace("\v", " ")
    text = text.replace("\b", " ")
    text = text.replace("\a", " ")
    return text
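The helper strips the ends and then replaces each whitespace/control escape with a single space; replacements are not collapsed, so doubled spaces can remain. For example:

from api.backend.job.utils.text_utils import clean_text

# strip() removes the outer whitespace, then "\n" and "\t" each become a space,
# leaving "Price:  £51.77" (note the two spaces).
print(clean_text("  Price:\n\t£51.77\r\n"))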

View File

@@ -0,0 +1,31 @@
# STL
import logging
import traceback
from typing import Any, Union, Callable, Awaitable
from functools import wraps

# PDM
from fastapi.responses import JSONResponse


def handle_exceptions(
    logger: logging.Logger,
) -> Callable[
    [Callable[..., Awaitable[Any]]], Callable[..., Awaitable[Union[Any, JSONResponse]]]
]:
    def decorator(
        func: Callable[..., Awaitable[Any]],
    ) -> Callable[..., Awaitable[Union[Any, JSONResponse]]]:
        @wraps(func)
        async def wrapper(*args: Any, **kwargs: Any) -> Union[Any, JSONResponse]:
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                logger.error(f"Exception occurred: {e}")
                traceback.print_exc()
                return JSONResponse(content={"error": str(e)}, status_code=500)

        return wrapper

    return decorator
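Usage sketch: stacked under a route decorator, any raised exception is logged with a traceback and converted into a 500 JSONResponse instead of propagating out of the route (the route name here is hypothetical):

import logging

from fastapi import APIRouter
from api.backend.routers.handle_exceptions import handle_exceptions

LOG = logging.getLogger("Job")
router = APIRouter()


@router.get("/example")
@handle_exceptions(logger=LOG)
async def example():
    # Client receives {"error": "boom"} with status 500; the traceback is logged.
    raise RuntimeError("boom")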

View File

@@ -1,233 +0,0 @@
# STL
import datetime
import uuid
import traceback
from io import StringIO
import csv
import logging
import random
# PDM
from fastapi import Depends, APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse, StreamingResponse
from api.backend.scheduler import scheduler
from apscheduler.triggers.cron import CronTrigger # type: ignore
# LOCAL
from api.backend.job import insert, update_job, delete_jobs
from api.backend.models import (
DeleteCronJob,
UpdateJobs,
DownloadJob,
DeleteScrapeJobs,
Job,
CronJob,
)
from api.backend.schemas import User
from api.backend.auth.auth_utils import get_current_user
from api.backend.utils import clean_text, format_list_for_query
from api.backend.job.models.job_options import FetchOptions
from api.backend.database.common import query
from api.backend.job.cron_scheduling.cron_scheduling import (
delete_cron_job,
get_cron_job_trigger,
insert_cron_job,
get_cron_jobs,
insert_job_from_cron_job,
)
from api.backend.job.utils.clean_job_format import clean_job_format
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results
LOG = logging.getLogger(__name__)
job_router = APIRouter()
@job_router.post("/update")
async def update(update_jobs: UpdateJobs, _: User = Depends(get_current_user)):
"""Used to update jobs"""
await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)
@job_router.post("/submit-scrape-job")
async def submit_scrape_job(job: Job):
LOG.info(f"Recieved job: {job}")
try:
job.id = uuid.uuid4().hex
job_dict = job.model_dump()
insert(job_dict)
return JSONResponse(content={"id": job.id})
except Exception as e:
LOG.error(f"Exception occurred: {traceback.format_exc()}")
return JSONResponse(content={"error": str(e)}, status_code=500)
@job_router.post("/retrieve-scrape-jobs")
async def retrieve_scrape_jobs(
fetch_options: FetchOptions, user: User = Depends(get_current_user)
):
LOG.info(f"Retrieving jobs for account: {user.email}")
ATTRIBUTES = "chat" if fetch_options.chat else "*"
try:
job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?"
results = query(job_query, (user.email,))
return JSONResponse(content=jsonable_encoder(results[::-1]))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
return JSONResponse(content=[], status_code=500)
@job_router.get("/job/{id}")
async def job(id: str, user: User = Depends(get_current_user)):
LOG.info(f"Retrieving jobs for account: {user.email}")
try:
job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?"
results = query(job_query, (user.email, id))
return JSONResponse(content=jsonable_encoder(results))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
return JSONResponse(content={"error": str(e)}, status_code=500)
@job_router.post("/download")
async def download(download_job: DownloadJob):
LOG.info(f"Downloading job with ids: {download_job.ids}")
try:
job_query = (
f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}"
)
results = query(job_query, tuple(download_job.ids))
if download_job.job_format == "csv":
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
headers = [
"id",
"url",
"element_name",
"xpath",
"text",
"user",
"time_created",
]
csv_writer.writerow(headers)
for result in results:
for res in result["result"]:
for url, elements in res.items():
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", "")).strip()
if text:
csv_writer.writerow(
[
result.get("id", "")
+ "-"
+ str(random.randint(0, 1000000)),
url,
element_name,
value.get("xpath", ""),
text,
result.get("user", ""),
result.get("time_created", ""),
]
)
_ = csv_buffer.seek(0)
response = StreamingResponse(
csv_buffer,
media_type="text/csv",
)
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response
elif download_job.job_format == "md":
response = StreamingResponse(
stream_md_from_job_results(results),
media_type="text/markdown",
)
response.headers["Content-Disposition"] = "attachment; filename=export.md"
return response
except Exception as e:
LOG.error(f"Exception occurred: {e}")
traceback.print_exc()
return {"error": str(e)}
@job_router.get("/job/{id}/convert-to-csv")
async def convert_to_csv(id: str):
try:
job_query = f"SELECT * FROM jobs WHERE id = ?"
results = query(job_query, (id,))
return JSONResponse(content=clean_job_format(results))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
traceback.print_exc()
return {"error": str(e)}
@job_router.post("/delete-scrape-jobs")
async def delete(delete_scrape_jobs: DeleteScrapeJobs):
result = await delete_jobs(delete_scrape_jobs.ids)
return (
JSONResponse(content={"message": "Jobs successfully deleted."})
if result
else JSONResponse({"error": "Jobs not deleted."})
)
@job_router.post("/schedule-cron-job")
async def schedule_cron_job(cron_job: CronJob):
if not cron_job.id:
cron_job.id = uuid.uuid4().hex
if not cron_job.time_created:
cron_job.time_created = datetime.datetime.now()
if not cron_job.time_updated:
cron_job.time_updated = datetime.datetime.now()
insert_cron_job(cron_job)
queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,))
scheduler.add_job(
insert_job_from_cron_job,
get_cron_job_trigger(cron_job.cron_expression),
id=cron_job.id,
args=[queried_job[0]],
)
return JSONResponse(content={"message": "Cron job scheduled successfully."})
@job_router.post("/delete-cron-job")
async def delete_cron_job_request(request: DeleteCronJob):
if not request.id:
return JSONResponse(
content={"error": "Cron job id is required."}, status_code=400
)
delete_cron_job(request.id, request.user_email)
scheduler.remove_job(request.id)
return JSONResponse(content={"message": "Cron job deleted successfully."})
@job_router.get("/cron-jobs")
async def get_cron_jobs_request(user: User = Depends(get_current_user)):
cron_jobs = get_cron_jobs(user.email)
return JSONResponse(content=jsonable_encoder(cron_jobs))

View File

@@ -1,3 +1,4 @@
-from apscheduler.schedulers.background import BackgroundScheduler  # type: ignore
+# PDM
+from apscheduler.schedulers.background import BackgroundScheduler

 scheduler = BackgroundScheduler()

View File

@@ -0,0 +1,17 @@
from typing import Optional, Union
from datetime import datetime

import pydantic


class CronJob(pydantic.BaseModel):
    id: Optional[str] = None
    user_email: str
    job_id: str
    cron_expression: str
    time_created: Optional[Union[datetime, str]] = None
    time_updated: Optional[Union[datetime, str]] = None


class DeleteCronJob(pydantic.BaseModel):
    id: str
    user_email: str
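A sketch of the payload `/schedule-cron-job` accepts, assuming only the model above: `id`, `time_created`, and `time_updated` are optional and are filled in server-side when omitted.

from api.backend.schemas.cron import CronJob

cron_job = CronJob(
    user_email="test@test.com",
    job_id="a1b2c3",           # id of an existing job to re-run
    cron_expression="0 * * * *",  # hourly
)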

View File

@@ -1,24 +1,24 @@
+# STL
 from typing import Any, Literal, Optional, Union
 from datetime import datetime

+# PDM
+import pydantic
+
+# LOCAL
 from api.backend.job.models.job_options import JobOptions
+from api.backend.job.models import Element, CapturedElement

-import pydantic
-
-
-class Element(pydantic.BaseModel):
-    name: str
-    xpath: str
-    url: Optional[str] = None
-
-
-class CapturedElement(pydantic.BaseModel):
-    xpath: str
-    text: str
-    name: str
+class Job(pydantic.BaseModel):
+    id: Optional[str] = None
+    url: str
+    elements: list[Element]
+    user: str = ""
+    time_created: Optional[Union[datetime, str]] = None
+    result: list[dict[str, dict[str, list[CapturedElement]]]] = []
+    job_options: JobOptions
+    status: str = "Queued"
+    chat: Optional[str] = None
+    agent_mode: bool = False
+    prompt: Optional[str] = None
+    favorite: bool = False


 class RetrieveScrapeJobs(pydantic.BaseModel):
@@ -34,41 +34,7 @@ class DeleteScrapeJobs(pydantic.BaseModel):
     ids: list[str]


-class GetStatistics(pydantic.BaseModel):
-    user: str
-
-
 class UpdateJobs(pydantic.BaseModel):
     ids: list[str]
     field: str
     value: Any


-class AI(pydantic.BaseModel):
-    messages: list[Any]
-
-
-class Job(pydantic.BaseModel):
-    id: Optional[str] = None
-    url: str
-    elements: list[Element]
-    user: str = ""
-    time_created: Optional[Union[datetime, str]] = None
-    result: list[dict[str, dict[str, list[CapturedElement]]]] = []
-    job_options: JobOptions
-    status: str = "Queued"
-    chat: Optional[str] = None
-
-
-class CronJob(pydantic.BaseModel):
-    id: Optional[str] = None
-    user_email: str
-    job_id: str
-    cron_expression: str
-    time_created: Optional[Union[datetime, str]] = None
-    time_updated: Optional[Union[datetime, str]] = None
-
-
-class DeleteCronJob(pydantic.BaseModel):
-    id: str
-    user_email: str

View File

@@ -2,28 +2,30 @@
 import logging

 # PDM
-from fastapi import APIRouter, Depends
+from fastapi import Depends, APIRouter

 # LOCAL
-from api.backend.job import (
+from api.backend.auth.schemas import User
+from api.backend.auth.auth_utils import get_current_user
+from api.backend.routers.handle_exceptions import handle_exceptions
+from api.backend.database.queries.statistics.statistic_queries import (
     get_jobs_per_day,
     average_elements_per_link,
 )
-from api.backend.auth.auth_utils import get_current_user
-from api.backend.schemas import User

-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("Statistics")

 stats_router = APIRouter()


 @stats_router.get("/statistics/get-average-element-per-link")
+@handle_exceptions(logger=LOG)
 async def get_average_element_per_link(user: User = Depends(get_current_user)):
     return await average_elements_per_link(user.email)


 @stats_router.get("/statistics/get-average-jobs-per-day")
+@handle_exceptions(logger=LOG)
 async def average_jobs_per_day(user: User = Depends(get_current_user)):
     data = await get_jobs_per_day(user.email)
     return data

View File

@@ -0,0 +1,63 @@
# STL
import os
import sqlite3
from typing import Generator
from unittest.mock import patch

# PDM
import pytest
from proxy import Proxy

# LOCAL
from api.backend.database.schema import INIT_QUERY
from api.backend.tests.constants import TEST_DB_PATH


@pytest.fixture(scope="session", autouse=True)
def running_proxy():
    proxy = Proxy(["--hostname", "127.0.0.1", "--port", "8080"])
    proxy.setup()
    yield proxy
    proxy.shutdown()


@pytest.fixture(scope="session", autouse=True)
def patch_database_path():
    with patch("api.backend.database.common.DATABASE_PATH", TEST_DB_PATH):
        yield


@pytest.fixture(scope="session", autouse=True)
def patch_recordings_enabled():
    with patch("api.backend.job.scraping.scraping.RECORDINGS_ENABLED", False):
        yield


@pytest.fixture(scope="session")
def test_db_path() -> str:
    return TEST_DB_PATH


@pytest.fixture(scope="session", autouse=True)
def test_db(test_db_path: str) -> Generator[str, None, None]:
    """Create a fresh test database for the test session."""
    os.makedirs(os.path.dirname(test_db_path), exist_ok=True)

    if os.path.exists(test_db_path):
        os.remove(test_db_path)

    conn = sqlite3.connect(test_db_path)
    cursor = conn.cursor()

    for query in INIT_QUERY.strip().split(";"):
        query = query.strip()
        if query:
            cursor.execute(query)

    conn.commit()
    conn.close()

    yield test_db_path

    if os.path.exists(test_db_path):
        os.remove(test_db_path)

View File

@@ -0,0 +1 @@
TEST_DB_PATH = "tests/test_db.sqlite"

View File

@@ -1,7 +1,13 @@
-from api.backend.models import Element, Job, JobOptions, CapturedElement
+# STL
 import uuid

+# PDM
 from faker import Faker

+# LOCAL
+from api.backend.job.models import Element, JobOptions, CapturedElement
+from api.backend.schemas.job import Job
+
 fake = Faker()

View File

@@ -1,8 +1,13 @@
+# STL
+from unittest.mock import AsyncMock, patch
+
+# PDM
 import pytest
 from fastapi.testclient import TestClient
-from unittest.mock import AsyncMock, patch

+# LOCAL
 from api.backend.app import app
-from api.backend.models import DownloadJob
+from api.backend.schemas.job import DownloadJob
 from api.backend.tests.factories.job_factory import create_completed_job

 client = TestClient(app)
@@ -13,8 +18,8 @@ mocked_random_int = 123456

 @pytest.mark.asyncio
-@patch("api.backend.routers.job_router.query")
-@patch("api.backend.routers.job_router.random.randint")
+@patch("api.backend.job.job_router.query")
+@patch("api.backend.job.job_router.random.randint")
 async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
     # Ensure the mock returns immediately
     mock_query.return_value = mock_results

View File

@@ -1,12 +1,26 @@
-import pytest
+# STL
 import logging
 from typing import Dict
-from playwright.async_api import async_playwright, Cookie, Route
+from datetime import datetime

+# PDM
+import pytest
+from fastapi.testclient import TestClient
+from playwright.async_api import Route, Cookie, async_playwright

+# LOCAL
+from api.backend.app import app
+from api.backend.job.models import Proxy, Element, JobOptions
+from api.backend.schemas.job import Job
+from api.backend.database.common import query
+from api.backend.job.scraping.scraping import scrape
 from api.backend.job.scraping.add_custom import add_custom_items

 logging.basicConfig(level=logging.DEBUG)
 LOG = logging.getLogger(__name__)

+client = TestClient(app)


 @pytest.mark.asyncio
 async def test_add_custom_items():
@@ -51,3 +65,46 @@ async def test_add_custom_items():
     assert captured_headers.get("user-agent") == "test-agent"

     await browser.close()
+
+
+@pytest.mark.asyncio
+async def test_proxies():
+    job = Job(
+        url="https://example.com",
+        elements=[Element(xpath="//div", name="test")],
+        job_options=JobOptions(
+            proxies=[
+                Proxy(
+                    server="127.0.0.1:8080",
+                    username="user",
+                    password="pass",
+                )
+            ],
+        ),
+        time_created=datetime.now().isoformat(),
+    )
+
+    response = client.post("/submit-scrape-job", json=job.model_dump())
+    assert response.status_code == 200
+
+    jobs = query("SELECT * FROM jobs")
+    job = jobs[0]
+
+    assert job is not None
+    assert job["job_options"]["proxies"] == [
+        {
+            "server": "127.0.0.1:8080",
+            "username": "user",
+            "password": "pass",
+        }
+    ]
+
+    response = await scrape(
+        id=job["id"],
+        url=job["url"],
+        xpaths=[Element(**e) for e in job["elements"]],
+        job_options=job["job_options"],
+    )
+
+    example_response = response[0]["https://example.com/"]
+    assert example_response is not {}

View File

@@ -0,0 +1,17 @@
# STL
import sqlite3

# LOCAL
from api.backend.database.schema import INIT_QUERY
from api.backend.tests.constants import TEST_DB_PATH


def connect_to_db():
    conn = sqlite3.connect(TEST_DB_PATH)
    cur = conn.cursor()

    for query in INIT_QUERY.split(";"):
        cur.execute(query)

    conn.commit()
    return conn, cur

View File

@@ -1,17 +1,10 @@
-from typing import Any, Optional
+# STL
 import logging
-import json
+from typing import Optional

 LOG = logging.getLogger(__name__)


-def clean_text(text: str):
-    text = text.replace("\r\n", "\n")  # Normalize newlines
-    text = text.replace("\n", "\\n")  # Escape newlines
-    text = text.replace('"', '\\"')  # Escape double quotes
-    return text
-
-
 def get_log_level(level_name: Optional[str]) -> int:
     level = logging.INFO

@@ -20,30 +13,3 @@ def get_log_level(level_name: Optional[str]) -> int:
         level = getattr(logging, level_name, logging.INFO)

     return level
-
-
-def format_list_for_query(ids: list[str]):
-    return (
-        f"({','.join(['?' for _ in ids])})"  # Returns placeholders, e.g., "(?, ?, ?)"
-    )
-
-
-def format_sql_row_to_python(row: dict[str, Any]):
-    new_row: dict[str, Any] = {}
-
-    for key, value in row.items():
-        if isinstance(value, str):
-            try:
-                new_row[key] = json.loads(value)
-            except json.JSONDecodeError:
-                new_row[key] = value
-        else:
-            new_row[key] = value
-
-    return new_row
-
-
-def format_json(items: list[Any]):
-    for idx, item in enumerate(items):
-        if isinstance(item, (dict, list)):
-            formatted_item = json.dumps(item)
-            items[idx] = formatted_item

View File

@@ -0,0 +1,17 @@
# STL
import os
from pathlib import Path
NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "")
NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "")
SCRAPERR_FRONTEND_URL = os.getenv("SCRAPERR_FRONTEND_URL", "")
EMAIL = os.getenv("EMAIL", "")
TO = os.getenv("TO", "")
SMTP_HOST = os.getenv("SMTP_HOST", "")
SMTP_PORT = int(os.getenv("SMTP_PORT", 587))
SMTP_USER = os.getenv("SMTP_USER", "")
SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
USE_TLS = os.getenv("USE_TLS", "false").lower() == "true"
RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
RECORDINGS_DIR = Path("/project/app/media/recordings")

View File

@@ -1,39 +1,68 @@
-import os
+# STL
 import json
-
-from api.backend.job import get_queued_job, update_job
-from api.backend.scraping import scrape
-from api.backend.models import Element
-from fastapi.encoders import jsonable_encoder
 import asyncio
 import traceback
+import subprocess

-from api.backend.database.startup import init_database
+# PDM
+from fastapi.encoders import jsonable_encoder

-from api.backend.worker.post_job_complete.post_job_complete import post_job_complete
+# LOCAL
+from api.backend.job import update_job, get_queued_job
+from api.backend.job.models import Element
 from api.backend.worker.logger import LOG
+from api.backend.ai.agent.agent import scrape_with_agent
+from api.backend.database.startup import init_database
-
-NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "")
-NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "")
-SCRAPERR_FRONTEND_URL = os.getenv("SCRAPERR_FRONTEND_URL", "")
-EMAIL = os.getenv("EMAIL", "")
-TO = os.getenv("TO", "")
-SMTP_HOST = os.getenv("SMTP_HOST", "")
-SMTP_PORT = int(os.getenv("SMTP_PORT", 587))
-SMTP_USER = os.getenv("SMTP_USER", "")
-SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
-USE_TLS = os.getenv("USE_TLS", "false").lower() == "true"
+from api.backend.worker.constants import (
+    TO,
+    EMAIL,
+    USE_TLS,
+    SMTP_HOST,
+    SMTP_PORT,
+    SMTP_USER,
+    SMTP_PASSWORD,
+    RECORDINGS_DIR,
+    RECORDINGS_ENABLED,
+    NOTIFICATION_CHANNEL,
+    SCRAPERR_FRONTEND_URL,
+    NOTIFICATION_WEBHOOK_URL,
+)
+from api.backend.job.scraping.scraping import scrape
+from api.backend.worker.post_job_complete.post_job_complete import post_job_complete


 async def process_job():
     job = await get_queued_job()
+    ffmpeg_proc = None
     status = "Queued"

     if job:
         LOG.info(f"Beginning processing job: {job}.")

         try:
+            output_path = RECORDINGS_DIR / f"{job['id']}.mp4"
+
+            if RECORDINGS_ENABLED:
+                ffmpeg_proc = subprocess.Popen(
+                    [
+                        "ffmpeg",
+                        "-y",
+                        "-video_size",
+                        "1280x1024",
+                        "-framerate",
+                        "15",
+                        "-f",
+                        "x11grab",
+                        "-i",
+                        ":99",
+                        "-codec:v",
+                        "libx264",
+                        "-preset",
+                        "ultrafast",
+                        output_path,
+                    ]
+                )
+
             _ = await update_job([job["id"]], field="status", value="Scraping")

             proxies = job["job_options"]["proxies"]
@@ -45,16 +74,16 @@ async def process_job():
                 LOG.error(f"Failed to parse proxy JSON: {proxies}")
                 proxies = []

-            scraped = await scrape(
-                job["url"],
-                [Element(**j) for j in job["elements"]],
-                job["job_options"]["custom_headers"],
-                job["job_options"]["multi_page_scrape"],
-                proxies,
-                job["job_options"]["site_map"],
-                job["job_options"]["collect_media"],
-                job["job_options"]["custom_cookies"],
-            )
+            if job["agent_mode"]:
+                scraped = await scrape_with_agent(job)
+            else:
+                scraped = await scrape(
+                    job["id"],
+                    job["url"],
+                    [Element(**j) for j in job["elements"]],
+                    {**job["job_options"], "proxies": proxies},
+                )
+
             LOG.info(
                 f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
             )
@@ -87,12 +116,18 @@ async def process_job():
                 },
             )

+        if ffmpeg_proc:
+            ffmpeg_proc.terminate()
+            ffmpeg_proc.wait()
+

 async def main():
     LOG.info("Starting job worker...")
     init_database()
+    RECORDINGS_DIR.mkdir(parents=True, exist_ok=True)

     while True:
         await process_job()
         await asyncio.sleep(5)

View File

@@ -1,12 +1,13 @@
+# STL
 import logging
-import os

-from api.backend.utils import get_log_level
+# LOCAL
+from api.backend.app import LOG_LEVEL

 logging.basicConfig(
-    level=get_log_level(os.getenv("LOG_LEVEL")),
-    format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
+    level=LOG_LEVEL,
+    format="%(levelname)s: %(asctime)s - [%(name)s] - %(message)s",
     handlers=[logging.StreamHandler()],
 )

-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("Job Worker")

View File

@@ -1,5 +1,7 @@
+# STL
 from typing import Any

+# LOCAL
 from api.backend.worker.post_job_complete.models import PostJobCompleteOptions
 from api.backend.worker.post_job_complete.email_notifcation import (
     send_job_complete_email,
@@ -16,9 +18,10 @@ async def post_job_complete(job: dict[str, Any], options: PostJobCompleteOptions
     if not options.values():
         return

-    if options["channel"] == "discord":
-        discord_notification(job, options)
-    elif options["channel"] == "email":
-        send_job_complete_email(job, options)
-    else:
-        raise ValueError(f"Invalid channel: {options['channel']}")
+    match options["channel"]:
+        case "discord":
+            discord_notification(job, options)
+        case "email":
+            send_job_complete_email(job, options)
+        case _:
+            raise ValueError(f"Invalid channel: {options['channel']}")

View File

@@ -0,0 +1,23 @@
describe("Global setup", () => {
it("signs up user once", () => {
cy.request({
method: "POST",
url: "/api/signup",
body: JSON.stringify({
data: {
email: "test@test.com",
password: "password",
full_name: "John Doe",
},
}),
headers: {
"Content-Type": "application/json",
},
failOnStatusCode: false,
}).then((response) => {
if (response.status !== 200 && response.status !== 201) {
console.warn("Signup failed:", response.status, response.body);
}
});
});
});

View File

@@ -0,0 +1,101 @@
import { login } from "../utilities/authentication.utils";
import {
addCustomHeaders,
addElement,
addMedia,
addSiteMapAction,
checkForMedia,
cleanUpJobs,
enterJobUrl,
openAdvancedJobOptions,
submitBasicJob,
submitJob,
waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockSubmitJob } from "../utilities/mocks";
describe.only("Advanced Job Options", () => {
beforeEach(() => {
mockSubmitJob();
login();
cy.visit("/");
});
afterEach(() => {
cleanUpJobs();
});
it.only("should handle custom headers", () => {
const customHeaders = {
"User-Agent": "Test Agent",
"Accept-Language": "en-US",
};
addCustomHeaders(customHeaders);
submitBasicJob("https://httpbin.org/headers", "headers", "//pre");
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
expect(
interception.request?.body.data.job_options.custom_headers
).to.deep.equal(customHeaders);
});
waitForJobCompletion("https://httpbin.org/headers");
});
it("should handle site map actions", () => {
addSiteMapAction("click", "//button[contains(text(), 'Load More')]");
addSiteMapAction("input", "//input[@type='search']", "test search");
submitBasicJob("https://example.com", "content", "//div[@class='content']");
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
const siteMap = interception.request?.body.data.job_options.site_map;
expect(siteMap.actions).to.have.length(2);
expect(siteMap.actions[0].type).to.equal("click");
expect(siteMap.actions[1].type).to.equal("input");
});
waitForJobCompletion("https://example.com");
});
it("should handle multiple elements", () => {
enterJobUrl("https://books.toscrape.com");
addElement("titles", "//h3");
addElement("prices", "//p[@class='price_color']");
submitJob();
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
expect(interception.request?.body.data.elements).to.have.length(2);
});
waitForJobCompletion("https://books.toscrape.com");
});
it.only("should handle collecting media", () => {
enterJobUrl("https://books.toscrape.com");
openAdvancedJobOptions();
addMedia();
cy.get("body").type("{esc}");
addElement("images", "//img");
submitJob();
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
expect(interception.request?.body.data.job_options.collect_media).to.be
.true;
});
waitForJobCompletion("https://books.toscrape.com");
checkForMedia();
});
});

cypress/e2e/agent.cy.ts Normal file
View File

@@ -0,0 +1,38 @@
import { login } from "../utilities/authentication.utils";
import {
buildAgentJob,
cleanUpJobs,
submitJob,
waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockSubmitJob } from "../utilities/mocks";
describe("Agent", () => {
beforeEach(() => {
mockSubmitJob();
login();
});
afterEach(() => {
cleanUpJobs();
});
it("should be able to scrape some data", () => {
cy.visit("/agent");
cy.wait(1000);
const url = "https://books.toscrape.com";
const prompt = "Collect all the links on the page";
buildAgentJob(url, prompt);
submitJob();
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
expect(interception.request?.body.data.url).to.eq(url);
expect(interception.request?.body.data.prompt).to.eq(prompt);
});
waitForJobCompletion("https://books.toscrape.com");
});
});

View File

@@ -1,60 +1,61 @@
describe("Authentication", () => { import { faker } from "@faker-js/faker";
it("should register", () => { import { mockLogin, mockSignup } from "../utilities/mocks";
cy.intercept("POST", "/api/signup").as("signup");
cy.visit("/").then(() => { const mockEmail = faker.internet.email();
cy.get("button").contains("Login").click(); const mockPassword = faker.internet.password();
cy.url().should("include", "/login");
cy.get("form").should("be.visible"); describe.only("Authentication", () => {
cy.get("button") beforeEach(() => {
.contains("No Account? Sign up") cy.visit("/");
.should("be.visible") mockSignup();
.click(); mockLogin();
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("input[name='fullName']").type("John Doe");
cy.get("button[type='submit']").contains("Signup").click();
cy.wait("@signup").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("signup request did not return a response");
}
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
expect(interception.response.statusCode).to.eq(200);
});
});
}); });
it("should login", () => { it("should register", () => {
cy.intercept("POST", "/api/token").as("token"); cy.get("button").contains("Login").click();
cy.url().should("include", "/login");
cy.visit("/").then(() => { cy.get("form").should("be.visible");
cy.get("button")
.contains("Login")
.click()
.then(() => {
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("button[type='submit']").contains("Login").click();
cy.wait("@token").then((interception) => { cy.get("button")
if (!interception.response) { .contains("No Account? Sign up")
cy.log("No response received!"); .should("be.visible")
throw new Error("token request did not return a response"); .click();
}
cy.log("Response status: " + interception.response.statusCode); cy.get("input[name='email']").type(mockEmail);
cy.log("Response body: " + JSON.stringify(interception.response.body)); cy.get("input[name='password']").type(mockPassword);
cy.get("input[name='fullName']").type(faker.person.fullName());
cy.get("button[type='submit']").contains("Signup").click();
expect(interception.response.statusCode).to.eq(200); cy.wait("@signup").then((interception) => {
}); if (!interception.response) {
}); throw new Error("signup request did not return a response");
}
expect(interception.response.statusCode).to.eq(200);
}); });
}); });
}); });
it("should login", () => {
cy.intercept("POST", "/api/token").as("token");
cy.visit("/").then(() => {
cy.get("button")
.contains("Login")
.click()
.then(() => {
cy.get("input[name='email']").type(mockEmail);
cy.get("input[name='password']").type(mockPassword);
cy.get("button[type='submit']").contains("Login").click();
cy.wait("@token").then((interception) => {
if (!interception.response) {
throw new Error("token request did not return a response");
}
expect(interception.response.statusCode).to.eq(200);
});
});
});
});

cypress/e2e/chat.cy.ts Normal file
View File

@@ -0,0 +1,34 @@
import { login } from "../utilities/authentication.utils";
import {
cleanUpJobs,
selectJobFromSelector,
submitBasicJob,
waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockLogin } from "../utilities/mocks";
describe.only("Chat", () => {
beforeEach(() => {
mockLogin();
login();
cy.visit("/");
});
afterEach(() => {
cleanUpJobs();
});
it.only("should be able to chat", () => {
const url = "https://books.toscrape.com";
submitBasicJob(url, "test", "//body");
waitForJobCompletion(url);
cy.visit("/chat");
selectJobFromSelector();
cy.get("[data-cy='message-input']").type("Hello");
cy.get("[data-cy='send-message']").click();
cy.get("[data-cy='ai-message']").should("exist");
});
});

View File

@@ -1,34 +1,37 @@
+import { login } from "../utilities/authentication.utils";
+import {
+  addElement,
+  cleanUpJobs,
+  enterJobUrl,
+  submitJob,
+  waitForJobCompletion,
+} from "../utilities/job.utilities";
+import { mockSubmitJob } from "../utilities/mocks";
+
 describe.only("Job", () => {
-  it("should create a job", () => {
-    cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
-
+  beforeEach(() => {
+    mockSubmitJob();
+    login();
     cy.visit("/");
+  });

-    cy.get('[data-cy="url-input"]').type("https://example.com");
-    cy.get('[data-cy="name-field"]').type("example");
-    cy.get('[data-cy="xpath-field"]').type("//body");
-    cy.get('[data-cy="add-button"]').click();
-    cy.contains("Submit").click();
+  afterEach(() => {
+    cleanUpJobs();
+  });
+
+  it("should create a job", () => {
+    enterJobUrl("https://books.toscrape.com");
+    addElement("body", "//body");
+    submitJob();

     cy.wait("@submitScrapeJob").then((interception) => {
       if (!interception.response) {
-        cy.log("No response received!");
-        cy.log("Request body: " + JSON.stringify(interception.request?.body));
         throw new Error("submitScrapeJob request did not return a response");
       }
-      cy.log("Response status: " + interception.response.statusCode);
-      cy.log("Response body: " + JSON.stringify(interception.response.body));
       expect(interception.response.statusCode).to.eq(200);
     });

-    cy.get("li").contains("Jobs").click();
-    cy.contains("div", "https://example.com", { timeout: 10000 }).should(
-      "exist"
-    );
-    cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
+    waitForJobCompletion("https://books.toscrape.com");
   });
 });

View File

@@ -14,7 +14,7 @@
// *********************************************************** // ***********************************************************
// Import commands.js using ES2015 syntax: // Import commands.js using ES2015 syntax:
-import './commands'
+import "./commands";
// Alternatively you can use CommonJS syntax: // Alternatively you can use CommonJS syntax:
// require('./commands') // require('./commands')

View File

@@ -0,0 +1,68 @@
export const signup = () => {
  cy.intercept("POST", "/api/token").as("token");

  cy.visit("/").then(() => {
    cy.get("button").contains("Login").click();
    cy.url().should("include", "/login");
    cy.get("form").should("be.visible");

    cy.get("button")
      .contains("No Account? Sign up")
      .should("be.visible")
      .click();

    cy.get("input[name='email']").type("test@test.com");
    cy.get("input[name='password']").type("password");
    cy.get("input[name='fullName']").type("John Doe");
    cy.get("button[type='submit']").contains("Signup").click();

    cy.wait("@token").then((interception) => {
      if (!interception.response) {
        cy.log("No response received!");
        throw new Error("token request did not return a response");
      }
    });
  });
};

export const login = () => {
  cy.intercept("POST", "/api/token").as("token");
  cy.intercept("GET", "/api/me").as("me");
  cy.intercept("GET", "/api/check").as("check");

  cy.visit("/").then(() => {
    cy.get("body").then(() => {
      cy.get("button")
        .contains("Login")
        .click()
        .then(() => {
          cy.get("input[name='email']").type("test@test.com");
          cy.get("input[name='password']").type("password");
          cy.get("button[type='submit']").contains("Login").click();

          cy.wait("@token").then((interception) => {
            if (!interception.response) {
              cy.log("No response received!");
              throw new Error("token request did not return a response");
            }
          });

          cy.wait("@me").then((interception) => {
            if (!interception.response) {
              cy.log("No response received!");
              throw new Error("me request did not return a response");
            }
          });

          cy.wait("@check").then((interception) => {
            if (!interception.response) {
              cy.log("No response received!");
              throw new Error("check request did not return a response");
            }
          });

          cy.url().should("not.include", "/login");
        });
    });
  });
};
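Both helpers register their intercepts (@token, plus @me and @check for login) before driving the UI, so a spec can call them with no setup of its own. A minimal sketch of a spec built on top of them; the spec body is illustrative, not part of this diff:

import { login } from "../utilities/authentication.utils";

describe("Example", () => {
  it("logs in an existing user", () => {
    login(); // registers @token, @me and @check itself
    cy.url().should("not.include", "/login");
  });
});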


@@ -0,0 +1,187 @@
export const cleanUpJobs = () => {
  cy.intercept("POST", "/api/retrieve").as("retrieve");

  cy.visit("/jobs");
  cy.wait("@retrieve", { timeout: 15000 });

  cy.get("tbody tr", { timeout: 20000 }).should("have.length.at.least", 1);

  const tryClickSelectAll = (attempt = 1, maxAttempts = 5) => {
    cy.log(`Attempt ${attempt} to click Select All`);

    cy.get('[data-testid="select-all"]')
      .closest("button")
      .then(($btn) => {
        // Retry if button is disabled
        if ($btn.is(":disabled") || $btn.css("pointer-events") === "none") {
          if (attempt < maxAttempts) {
            cy.wait(1000).then(() =>
              tryClickSelectAll(attempt + 1, maxAttempts)
            );
          } else {
            throw new Error(
              "Select All button is still disabled after max retries"
            );
          }
        } else {
          // Click and then verify if checkbox is checked
          cy.wrap($btn)
            .click({ force: true })
            .then(() => {
              cy.get("tbody tr")
                .first()
                .find("td")
                .first()
                .find("input[type='checkbox']")
                .should("be.checked")
                .then(() => {
                  cy.log("Select All successful");
                });
            });

          // Handle failure case
          cy.on("fail", () => {
            cy.log("Error clicking Select All");
            if (attempt < maxAttempts) {
              cy.wait(1000).then(() =>
                tryClickSelectAll(attempt + 1, maxAttempts)
              );
            } else {
              throw new Error(
                "Checkbox was never checked after clicking Select All"
              );
            }
            return false; // Prevent Cypress from failing the test
          });
        }
      });
  };

  tryClickSelectAll();

  cy.get('[data-testid="DeleteIcon"]', { timeout: 10000 })
    .closest("button")
    .should("not.be.disabled")
    .click();
};

export const submitBasicJob = (url: string, name: string, xpath: string) => {
  cy.get('[data-cy="url-input"]').type(url);
  cy.get('[data-cy="name-field"]').type(name);
  cy.get('[data-cy="xpath-field"]').type(xpath);
  cy.get('[data-cy="add-button"]').click();
  cy.contains("Submit").click();
};

export const waitForJobCompletion = (url: string) => {
  cy.intercept("POST", "/api/retrieve").as("retrieve");
  cy.visit("/jobs");
  cy.wait("@retrieve", { timeout: 30000 });
  cy.contains("div", url, { timeout: 30000 }).should("exist");

  const checkJobStatus = () => {
    cy.get("[data-testid='job-status']", { timeout: 120000 }).then(($el) => {
      const status = $el.text().toLowerCase().trim();
      if (status.includes("completed")) {
        return true;
      } else if (status.includes("scraping") || status.includes("queued")) {
        cy.wait(5000);
        checkJobStatus();
      } else {
        throw new Error(`Unexpected job status: ${status}`);
      }
    });
  };

  checkJobStatus();
};

export const enableMultiPageScraping = () => {
  cy.get("button").contains("Advanced Options").click();
  cy.get('[data-cy="multi-page-toggle"]').click();
  cy.get("body").type("{esc}");
};

export const addCustomHeaders = (headers: Record<string, string>) => {
  cy.get("button").contains("Advanced Options").click();
  cy.get('[name="custom_headers"]').type(JSON.stringify(headers), {
    parseSpecialCharSequences: false,
  });
  cy.get("body").type("{esc}");
};

export const addCustomCookies = (cookies: Record<string, string>) => {
  cy.get("button").contains("Advanced Options").click();
  cy.get('[name="custom_cookies"]').type(JSON.stringify(cookies));
  cy.get("body").type("{esc}");
};

export const openAdvancedJobOptions = () => {
  cy.get("button").contains("Advanced Options").click();
};

export const selectJobFromSelector = () => {
  checkAiDisabled();
  cy.get("div[id='select-job']", { timeout: 10000 }).first().click();
  cy.get("li[role='option']", { timeout: 10000 }).first().click();
};

export const addMedia = () => {
  cy.get('[data-cy="collect-media-checkbox"]').click();
};

export const checkForMedia = () => {
  cy.intercept("GET", "/api/media/get-media?id=**").as("getMedia");
  cy.visit("/media");
  selectJobFromSelector();
  cy.wait("@getMedia", { timeout: 30000 });
};

export const addSiteMapAction = (
  type: "click" | "input",
  xpath: string,
  input?: string
) => {
  cy.get("button").contains("Create Site Map").click();
  cy.get('[data-cy="site-map-select"]').select(type);
  cy.get('[data-cy="site-map-xpath"]').type(xpath);
  if (type === "input" && input) {
    cy.get('[data-cy="site-map-input"]').type(input);
  }
  cy.get('[data-cy="add-site-map-action"]').click();
};

export const addElement = (name: string, xpath: string) => {
  cy.get('[data-cy="name-field"]').type(name);
  cy.get('[data-cy="xpath-field"]').type(xpath);
  cy.get('[data-cy="add-button"]').click();
};

export const checkAiDisabled = () => {
  cy.getAllLocalStorage().then((result) => {
    const storage = JSON.parse(
      result["http://localhost"]["persist:root"] as string
    );
    const settings = JSON.parse(storage.settings);
    expect(settings.aiEnabled).to.equal(true);
  });
};

export const buildAgentJob = (url: string, prompt: string) => {
  checkAiDisabled();
  enterJobUrl(url);
  cy.get("[data-cy='prompt-input']").type(prompt);
};

export const submitJob = () => {
  cy.get("button").contains("Submit").click();
};

export const enterJobUrl = (url: string) => {
  cy.get('[data-cy="url-input"]').type(url);
};
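One naming caveat: despite its name, checkAiDisabled asserts that settings.aiEnabled is true in the persisted Redux state, so selectJobFromSelector and buildAgentJob both require AI to be enabled. The helpers are otherwise designed to chain into a full job lifecycle: enter a URL, add an element, submit, poll until completion, clean up. A sketch of how a spec might compose them (the URL and XPath are examples only, not part of this diff):

import { login } from "../utilities/authentication.utils";
import {
  addElement,
  cleanUpJobs,
  enterJobUrl,
  submitJob,
  waitForJobCompletion,
} from "../utilities/job.utilities";

describe("Job lifecycle", () => {
  beforeEach(() => {
    login();
    cy.visit("/");
  });

  afterEach(() => cleanUpJobs());

  it("scrapes a single element", () => {
    enterJobUrl("https://books.toscrape.com"); // illustrative URL
    addElement("title", "//h1"); // illustrative XPath
    submitJob();
    waitForJobCompletion("https://books.toscrape.com");
  });
});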


@@ -0,0 +1,15 @@
export const mockSubmitJob = () => {
  cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
};

export const mockToken = () => {
  cy.intercept("POST", "/api/token").as("token");
};

export const mockSignup = () => {
  cy.intercept("POST", "/api/signup").as("signup");
};

export const mockLogin = () => {
  cy.intercept("POST", "/api/token").as("token");
};
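None of these "mocks" stubs a response; each one only registers an intercept alias so specs can cy.wait on real backend traffic (mockLogin and mockToken are in fact identical). If a canned response were ever wanted, the same call could take a stub body, e.g. this hypothetical variant, where the token shape is assumed:

export const mockLoginStubbed = () => {
  // hypothetical stub; the response body shape is an assumption
  cy.intercept("POST", "/api/token", {
    statusCode: 200,
    body: { access_token: "fake-token", token_type: "bearer" },
  }).as("token");
};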


@@ -0,0 +1 @@
export * from "./authentication.utils";


@@ -1,6 +1,9 @@
version: "3" version: "3"
services: services:
scraperr: scraperr:
build:
context: .
dockerfile: docker/frontend/Dockerfile
command: ["npm", "run", "dev"] command: ["npm", "run", "dev"]
volumes: volumes:
- "$PWD/src:/app/src" - "$PWD/src:/app/src"
@@ -10,7 +13,12 @@ services:
- "$PWD/package-lock.json:/app/package-lock.json" - "$PWD/package-lock.json:/app/package-lock.json"
- "$PWD/tsconfig.json:/app/tsconfig.json" - "$PWD/tsconfig.json:/app/tsconfig.json"
scraperr_api: scraperr_api:
build:
context: .
dockerfile: docker/api/Dockerfile
environment: environment:
- LOG_LEVEL=INFO - LOG_LEVEL=INFO
volumes: volumes:
- "$PWD/api:/project/app/api" - "$PWD/api:/project/app/api"
ports:
- "5900:5900"


@@ -1,11 +1,6 @@
 services:
   scraperr:
-    depends_on:
-      - scraperr_api
-    image: jpyles0524/scraperr:1.0.13
-    build:
-      context: .
-      dockerfile: docker/frontend/Dockerfile
+    image: jpyles0524/scraperr:latest
     container_name: scraperr
     command: ["npm", "run", "start"]
     environment:
@@ -18,11 +13,9 @@ services:
   scraperr_api:
     init: True
     image: jpyles0524/scraperr_api:latest
-    build:
-      context: .
-      dockerfile: docker/api/Dockerfile
     environment:
       - LOG_LEVEL=INFO
+      - OPENAI_KEY=${OPENAI_KEY}
     container_name: scraperr_api
     ports:
       - 8000:8000


@@ -3,7 +3,7 @@ FROM python:3.10.12-slim as pybuilder
 RUN apt-get update && \
     apt-get install -y curl && \
-    apt-get install -y uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 && \
+    apt-get install -y x11vnc xvfb uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 ffmpeg && \
     curl -LsSf https://astral.sh/uv/install.sh | sh && \
     apt-get remove -y curl && \
     apt-get autoremove -y && \
@@ -14,7 +14,8 @@ RUN pdm config python.use_venv false
 WORKDIR /project/app
 COPY pyproject.toml pdm.lock /project/app/
-RUN pdm install
+
+RUN pdm install -v --frozen-lockfile
 RUN pdm run playwright install --with-deps
@@ -30,7 +31,12 @@ EXPOSE 8000
 WORKDIR /project/app
+RUN mkdir -p /project/app/media
 RUN mkdir -p /project/app/data
 RUN touch /project/app/data/database.db
+
+EXPOSE 5900
+
+COPY start.sh /project/app/start.sh
 CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ]


@@ -1,10 +1,14 @@
 # Build next dependencies
-FROM node:23.1
+FROM node:23.1-slim
 WORKDIR /app
 
-COPY package*.json ./
-RUN npm install
+# Copy package files first to leverage Docker cache
+COPY package.json yarn.lock ./
+
+# Install dependencies in a separate layer
+RUN yarn install --frozen-lockfile
+
+# Copy the rest of the application
 COPY tsconfig.json /app/tsconfig.json
 COPY tailwind.config.js /app/tailwind.config.js
 COPY next.config.mjs /app/next.config.mjs
@@ -13,6 +17,7 @@ COPY postcss.config.js /app/postcss.config.js
 COPY public /app/public
 COPY src /app/src
 
-RUN npm run build
+# Build the application
+RUN yarn build
 
 EXPOSE 3000
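Splitting the package-file COPY from the source COPY means the yarn install layer only rebuilds when package.json or yarn.lock changes, and --frozen-lockfile fails the build outright if the lockfile is stale instead of silently regenerating it; the scripts in package.json below move to yarn to match.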

(binary image updated; before: 47 KiB, after: 67 KiB)


@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.0.14
+version: 1.1.2
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to

package-lock.json (generated, 11371 lines; diff suppressed because it is too large)


@@ -12,9 +12,11 @@
"@minchat/react-chat-ui": "^0.16.2", "@minchat/react-chat-ui": "^0.16.2",
"@mui/icons-material": "^5.15.3", "@mui/icons-material": "^5.15.3",
"@mui/material": "^5.16.0", "@mui/material": "^5.16.0",
"@reduxjs/toolkit": "^2.8.2",
"@testing-library/jest-dom": "^5.16.5", "@testing-library/jest-dom": "^5.16.5",
"@testing-library/react": "^13.4.0", "@testing-library/react": "^13.4.0",
"@testing-library/user-event": "^13.5.0", "@testing-library/user-event": "^13.5.0",
"@types/react": "^18.3.21",
"axios": "^1.7.2", "axios": "^1.7.2",
"bootstrap": "^5.3.0", "bootstrap": "^5.3.0",
"chart.js": "^4.4.3", "chart.js": "^4.4.3",
@@ -30,16 +32,19 @@
"react-dom": "^18.3.1", "react-dom": "^18.3.1",
"react-markdown": "^9.0.0", "react-markdown": "^9.0.0",
"react-modal-image": "^2.6.0", "react-modal-image": "^2.6.0",
"react-redux": "^9.2.0",
"react-router": "^6.14.1", "react-router": "^6.14.1",
"react-router-dom": "^6.14.1", "react-router-dom": "^6.14.1",
"react-spinners": "^0.14.1", "react-spinners": "^0.14.1",
"react-toastify": "^11.0.5",
"redux-persist": "^6.0.0",
"typescript": "^4.9.5", "typescript": "^4.9.5",
"web-vitals": "^2.1.4" "web-vitals": "^2.1.4"
}, },
"scripts": { "scripts": {
"dev": "next dev", "dev": "yarn next dev",
"build": "next build", "build": "yarn next build",
"start": "next start", "start": "yarn next start",
"serve": "serve -s ./dist", "serve": "serve -s ./dist",
"cy:open": "cypress open", "cy:open": "cypress open",
"cy:run": "cypress run" "cy:run": "cypress run"
@@ -63,6 +68,7 @@
] ]
}, },
"devDependencies": { "devDependencies": {
"@faker-js/faker": "^9.8.0",
"@types/cypress": "^1.1.6", "@types/cypress": "^1.1.6",
"@types/js-cookie": "^3.0.6", "@types/js-cookie": "^3.0.6",
"autoprefixer": "^10.4.21", "autoprefixer": "^10.4.21",

pdm.lock (generated, 24 lines changed)

@@ -5,7 +5,7 @@
groups = ["default", "dev"] groups = ["default", "dev"]
strategy = ["inherit_metadata"] strategy = ["inherit_metadata"]
lock_version = "4.5.0" lock_version = "4.5.0"
content_hash = "sha256:cb37fedd6d022515dde14e475588a8da2144ba22e41dfdfacfe3f7a7d14486ca" content_hash = "sha256:1a65c1e288d2c6827fc6866d3bfe6a9b8707b2ca895d488f4a9b11cd579c4359"
[[metadata.targets]] [[metadata.targets]]
requires_python = ">=3.10" requires_python = ">=3.10"
@@ -1174,6 +1174,17 @@ files = [
{file = "hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca"}, {file = "hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca"},
] ]
[[package]]
name = "html2text"
version = "2025.4.15"
requires_python = ">=3.9"
summary = "Turn HTML into equivalent Markdown-structured text."
groups = ["default"]
files = [
{file = "html2text-2025.4.15-py3-none-any.whl", hash = "sha256:00569167ffdab3d7767a4cdf589b7f57e777a5ed28d12907d8c58769ec734acc"},
{file = "html2text-2025.4.15.tar.gz", hash = "sha256:948a645f8f0bc3abe7fd587019a2197a12436cd73d0d4908af95bfc8da337588"},
]
[[package]] [[package]]
name = "httpcore" name = "httpcore"
version = "1.0.9" version = "1.0.9"
@@ -2303,6 +2314,17 @@ files = [
{file = "propcache-0.3.1.tar.gz", hash = "sha256:40d980c33765359098837527e18eddefc9a24cea5b45e078a7f3bb5b032c6ecf"}, {file = "propcache-0.3.1.tar.gz", hash = "sha256:40d980c33765359098837527e18eddefc9a24cea5b45e078a7f3bb5b032c6ecf"},
] ]
[[package]]
name = "proxy-py"
version = "2.4.10"
requires_python = ">=3.6"
summary = "\\u26a1 Fast \\u2022 \\U0001fab6 Lightweight \\u2022 \\U0001f51f Dependency \\u2022 \\U0001f50c Pluggable \\u2022 \\U0001f608 TLS interception \\u2022 \\U0001f512 DNS-over-HTTPS \\u2022 \\U0001f525 Poor Mans VPN \\u2022 \\u23ea Reverse & \\u23e9 Forward \\u2022 \\U0001f46e\\U0001f3ff Proxy Server framework \\u2022 \\U0001f310 Web Server framework \\u2022 \\u27b5 \\u27b6 \\u27b7 \\u27a0 PubSub framework \\u2022 \\U0001f477 Work acceptor & executor framework."
groups = ["default"]
files = [
{file = "proxy.py-2.4.10-py3-none-any.whl", hash = "sha256:ef3a31f6ef3be6ff78559c0e68198523bfe2fb1e820bb16686750c1bb5baf9e8"},
{file = "proxy_py-2.4.10.tar.gz", hash = "sha256:41b9e9d3aae6f80e2304d3726e8e9c583a510d8de224eada53d115f48a63a9ce"},
]
[[package]] [[package]]
name = "ptyprocess" name = "ptyprocess"
version = "0.7.0" version = "0.7.0"


@@ -41,6 +41,8 @@ dependencies = [
"apscheduler>=3.11.0", "apscheduler>=3.11.0",
"playwright>=1.52.0", "playwright>=1.52.0",
"camoufox>=0.4.11", "camoufox>=0.4.11",
"html2text>=2025.4.15",
"proxy-py>=2.4.10",
] ]
requires-python = ">=3.10" requires-python = ">=3.10"
readme = "README.md" readme = "README.md"
@@ -97,9 +99,9 @@ strictSetInference = true
[tool.isort] [tool.isort]
length_sort = "1" length_sort = true
profile = "black" profile = "black"
sections = "STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER" sections = ["STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"]
import_heading_stdlib = "STL" import_heading_stdlib = "STL"
import_heading_thirdparty = "PDM" import_heading_thirdparty = "PDM"
import_heading_firstparty = "LOCAL" import_heading_firstparty = "LOCAL"

scripts/version.sh (new executable file, 19 lines)

@@ -0,0 +1,19 @@
#!/bin/bash

# ARGS
VERSION_TYPE=$1 # patch, minor, major

# Get the current version from the Chart.yaml file
current_version=$(grep -oP 'version:\s*\K[0-9]+\.[0-9]+\.[0-9]+' helm/Chart.yaml | tr -d '[:space:]')

# Increment the version number
if [ "$VERSION_TYPE" == "patch" ]; then
  new_version=$(echo $current_version | awk -F. -v OFS=. '{$NF++; print}')
elif [ "$VERSION_TYPE" == "minor" ]; then
  new_version=$(echo $current_version | awk -F. -v OFS=. '{$2++; $3=0; print}')
elif [ "$VERSION_TYPE" == "major" ]; then
  new_version=$(echo $current_version | awk -F. -v OFS=. '{$1++; $2=0; $3=0; print}')
fi

# Output the new version
echo "$new_version"


@@ -1,5 +0,0 @@
import React from "react";
export const Chat = () => {
return <h1>Chat</h1>;
};


@@ -1,133 +0,0 @@
import React, { useState, useEffect, Dispatch, useRef } from "react";
import { Job } from "../../types";
import { fetchJobs } from "../../lib";
import Box from "@mui/material/Box";
import InputLabel from "@mui/material/InputLabel";
import FormControl from "@mui/material/FormControl";
import Select from "@mui/material/Select";
import Popover from "@mui/material/Popover";
import { Typography, MenuItem, useTheme } from "@mui/material";
import { SxProps } from "@mui/material";
interface Props {
sxProps: SxProps;
setSelectedJob: Dispatch<React.SetStateAction<Job | null>>;
selectedJob: Job | null;
setJobs: Dispatch<React.SetStateAction<Job[]>>;
jobs: Job[];
}
export const JobSelector = ({
sxProps,
selectedJob,
setSelectedJob,
setJobs,
jobs,
}: Props) => {
const [anchorEl, setAnchorEl] = useState<HTMLElement | null>(null);
const [popoverJob, setPopoverJob] = useState<Job | null>(null);
const theme = useTheme();
const handlePopoverOpen = (
event: React.MouseEvent<HTMLElement>,
job: Job
) => {
setAnchorEl(event.currentTarget);
setPopoverJob(job);
};
const handlePopoverClose = () => {
setAnchorEl(null);
setPopoverJob(null);
};
const open = Boolean(anchorEl);
return (
<Box sx={sxProps}>
<FormControl fullWidth>
{jobs.length ? (
<>
<InputLabel id="select-job">Job</InputLabel>
<Select
labelId="select-job"
id="select-job"
value={selectedJob?.id || ""}
label="Job"
onChange={(e) => {
setSelectedJob(
jobs.find((job) => job.id === e.target.value) || null
);
}}
>
{jobs.map((job) => (
<MenuItem
key={job.id}
value={job.id}
aria-owns={open ? "mouse-over-popover" : undefined}
aria-haspopup="true"
onMouseEnter={(e) => handlePopoverOpen(e, job)}
onMouseLeave={handlePopoverClose}
onClick={handlePopoverClose}
>
{job.id}
</MenuItem>
))}
</Select>
</>
) : null}
</FormControl>
<Popover
id="mouse-over-popover"
sx={{
pointerEvents: "none",
padding: 0,
}}
open={open}
anchorEl={anchorEl}
anchorOrigin={{
vertical: "bottom",
horizontal: "left",
}}
transformOrigin={{
vertical: "top",
horizontal: "left",
}}
onClose={handlePopoverClose}
>
{popoverJob && (
<Box
sx={{
border:
theme.palette.mode === "light"
? "2px solid black"
: "2px solid white",
}}
>
<Typography
variant="body1"
sx={{ paddingLeft: 1, paddingRight: 1 }}
>
{popoverJob.url}
</Typography>
<div className="flex flex-row w-full justify-end mb-1">
<Typography
variant="body2"
sx={{
paddingLeft: 1,
paddingRight: 1,
color: theme.palette.mode === "dark" ? "#d3d7e6" : "#5b5d63",
fontStyle: "italic",
}}
>
{popoverJob.time_created
? new Date(popoverJob.time_created).toLocaleString()
: "Unknown"}
</Typography>
</div>
</Box>
)}
</Popover>
</Box>
);
};


@@ -1,2 +0,0 @@
export * from "./Chat";
export * from "./JobSelector";


@@ -1,44 +1,51 @@
import { Box, Link, Typography } from "@mui/material";
import { SetStateAction, Dispatch, useState } from "react";
import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog";
import { RawJobOptions } from "@/types"; import { RawJobOptions } from "@/types";
import SettingsIcon from "@mui/icons-material/Settings";
import { Box, Button, Typography } from "@mui/material";
import { Dispatch, SetStateAction, useState } from "react";
import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog";
export type AdvancedJobOptionsProps = { export type AdvancedJobOptionsProps = {
jobOptions: RawJobOptions; jobOptions: RawJobOptions;
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>; setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
multiPageScrapeEnabled?: boolean;
}; };
export const AdvancedJobOptions = ({ export const AdvancedJobOptions = ({
jobOptions, jobOptions,
setJobOptions, setJobOptions,
multiPageScrapeEnabled = true,
}: AdvancedJobOptionsProps) => { }: AdvancedJobOptionsProps) => {
const [open, setOpen] = useState(false); const [open, setOpen] = useState(false);
return ( return (
<Box sx={{ mb: 2 }}> <Box sx={{ display: "flex", alignItems: "center", gap: 1 }}>
<Link <Button
component="button" variant="outlined"
variant="body2"
onClick={() => setOpen(true)} onClick={() => setOpen(true)}
startIcon={<SettingsIcon />}
sx={{ sx={{
textDecoration: "none", textTransform: "none",
color: "primary.main", borderRadius: 2,
px: 2,
py: 1,
borderColor: "divider",
color: "text.secondary",
"&:hover": { "&:hover": {
color: "primary.dark", borderColor: "primary.main",
textDecoration: "underline", color: "primary.main",
bgcolor: "action.hover",
}, },
paddingLeft: 1,
display: "inline-flex",
alignItems: "center",
gap: 0.5,
}} }}
> >
<Typography variant="body2">Advanced Job Options</Typography> <Typography variant="body2">Advanced Options</Typography>
</Link> </Button>
<AdvancedJobOptionsDialog <AdvancedJobOptionsDialog
open={open} open={open}
onClose={() => setOpen(false)} onClose={() => setOpen(false)}
jobOptions={jobOptions} jobOptions={jobOptions}
setJobOptions={setJobOptions} setJobOptions={setJobOptions}
multiPageScrapeEnabled={multiPageScrapeEnabled}
/> />
</Box> </Box>
); );


@@ -1,3 +1,11 @@
import { ExpandedTableInput } from "@/components/common/expanded-table-input";
import { RawJobOptions } from "@/types";
import {
Code as CodeIcon,
ExpandMore as ExpandMoreIcon,
InfoOutlined,
Settings,
} from "@mui/icons-material";
import { import {
Accordion, Accordion,
AccordionDetails, AccordionDetails,
@@ -17,21 +25,14 @@ import {
Typography, Typography,
useTheme, useTheme,
} from "@mui/material"; } from "@mui/material";
import { import { Dispatch, SetStateAction, useEffect, useState } from "react";
ExpandMore as ExpandMoreIcon,
InfoOutlined,
Code as CodeIcon,
Settings,
} from "@mui/icons-material";
import { Dispatch, SetStateAction } from "react";
import { RawJobOptions } from "@/types";
import { ExpandedTableInput } from "../../expanded-table-input";
export type AdvancedJobOptionsDialogProps = { export type AdvancedJobOptionsDialogProps = {
open: boolean; open: boolean;
onClose: () => void; onClose: () => void;
jobOptions: RawJobOptions; jobOptions: RawJobOptions;
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>; setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
multiPageScrapeEnabled?: boolean;
}; };
export const AdvancedJobOptionsDialog = ({ export const AdvancedJobOptionsDialog = ({
@@ -39,33 +40,39 @@ export const AdvancedJobOptionsDialog = ({
onClose, onClose,
jobOptions, jobOptions,
setJobOptions, setJobOptions,
multiPageScrapeEnabled = true,
}: AdvancedJobOptionsDialogProps) => { }: AdvancedJobOptionsDialogProps) => {
const theme = useTheme(); const theme = useTheme();
const handleMultiPageScrapeChange = () => { const [localJobOptions, setLocalJobOptions] =
setJobOptions((prevJobOptions) => ({ useState<RawJobOptions>(jobOptions);
useEffect(() => {
setLocalJobOptions(jobOptions);
}, [jobOptions]);
const handleCheckboxChange = (key: keyof RawJobOptions) => {
setLocalJobOptions((prevJobOptions) => ({
...prevJobOptions, ...prevJobOptions,
multi_page_scrape: !prevJobOptions.multi_page_scrape, [key]: !prevJobOptions[key],
})); }));
}; };
const handleProxiesChange = (e: React.ChangeEvent<HTMLInputElement>) => { const handleProxiesChange = (e: React.ChangeEvent<HTMLInputElement>) => {
setJobOptions((prevJobOptions) => ({ setLocalJobOptions((prevJobOptions) => ({
...prevJobOptions, ...prevJobOptions,
proxies: e.target.value, proxies: e.target.value,
})); }));
}; };
const handleCollectMediaChange = () => { const handleClose = () => {
setJobOptions((prevJobOptions) => ({ setJobOptions(localJobOptions);
...prevJobOptions, onClose();
collect_media: !prevJobOptions.collect_media,
}));
}; };
return ( return (
<Dialog <Dialog
open={open} open={open}
onClose={onClose} onClose={handleClose}
maxWidth="md" maxWidth="md"
fullWidth fullWidth
PaperProps={{ PaperProps={{
@@ -120,14 +127,21 @@ export const AdvancedJobOptionsDialog = ({
<FormControlLabel <FormControlLabel
control={ control={
<Checkbox <Checkbox
checked={jobOptions.multi_page_scrape} checked={localJobOptions.multi_page_scrape}
onChange={handleMultiPageScrapeChange} onChange={() => handleCheckboxChange("multi_page_scrape")}
disabled={!multiPageScrapeEnabled}
/> />
} }
label={ label={
<Box sx={{ display: "flex", alignItems: "center" }}> <Box sx={{ display: "flex", alignItems: "center" }}>
<Typography>Multi Page Scrape</Typography> <Typography>Multi Page Scrape</Typography>
<Tooltip title="Enable crawling through multiple pages"> <Tooltip
title={
multiPageScrapeEnabled
? "Enable crawling through multiple pages"
: "Multi page scrape is disabled"
}
>
<IconButton size="small"> <IconButton size="small">
<InfoOutlined fontSize="small" /> <InfoOutlined fontSize="small" />
</IconButton> </IconButton>
@@ -135,11 +149,13 @@ export const AdvancedJobOptionsDialog = ({
</Box> </Box>
} }
/> />
<FormControlLabel <FormControlLabel
control={ control={
<Checkbox <Checkbox
checked={jobOptions.collect_media} checked={localJobOptions.collect_media}
onChange={handleCollectMediaChange} onChange={() => handleCheckboxChange("collect_media")}
data-cy="collect-media-checkbox"
/> />
} }
label={ label={
@@ -153,6 +169,26 @@ export const AdvancedJobOptionsDialog = ({
</Box> </Box>
} }
/> />
<FormControlLabel
control={
<Checkbox
checked={localJobOptions.return_html}
onChange={() => handleCheckboxChange("return_html")}
data-cy="return-html-checkbox"
/>
}
label={
<Box sx={{ display: "flex", alignItems: "center" }}>
<Typography>Return HTML</Typography>
<Tooltip title="Return the HTML of the page">
<IconButton size="small">
<InfoOutlined fontSize="small" />
</IconButton>
</Tooltip>
</Box>
}
/>
</FormGroup> </FormGroup>
</Box> </Box>
@@ -223,7 +259,7 @@ export const AdvancedJobOptionsDialog = ({
fullWidth fullWidth
variant="outlined" variant="outlined"
size="small" size="small"
value={jobOptions.proxies} value={localJobOptions.proxies}
onChange={handleProxiesChange} onChange={handleProxiesChange}
InputProps={{ InputProps={{
startAdornment: ( startAdornment: (
@@ -241,8 +277,9 @@ export const AdvancedJobOptionsDialog = ({
label="Custom Headers" label="Custom Headers"
placeholder='{"User-Agent": "CustomAgent", "Accept": "*/*"}' placeholder='{"User-Agent": "CustomAgent", "Accept": "*/*"}'
urlParam="custom_headers" urlParam="custom_headers"
name="custom_headers"
onChange={(value) => { onChange={(value) => {
setJobOptions((prevJobOptions) => ({ setLocalJobOptions((prevJobOptions) => ({
...prevJobOptions, ...prevJobOptions,
custom_headers: value, custom_headers: value,
})); }));
@@ -254,8 +291,9 @@ export const AdvancedJobOptionsDialog = ({
label="Custom Cookies" label="Custom Cookies"
placeholder='[{"name": "value", "name2": "value2"}]' placeholder='[{"name": "value", "name2": "value2"}]'
urlParam="custom_cookies" urlParam="custom_cookies"
name="custom_cookies"
onChange={(value) => { onChange={(value) => {
setJobOptions((prevJobOptions) => ({ setLocalJobOptions((prevJobOptions) => ({
...prevJobOptions, ...prevJobOptions,
custom_cookies: value, custom_cookies: value,
})); }));
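The core of this change is a buffered-edit pattern: the dialog edits a local copy of the options and only commits it back to the parent when it closes, so a half-edited form never reaches the submit path. Reduced to its essentials, the pattern looks like this; a sketch with illustrative names, not code from this diff:

import { Dispatch, SetStateAction, useEffect, useState } from "react";

type Options = { multiPage: boolean; proxies: string };

export function useBufferedOptions(
  value: Options,
  commit: Dispatch<SetStateAction<Options>>,
  onClose: () => void
) {
  // Local draft the dialog edits freely
  const [draft, setDraft] = useState<Options>(value);

  // Re-sync the draft if the parent value changes while the dialog is open
  useEffect(() => {
    setDraft(value);
  }, [value]);

  // Commit the draft to the parent, then close the dialog
  const handleClose = () => {
    commit(draft);
    onClose();
  };

  return { draft, setDraft, handleClose };
}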


@@ -1,17 +1,17 @@
import React, { useState } from "react";
import { import {
alpha,
Box,
Paper,
Table, Table,
TableBody, TableBody,
TableCell, TableCell,
TableContainer, TableContainer,
TableHead, TableHead,
TableRow, TableRow,
Paper,
Box,
Typography, Typography,
useTheme, useTheme,
alpha,
} from "@mui/material"; } from "@mui/material";
import React, { useState } from "react";
export type CsvRow = { export type CsvRow = {
[key: string]: string; [key: string]: string;
@@ -131,8 +131,9 @@ export const CsvTable: React.FC<CsvTableProps> = ({ csv, className }) => {
<Typography variant="body2" color="text.secondary"> <Typography variant="body2" color="text.secondary">
{row.text {row.text
? row.text ? row.text
.replace(/(\r\n|\n|\r)/g, " ") .replace(/[\n\t\r]+/g, " ")
.replace(/\t/g, " ") .replace(/\s+/g, " ")
.trim()
: "No text available"} : "No text available"}
</Typography> </Typography>
</Paper> </Paper>
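The new chain collapses every run of whitespace into a single space and trims the ends, where the old version substituted spaces one-for-one and left doubles behind. With an illustrative input:

const text = "  Line one\r\n\tLine two  ";
// old chain: "  Line one  Line two  "  (each newline/tab became its own space)
// new chain: "Line one Line two"
const cleaned = text.replace(/[\n\t\r]+/g, " ").replace(/\s+/g, " ").trim();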


@@ -0,0 +1,30 @@
import { Box } from "@mui/material";
export type DisabledProps = {
message: string;
};
export const Disabled = ({ message }: DisabledProps) => {
return (
<Box
bgcolor="background.default"
minHeight="100vh"
display="flex"
justifyContent="center"
alignItems="center"
data-testid="disabled-message"
>
<h4
style={{
color: "#fff",
padding: "20px",
borderRadius: "8px",
background: "rgba(0, 0, 0, 0.6)",
boxShadow: "0 4px 8px rgba(0, 0, 0, 0.2)",
}}
>
{message}
</h4>
</Box>
);
};


@@ -0,0 +1 @@
export * from "./disabled";


@@ -1,28 +1,29 @@
+import { parseJsonToEntries } from "@/lib/helpers/parse-json-to-entries";
+import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
 import {
   Accordion,
-  AccordionSummary,
-  TableCell,
-  TableRow,
-  Paper,
-  TableBody,
-  useTheme,
-  TextField,
-  Box,
-  Typography,
   AccordionDetails,
-  TableHead,
-  TableContainer,
+  AccordionSummary,
+  Box,
+  Paper,
   Table,
+  TableBody,
+  TableCell,
+  TableContainer,
+  TableHead,
+  TableRow,
+  TextField,
+  Typography,
+  useTheme,
 } from "@mui/material";
 import { useEffect, useState } from "react";
-import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
-import { parseJsonToEntries } from "@/lib/helpers/parse-json-to-entries";
 
 export type ExpandedTableInputProps = {
   label: string;
   onChange: (value: any) => void;
   placeholder: string;
   urlParam: string;
+  name: string;
 };
 
 export const ExpandedTableInput = ({
@@ -30,6 +31,7 @@ export const ExpandedTableInput = ({
   onChange,
   placeholder,
   urlParam,
+  name,
 }: ExpandedTableInputProps) => {
   const theme = useTheme();
   const [value, setValue] = useState("");
@@ -150,6 +152,7 @@
           size="small"
           error={jsonError !== null}
           helperText={jsonError ?? ""}
+          name={name}
         />
         {parsedHeaders && parsedHeaders.length > 0 && (

Some files were not shown because too many files have changed in this diff.