65 Commits

Author SHA1 Message Date
Jayden Pyles
d4edb9d93e chore: update chart version [skip ci] 2025-05-19 20:46:19 -05:00
Jayden Pyles
5ebd96b62b feat: add agent mode (#81)
* chore: wip agent mode

* wip: add agent mode frontend

* wip: add agent mode frontend

* chore: cleanup code

* chore: cleanup code

* chore: cleanup code
2025-05-19 20:44:41 -05:00
Jayden Pyles
d602d3330a fix: site map
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
Unit Tests / cypress-tests (push) Has been cancelled
Unit Tests / success-message (push) Has been cancelled
2025-05-17 17:05:37 -05:00
Jayden Pyles
6639e8b48f chore: update chart version [skip ci] 2025-05-17 16:33:18 -05:00
Jayden Pyles
263e46ba4d feat: add media viewer + other fixes (#79)
* feat: add media viewer + other fixes

* chore: remove logging [skip ci]

* chore: remove logging [skip ci]

* feat: add unit test for media

* feat: add unit test for media

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* chore: update docs [skip ci]
2025-05-17 16:31:34 -05:00
Jayden Pyles
f815a58efc chore: update docker version [skip ci] 2025-05-16 22:04:46 -05:00
Jayden Pyles
50ec5df657 chore: update chart version [skip ci] 2025-05-16 21:39:04 -05:00
Jayden Pyles
28de0f362c feat: add recording viewer and vnc (#78)
* feat: add recording viewer and vnc

* feat: add recording viewer and vnc

* feat: add recording viewer and vnc

* feat: add recording viewer and vnc

* chore: update gitignore [skip ci]

* chore: update dev compose [skip ci]

* fix: only run manually
2025-05-16 21:37:09 -05:00
Jayden Pyles
6b33723cac feat: update version
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
Unit Tests / cypress-tests (push) Has been cancelled
Unit Tests / success-message (push) Has been cancelled
2025-05-16 14:15:53 -05:00
Jayden Pyles
5c89e4d7d2 feat: allow custom cookies (#77)
* feat: working new advanced job options

* feat: working new advanced job options

* feat: add tests for adding custom cookies/headers
2025-05-16 14:13:58 -05:00
Jayden Pyles
ed0828a585 fix: deployment
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
Unit Tests / cypress-tests (push) Has been cancelled
Unit Tests / success-message (push) Has been cancelled
2025-05-13 21:03:21 -05:00
Jayden Pyles
1b8c8c779a Feature: Allow Multiple Download Options (#75)
* feat: allow downloading in MD format

* fix: unit tests

* fix: deployments [skip ci]

* fix: deployment
2025-05-13 18:23:59 -05:00
Jayden Pyles
267cc73657 docs: update docs [skip ci] 2025-05-13 13:11:52 -05:00
Jayden Pyles
92ff16d9c3 docs: update docs [skip ci] 2025-05-12 21:37:37 -05:00
Jayden Pyles
8b2e5dc9c3 Feat/add helm chart (#69)
* chore: start on helm chart

* chore: start on helm chart

* chore: start on helm chart

* chore: start on helm chart

* chore: start on helm chart

* chore: start on helm chart

* chore: start on helm chart

* chore: start on helm chart
2025-05-12 21:19:17 -05:00
Jayden Pyles
7f1bc295ac Feat/add data reader (#68)
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
Unit Tests / cypress-tests (push) Has been cancelled
Unit Tests / success-message (push) Has been cancelled
* feat: working new data view

* feat: working new data view

* fix: remove unused deps

* fix: typing

* chore: cleanup code
2025-05-12 17:58:45 -05:00
Jayden Pyles
031572325f Fix/UI and backend fixes (#67)
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
Unit Tests / cypress-tests (push) Has been cancelled
Unit Tests / success-message (push) Has been cancelled
* chore: wip

* chore: wip

* chore: wip

* fix: cypress test

* chore: cleanup code
2025-05-11 17:33:29 -05:00
Jayden Pyles
48d3bf9214 chore: docs [skip ci] 2025-05-11 13:46:21 -05:00
Jayden Pyles
e07abcd089 chore: docs [skip ci] 2025-05-11 13:42:37 -05:00
Jayden Pyles
8a933b88a7 feat: add notification channels (#66) 2025-05-11 13:13:42 -05:00
Jayden Pyles
863dbcd044 fix: database 2025-05-11 11:38:22 -05:00
Jayden Pyles
de40181a6f chore: docs [skip ci] 2025-05-11 11:24:19 -05:00
Jayden Pyles
8703f706a1 feat: add in optional registration (#65)
* feat: add in optional registration

* fix: issue with registration var

* fix: issue with registration var

* fix: issue with registration var
2025-05-11 11:11:19 -05:00
Jayden Pyles
b40d378bbf fix: chat jobs not loading (#64)
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
Unit Tests / cypress-tests (push) Has been cancelled
Unit Tests / success-message (push) Has been cancelled
2025-05-10 18:34:42 -05:00
Jayden Pyles
8123e1f149 docs: update README [skip ci] 2025-05-10 15:24:16 -05:00
Jayden Pyles
8cd30599fa feat: add in media downloading (#62)
* feat: add in media downloading

* fix: build issue
2025-05-10 15:14:54 -05:00
Jayden Pyles
a58212b214 feat: add authentication test 2025-05-10 14:22:06 -05:00
Jayden Pyles
a6ab6ec71d fix: vulns 2025-05-10 12:04:39 -05:00
Jayden Pyles
c5c9427af4 fix: vulns 2025-05-10 11:49:24 -05:00
Jayden Pyles
e8d80c1a77 fix: add cypress tests to CI [skip ci] 2025-05-10 11:29:20 -05:00
Jayden Pyles
ee8047ac78 fix: add cypress tests to CI [skip ci] 2025-05-10 10:46:05 -05:00
Jayden Pyles
e74c4f392c fix: add cypress tests to CI [skip ci] 2025-05-10 10:41:54 -05:00
Jayden Pyles
6b484952a3 fix: add cypress tests to CI [skip ci] 2025-05-10 10:35:31 -05:00
Jayden Pyles
2283808605 fix: add cypress tests to CI [skip ci] 2025-05-10 10:17:22 -05:00
Jayden Pyles
ee5ada70f7 fix: add cypress tests to CI [skip ci] 2025-05-10 10:04:55 -05:00
Jayden Pyles
56cc457e6e fix: add cypress tests to CI [skip ci] 2025-05-10 09:48:54 -05:00
Jayden Pyles
21a38181de fix: add cypress tests to CI [skip ci] 2025-05-10 09:44:43 -05:00
Jayden Pyles
3063bc0d53 fix: add cypress tests to CI [skip ci] 2025-05-10 09:41:43 -05:00
Jayden Pyles
f42e7ed531 fix: add cypress tests to CI [skip ci] 2025-05-10 09:39:44 -05:00
Jayden Pyles
c197f2becd fix: add cypress tests to CI [skip ci] 2025-05-10 09:38:11 -05:00
Jayden Pyles
a534129702 fix: swap to using chrome driver manager [skip ci] 2025-05-10 09:24:48 -05:00
Jayden Pyles
455ed049c9 fix: allow workflow dispatch [skip ci] 2025-05-10 09:16:41 -05:00
Jayden Pyles
de4ccfbf3a fix: only allow cron on logged in
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
2025-04-24 22:14:00 -05:00
Jayden Pyles
3475d66995 Add cron jobs (#60)
* feat: finish up cron jobs

* feat: clean up
2025-04-24 22:03:28 -05:00
Jayden Pyles
186b4a0231 Merge branch 'master' of github.com:jaypyles/Scraperr 2025-04-24 22:02:06 -05:00
Jayden Pyles
0af0ebf5b5 feat: fix authentication 2025-04-24 18:24:19 -05:00
c3Nz
ef35db00d7 fix: Python handler Fixed (#51)
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
* Fix: Python handler Fixed

* fix: Python handler Fixed without comment
2024-11-26 10:05:43 -06:00
Jayden Pyles
d65e600ec3 Merge branch 'master' of github.com:jaypyles/Scraperr
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
2024-11-21 18:13:18 -06:00
Jayden Pyles
6fe145f649 chore: remove uneeded files [skip ci] 2024-11-21 18:12:46 -06:00
Jayden Pyles
563ca2245e Refactor: Drop MongoDB (#48)
* feat: replace mongodb with sqllite

* feat: update docker compose to drop mongo

* chore: drop logs

* chore: cleanup

* fix: unit tests

* fix: workflow

* fix: workflow run
2024-11-21 18:11:46 -06:00
Jayden Pyles
d54fdbd405 fix: workflow ruin [skip ci] 2024-11-21 18:11:31 -06:00
Jayden Pyles
7169755cd2 fix: workflow 2024-11-21 18:03:40 -06:00
Jayden Pyles
15b56b5704 fix: unit tests 2024-11-21 18:00:57 -06:00
Jayden Pyles
bf6b740005 chore: cleanup 2024-11-21 17:43:20 -06:00
Jayden Pyles
c339e75e06 chore: drop logs 2024-11-21 17:36:47 -06:00
Jayden Pyles
b6ed40e6cf feat: update docker compose to drop mongo 2024-11-21 17:36:22 -06:00
Jayden Pyles
3085f9d31a feat: replace mongodb with sqllite 2024-11-20 21:32:27 -06:00
Jayden Pyles
7d80ff5c7f Feat: Site Mapping (#46)
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
* wip: add site mapping

* chore: cleanup
2024-11-16 20:55:23 -06:00
Jayden Pyles
3a0762f1e3 fix: headers
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
2024-11-12 22:07:30 -06:00
Jayden Pyles
dc4d219205 fix: make calls to next server 2024-11-12 21:34:47 -06:00
Jayden Pyles
b3bf780eda Refactor: Remove Proxy Dependency (#44)
Some checks are pending
Unit Tests / unit-tests (push) Waiting to run
2024-11-12 17:30:07 -06:00
Jayden Pyles
1dfd3ca92a Update issue templates [skip ci] 2024-11-10 16:33:29 -06:00
Jayden Pyles
fe51140a0e fix: ci
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
2024-11-09 21:38:03 -06:00
Jayden Pyles
dd6cec6679 fix: ci 2024-11-09 21:34:41 -06:00
Jayden Pyles
2339ba1b77 fix: ci 2024-11-09 21:33:21 -06:00
193 changed files with 15515 additions and 25580 deletions

4
.dockerignore Normal file
View File

@@ -0,0 +1,4 @@
node_modules
npm-debug.log
Dockerfile
.dockerignore

32
.github/ISSUE_TEMPLATE/bug_report.md vendored Normal file
View File

@@ -0,0 +1,32 @@
---
name: Bug report
about: 'Bug reporting '
title: ''
labels: ''
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error
**Expected behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.
**Desktop (please complete the following information):**
- OS: [e.g. iOS]
- Browser [e.g. chrome, safari]
- Version [e.g. 22]
**Additional context**
Add any other context about the problem here.

View File

@@ -0,0 +1,50 @@
# Composite action: packages the chart found in ./helm and stages it inside a
# clone of the jaypyles/helm repository (gh-pages branch) under charts/scraperr.
name: Publish Helm Chart
description: Publish a Helm chart to a target repository
inputs:
app-repo-token:
required: true
description: "The token for the target repository"
runs:
using: 'composite'
steps:
- name: Checkout app repo
uses: actions/checkout@v4
- name: Set up Helm
uses: azure/setup-helm@v3
# Writes the packaged chart archive (*.tgz) into ./packaged for the copy step below.
- name: Package Helm chart
run: |
mkdir -p packaged
helm package helm -d packaged
shell: bash
# Clones the chart repository into ./target-repo and switches to gh-pages.
- name: Clone target Helm repo
run: |
git clone https://github.com/jaypyles/helm.git target-repo
cd target-repo
git config user.name "github-actions"
git config user.email "github-actions@github.com"
git fetch origin gh-pages # Fetch gh-pages explicitly
git checkout gh-pages # Checkout gh-pages branch
git pull origin gh-pages # Pull latest changes from gh-pages
shell: bash
# Copies the archive into charts/scraperr and regenerates that directory's index.
# NOTE(review): APP_NAME is a shell variable local to this step's script.
- name: Copy package and update index
run: |
APP_NAME="scraperr"
mkdir -p target-repo/charts/$APP_NAME
cp packaged/*.tgz target-repo/charts/$APP_NAME/
cd target-repo/charts/$APP_NAME
helm repo index . --url https://jaypyles.github.io/helm/charts/$APP_NAME
shell: bash
# Commits the staged chart files and pushes them to gh-pages using the supplied token.
# APP_NAME is defined here because every `run` step starts its own shell: a
# variable assigned in an earlier step is not visible in this one, which would
# leave the commit message reading "Update  chart ...".
- name: Commit and push to target repo
run: |
APP_NAME="scraperr"
cd target-repo
git add charts/
git commit -m "Update $APP_NAME chart $(date +'%Y-%m-%d %H:%M:%S')" || echo "No changes"
git push https://x-access-token:${{ inputs.app-repo-token }}@github.com/jaypyles/helm.git gh-pages
shell: bash

View File

@@ -0,0 +1,58 @@
# Composite action: builds and starts the Docker stack via make, waits for the
# frontend (port 80) and backend (port 8000) to answer, then runs `npm run cy:run`.
name: Run Cypress Tests
description: Run Cypress tests
runs:
using: "composite"
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 22
- name: Setup Docker project
shell: bash
run: make build-ci up-ci
- name: Install dependencies
shell: bash
run: yarn install
# Polls up to 10 times, one second apart, and fails the step if curl never succeeds.
# `curl -s` is used without `-f`, so any HTTP response (including an error status)
# counts as "ready"; only a connection failure triggers a retry.
- name: Wait for frontend to be ready
shell: bash
run: |
for i in {1..10}; do
curl -s http://127.0.0.1:80 && echo "Frontend is ready" && exit 0
echo "Waiting for frontend to be ready... attempt $i"
sleep 1
done
echo "Frontend failed to be ready after 10 retries"
exit 1
- name: Wait for backend to be ready
shell: bash
run: |
for i in {1..10}; do
curl -s http://127.0.0.1:8000 && echo "Backend is ready" && exit 0
echo "Waiting for backend to be ready... attempt $i"
sleep 1
done
echo "Backend failed to be ready after 10 retries"
exit 1
# NOTE(review): this step sits before "Run Cypress tests", so `if: failure()` can
# only react to failures in the steps above it; confirm that logs are not also
# wanted when the Cypress run itself fails.
- name: Show backend logs on failure
if: failure()
shell: bash
run: |
echo "== Docker Containers =="
docker ps -a
echo "== Backend Logs =="
docker logs $(docker ps -a --filter "name=scraperr_api" --format "{{.Names}}") || echo "Could not get backend logs"
- name: Run Cypress tests
shell: bash
run: npm run cy:run

View File

@@ -1,9 +1,6 @@
name: ci
requires:
- unit-tests
name: Docker Image
on:
push:
branches: ["master"]
workflow_dispatch:
jobs:
build:
@@ -12,6 +9,12 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
- name: Get version from helm chart
run: |
VERSION=$(grep "version:" ./helm/Chart.yaml | cut -d: -f2 | tr -d ' ')
echo "VERSION=$VERSION" >> $GITHUB_ENV
echo "Version is $VERSION"
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
@@ -27,7 +30,9 @@ jobs:
context: .
file: ./docker/frontend/Dockerfile
push: true
tags: ${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:latest
tags: |
${{ secrets.DOCKERHUB_USERNAME }}/scraperr:latest
${{ secrets.DOCKERHUB_USERNAME }}/scraperr:${{ env.VERSION }}
- name: Build and push api
uses: docker/build-push-action@v5
@@ -35,4 +40,36 @@ jobs:
context: .
file: ./docker/api/Dockerfile
push: true
tags: ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest
tags: |
${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest
${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:${{ env.VERSION }}
push-helm-chart:
runs-on: ubuntu-latest
needs:
- build
steps:
- uses: actions/checkout@v4
- name: Push Helm Chart
uses: ./.github/actions/push-to-helm
with:
app-repo-token: ${{ secrets.GPAT_TOKEN }}
success-message:
runs-on: ubuntu-latest
needs:
- build
- push-helm-chart
steps:
- name: Send Discord Message
uses: jaypyles/discord-webhook-action@v1.0.0
with:
webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }}
content: "Scraperr Successfully Built Docker Images"
username: "Scraperr CI"
embed-title: "✅ Deployment Status"
embed-description: "Scraperr successfully built docker images."
embed-color: 3066993 # Green
embed-footer-text: "Scraperr CI"
embed-timestamp: ${{ github.event.head_commit.timestamp }}

View File

@@ -4,9 +4,11 @@ on:
push:
branches:
- master
pull_request:
branches:
- master
types: [opened, synchronize, reopened]
workflow_dispatch:
jobs:
unit-tests:
@@ -15,11 +17,41 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
- name: Set env
run: echo "ENV=test" >> $GITHUB_ENV
- name: Install pdm
run: pip install pdm
- name: Install project dependencies
run: pdm install
- name: Install playwright
run: pdm run playwright install
- name: Run tests
run: PYTHONPATH=. pdm run pytest api/backend/tests
run: PYTHONPATH=. pdm run pytest -v -ra api/backend/tests
cypress-tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/run-cypress-tests
success-message:
runs-on: ubuntu-latest
needs:
- unit-tests
- cypress-tests
steps:
- name: Send Discord Message
uses: jaypyles/discord-webhook-action@v1.0.0
with:
webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }}
content: "Scraperr Successfully Passed Tests"
username: "Scraperr CI"
embed-title: "✅ Deployment Status"
embed-description: "Scraperr successfully passed all tests."
embed-color: 3066993 # Green
embed-footer-text: "Scraperr CI"
embed-timestamp: ${{ github.event.head_commit.timestamp }}

16
.gitignore vendored
View File

@@ -187,3 +187,19 @@ cython_debug/
postgres_data
.vscode
ollama
data
media/images
media/videos
media/audio
media/pdfs
media/spreadsheets
media/presentations
media/documents
media/recordings
media/download_summary.txt
cypress/screenshots
cypress/videos
docker-compose.dev.local.yml

2
.prettierignore Normal file
View File

@@ -0,0 +1,2 @@
*.yaml
*.yml

1
.python-version Normal file
View File

@@ -0,0 +1 @@
3.10.12

1
FUNDING.yml Normal file
View File

@@ -0,0 +1 @@
custom: ["https://www.buymeacoffee.com/jaypyles"]

View File

@@ -1,6 +1,6 @@
.DEFAULT_GOAL := help
COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.yml
COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.local.yml
COMPOSE_PROD = docker compose -f docker-compose.yml
.PHONY: help deps build pull up up-dev down setup deploy
@@ -17,6 +17,7 @@ help:
@echo " make down - Stop and remove containers, networks, images, and volumes"
@echo " make setup - Setup server with dependencies and clone repo"
@echo " make deploy - Deploy site onto server"
@echo " make cypress-start - Start Cypress"
@echo ""
logs:
@@ -51,3 +52,12 @@ setup:
deploy:
ansible-playbook -i ./ansible/inventory.yaml ./ansible/deploy_site.yaml -v
build-ci:
docker compose -f docker-compose.yml -f docker-compose.dev.yml build
up-ci:
docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d --force-recreate
cypress-start:
DISPLAY=:0 npx cypress open

193
README.md
View File

@@ -1,178 +1,71 @@
![logo_picture](https://github.com/jaypyles/www-scrape/blob/master/docs/logo_picture.png)
<div align="center">
<img src="https://img.shields.io/badge/MongoDB-%234ea94b.svg?style=for-the-badge&logo=mongodb&logoColor=white" alt="MongoDB" />
<img src="https://img.shields.io/badge/FastAPI-005571?style=for-the-badge&logo=fastapi" alt="FastAPI" />
<img src="https://img.shields.io/badge/Next-black?style=for-the-badge&logo=next.js&logoColor=white" alt="Next JS" />
<img src="https://img.shields.io/badge/tailwindcss-%2338B2AC.svg?style=for-the-badge&logo=tailwind-css&logoColor=white" alt="TailwindCSS" />
<img src="https://github.com/jaypyles/www-scrape/blob/master/docs/logo_picture.png" alt="Scraperr Logo" width="250px">
**A powerful self-hosted web scraping solution**
<div>
<img src="https://img.shields.io/badge/MongoDB-%234ea94b.svg?style=for-the-badge&logo=mongodb&logoColor=white" alt="MongoDB" />
<img src="https://img.shields.io/badge/FastAPI-005571?style=for-the-badge&logo=fastapi" alt="FastAPI" />
<img src="https://img.shields.io/badge/Next-black?style=for-the-badge&logo=next.js&logoColor=white" alt="Next JS" />
<img src="https://img.shields.io/badge/tailwindcss-%2338B2AC.svg?style=for-the-badge&logo=tailwind-css&logoColor=white" alt="TailwindCSS" />
</div>
</div>
# Summary
## 📋 Overview
Scraperr is a self-hosted web application that allows users to scrape data from web pages by specifying elements via XPath. Users can submit URLs and the corresponding elements to be scraped, and the results will be displayed in a table.
Scrape websites without writing a single line of code.
From the table, users can download an excel sheet of the job's results, along with an option to rerun the job.
> 📚 **[Check out the docs](https://scraperr-docs.pages.dev)** for a comprehensive quickstart guide and detailed information.
View the [docs](https://scraperr-docs.pages.dev).
<div align="center">
<img src="https://github.com/jaypyles/www-scrape/blob/master/docs/main_page.png" alt="Scraperr Main Interface" width="800px">
</div>
## Features
## ✨ Key Features
### Submitting URLs for Scraping
- **XPath-Based Extraction**: Precisely target page elements
- **Queue Management**: Submit and manage multiple scraping jobs
- **Domain Spidering**: Option to scrape all pages within the same domain
- **Custom Headers**: Add JSON headers to your scraping requests
- **Media Downloads**: Automatically download images, videos, and other media
- **Results Visualization**: View scraped data in a structured table format
- **Data Export**: Export your results in markdown and csv formats
- **Notification Channels**: Send completion notifications through various channels
- Submit/Queue URLs for web scraping
- Add and manage elements to scrape using XPath
- Scrape all pages within same domain
- Add custom json headers to send in requests to URLs
- Display results of scraped data
## 🚀 Getting Started
![main_page](https://github.com/jaypyles/www-scrape/blob/master/docs/main_page.png)
### Docker
### Managing Previous Jobs
- Download csv containing results
- Rerun jobs
- View status of queued jobs
- Favorite and view favorited jobs
![job_page](https://github.com/jaypyles/www-scrape/blob/master/docs/job_page.png)
### User Management
- User login/signup to organize jobs (optional)
![login](https://github.com/jaypyles/www-scrape/blob/master/docs/login.png)
### Log Viewing
- View app logs inside of web ui
![logs](https://github.com/jaypyles/www-scrape/blob/master/docs/log_page.png)
### Statistics View
- View a small statistics view of jobs ran
![statistics](https://github.com/jaypyles/www-scrape/blob/master/docs/stats_page.png)
### AI Integration
- Include the results of a selected job into the context of a conversation
- Currently supports:
1. Ollama
2. OpenAI
![chat](https://github.com/jaypyles/www-scrape/blob/master/docs/chat_page.png)
## Installation
1. Clone the repository:
```sh
git clone https://github.com/jaypyles/scraperr.git
```
2. Set environmental variables and labels in `docker-compose.yml`.
```yaml
scraperr:
labels:
- "traefik.enable=true"
- "traefik.http.routers.scraperr.rule=Host(`localhost`)" # change this to your domain, if not running on localhost
- "traefik.http.routers.scraperr.entrypoints=web" # websecure if using https
- "traefik.http.services.scraperr.loadbalancer.server.port=3000"
scraperr_api:
environment:
- LOG_LEVEL=INFO
- MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB
- SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string)
- ALGORITHM=HS256 # authentication encoding algorithm
- ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token expire minutes
labels:
- "traefik.enable=true"
- "traefik.http.routers.scraperr_api.rule=Host(`localhost`) && PathPrefix(`/api`)" # change this to your domain, if not running on localhost
- "traefik.http.routers.scraperr_api.entrypoints=web" # websecure if using https
- "traefik.http.middlewares.api-stripprefix.stripprefix.prefixes=/api"
- "traefik.http.routers.scraperr_api.middlewares=api-stripprefix"
- "traefik.http.services.scraperr_api.loadbalancer.server.port=8000"
mongo:
environment:
MONGO_INITDB_ROOT_USERNAME: root
MONGO_INITDB_ROOT_PASSWORD: example
```
Don't want to use `traefik`? This configuration can be used in other reverse proxies, as long as the API is proxied to `/api` of the frontend container. This is currently
not able to be used without a reverse proxy, due to limitations of runtime client-side environmental variables in `next.js`.
3. Deploy
```sh
```bash
make up
```
The app provides its own `traefik` configuration to use independently, but can easily be reverse-proxied by any other app, or your own reverse-proxy.
### Helm
## Usage
> Refer to the docs for helm deployment: https://scraperr-docs.pages.dev/guides/helm-deployment
1. Open the application in your browser at `http://localhost`.
2. Enter the URL you want to scrape in the URL field.
3. Add elements to scrape by specifying a name and the corresponding XPath.
4. Click the "Submit" button to queue URL to be scraped.
5. View queue in the "Previous Jobs" section.
## ⚖️ Legal and Ethical Guidelines
## API Endpoints
When using Scraperr, please remember to:
Use this service as an API for your own projects. Due to this using FastAPI, a docs page is available at `/docs` for the API.
1. **Respect `robots.txt`**: Always check a website's `robots.txt` file to verify which pages permit scraping
2. **Terms of Service**: Adhere to each website's Terms of Service regarding data extraction
3. **Rate Limiting**: Implement reasonable delays between requests to avoid overloading servers
![docs](https://github.com/jaypyles/www-scrape/blob/master/docs/docs_page.png)
> **Disclaimer**: Scraperr is intended for use only on websites that explicitly permit scraping. The creator accepts no responsibility for misuse of this tool.
## AI
## 💬 Join the Community
Currently supports either an Ollama instance or OpenAI's ChatGPT, using your own API key. Setting up is easy as either setting the Ollama url or the OpenAI API key in the API's environmental variables in the `docker-compose.yml` file:
Get support, report bugs, and chat with other users and contributors.
```yaml
scraperr_api:
environment:
- OLLAMA_URL=http://ollama:11434
- OLLAMA_MODEL=llama3.1
# or
- OPENAI_KEY=<your_key>
- OPENAI_MODEL=gpt3.5-turbo
```
👉 [Join the Scraperr Discord](https://discord.gg/89q7scsGEK)
The model's names are taken from the documentation of their respective technologies.
## Troubleshooting
Q: When running Scraperr, I'm met with "404 Page not found".
A: This is probably an issue with MongoDB related to running Scraperr in a VM. You should see something like this in `make logs`:
```
WARNING: MongoDB 5.0+ requires a CPU with AVX support, and your current system does not appear to have that!
```
To resolve this issue, simply set CPU host type to `host`. This can be done in Proxmox in the VM settings > Processor. [Related issue](https://github.com/jaypyles/Scraperr/issues/9).
## Legal and Ethical Considerations
When using Scraperr, please ensure that you:
1. **Check Robots.txt**: Verify allowed pages by reviewing the `robots.txt` file of the target website.
2. **Compliance**: Always comply with the website's Terms of Service (ToS) regarding web scraping.
**Disclaimer**: This tool is intended for use only on websites that permit scraping. The author is not responsible for any misuse of this tool.
## License
## 📄 License
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
### Contributions
## 👏 Contributions
Development made easy by developing from [webapp template](https://github.com/jaypyles/webapp-template). View documentation for extra information.
Development made easier with the [webapp template](https://github.com/jaypyles/webapp-template).
Start development server:
`make deps build up-dev`
To get started, simply run `make build up-dev`.

View File

@@ -1,3 +0,0 @@
github_repo: https://github.com/jaypyles/webapp-template.git
deploy_path: /home/admin/site-test6
deploy_command: make pull up-prd

View File

@@ -1,10 +0,0 @@
- name: Deploy site
hosts: all
become: true
vars_files:
- ./config.yaml
tasks:
- name: Deploy
command: "{{deploy_command}}"
args:
chdir: "{{deploy_path}}"

View File

@@ -1,6 +0,0 @@
all:
hosts:
host1:
ansible_host: 192.168.0.1
ansible_user: admin
ansible_ssh_private_key_file: private_key.pem

View File

@@ -1,54 +0,0 @@
- name: Install Docker and run make pull up
hosts: all
become: true
vars_files:
- ./config.yaml
tasks:
- name: Update apt cache
apt:
update_cache: yes
- name: Install required packages
apt:
name:
- apt-transport-https
- ca-certificates
- curl
- gnupg-agent
- software-properties-common
- rsync
- make
state: present
- name: Add Dockers official GPG key
apt_key:
url: https://download.docker.com/linux/ubuntu/gpg
state: present
- name: Add Docker APT repository
apt_repository:
repo: deb [arch=amd64] https://download.docker.com/linux/ubuntu focal stable
state: present
- name: Update apt cache again after adding Docker repo
apt:
update_cache: yes
- name: Install Docker
apt:
name: docker-ce
state: present
- name: Start and enable Docker service
systemd:
name: docker
enabled: yes
state: started
- name: Install Docker Compose
apt:
name: docker-compose-plugin
state: present
- name: Verify Docker is installed
command: docker --version
register: docker_version
- name: Display Docker version
debug:
msg: "Docker version: {{ docker_version.stdout }}"
- name: Clone repo
ansible.builtin.git:
repo: "{{github_repo}}"
dest: "{{deploy_path}}"

View File

@@ -0,0 +1,6 @@
from typing_extensions import TypedDict
class Action(TypedDict):
    """Typed dictionary with two string fields: ``type`` and ``url``."""

    # presumably the kind of action to perform; verify against callers
    type: str
    # presumably the URL the action applies to; verify against callers
    url: str

View File

@@ -0,0 +1,94 @@
import random
from typing import Any
from camoufox import AsyncCamoufox
from playwright.async_api import Page
from api.backend.ai.agent.utils import (
capture_elements,
convert_to_markdown,
parse_response,
)
from api.backend.ai.clients import ask_open_ai, ask_ollama, open_ai_key
from api.backend.ai.agent.prompts import (
ELEMENT_EXTRACTION_PROMPT,
EXTRACT_ELEMENTS_PROMPT,
)
from api.backend.job.scraping.collect_media import collect_media
from api.backend.worker.logger import LOG
from api.backend.job.scraping.add_custom import add_custom_items
from api.backend.models import CapturedElement
# Choose the AI backend once, when this module is imported: use OpenAI when
# `open_ai_key` is truthy, otherwise fall back to Ollama.
ask_ai = ask_open_ai if open_ai_key else ask_ollama
async def scrape_with_agent(agent_job: dict[str, Any]):
    """Scrape the job's URL, asking the AI backend which XPaths to capture.

    Reads ``url``, ``id``, ``prompt`` and ``job_options`` from ``agent_job``;
    within ``job_options`` it reads ``proxies``, ``custom_cookies``,
    ``custom_headers`` and ``collect_media``.

    Returns a list with one dict per visited page, mapping the page's final URL
    to a ``{element name: [captured elements]}`` dict.

    Any exception raised while loading or processing the page propagates to the
    caller after the page and browser have been closed.
    """
    LOG.info(f"Starting work for agent job: {agent_job}")

    job_options = agent_job["job_options"]
    visited = set()

    if job_options["proxies"]:
        # NOTE(review): the chosen proxy is only logged; it is never handed to
        # the browser below. Confirm whether it should be applied.
        proxy = random.choice(job_options["proxies"])
        LOG.info(f"Using proxy: {proxy}")

    async with AsyncCamoufox(headless=True) as browser:
        tab: Page = await browser.new_page()

        await add_custom_items(
            agent_job["url"],
            tab,
            job_options["custom_cookies"],
            job_options["custom_headers"],
        )

        try:
            await tab.set_viewport_size({"width": 1920, "height": 1080})
            await tab.goto(agent_job["url"], timeout=60000)

            if job_options["collect_media"]:
                await collect_media(agent_job["id"], tab)

            html_content = await tab.content()

            # Build the full prompt from the page rendered as markdown plus the
            # user's request, then let the model propose XPaths.
            full_prompt = ELEMENT_EXTRACTION_PROMPT.format(
                extraction_prompt=EXTRACT_ELEMENTS_PROMPT,
                webpage=convert_to_markdown(html_content),
                prompt=agent_job["prompt"],
            )
            response = await ask_ai(full_prompt)

            captured_elements = await capture_elements(tab, parse_response(response))

            # Record the URL the browser ended on, not the one requested.
            visited.add((html_content, tab.url))
        finally:
            await tab.close()
            await browser.close()

    # Group captured elements by name; the outer loop runs once per visited page.
    name_to_elements = {}
    for _ in visited:
        for element in captured_elements:
            name_to_elements.setdefault(element.name, []).append(element)

    scraped_elements: list[dict[str, dict[str, list[CapturedElement]]]] = [
        {final_url: name_to_elements} for _html, final_url in visited
    ]

    return scraped_elements

View File

@@ -0,0 +1,58 @@
# Role/system text for the XPath-extraction agent: tells the model that the page
# arrives as markdown annotated with XPath comments and that it must answer
# inside <xpaths>...</xpaths> tags.
# Presumably substituted into the `{extraction_prompt}` placeholder of
# ELEMENT_EXTRACTION_PROMPT; verify against the caller.
EXTRACT_ELEMENTS_PROMPT = """
You are an assistant that extracts XPath expressions from webpages.
You will receive HTML content in markdown format.
Each element in the markdown has their xpath shown above them in a path like:
<!-- //div -->
Respond only with a list of general XPath expressions inside `<xpaths>...</xpaths>` tags.
You will also decide the decision of what to do next. If there is no decision available, return nothing for that section.
"""
# Full prompt template for the XPath-extraction agent. Render it with
# str.format(), supplying the three placeholders `extraction_prompt`, `webpage`
# and `prompt`; the template contains no other braces.
#
# The example XPaths use `contains(text(), ...)`: `@text` would test an
# attribute named "text" rather than the element's text content, so examples
# written that way teach the model expressions that match nothing.
#
# NOTE(review): the <important> section asks for both the MOST GENERAL and the
# MOST SPECIFIC XPaths; confirm which one is intended.
ELEMENT_EXTRACTION_PROMPT = """
{extraction_prompt}
**Guidelines:**
- Prefer shorter, more general XPaths like `//div[...]` or `//span[...]`.
- Avoid overly specific or deep paths like `//div[3]/ul/li[2]/a`.
- Do **not** chain multiple elements deeply (e.g., `//div/span/a`).
- Use XPaths further down the tree when possible.
- Do not include any extra explanation or text.
- One XPath is acceptable if that's all that's needed.
- Try and limit it down to 1 - 3 xpaths.
- Include a name for each xpath.
<important>
- USE THE MOST SIMPLE XPATHS POSSIBLE.
- USE THE MOST GENERAL XPATHS POSSIBLE.
- USE THE MOST SPECIFIC XPATHS POSSIBLE.
- USE THE MOST GENERAL XPATHS POSSIBLE.
</important>
**Example Format:**
```xml
<xpaths>
- <name: insert_name_here>: <xpath: //div>
- <name: insert_name_here>: <xpath: //span>
- <name: insert_name_here>: <xpath: //span[contains(text(), 'example')]>
- <name: insert_name_here>: <xpath: //div[contains(text(), 'example')]>
- <name: insert_name_here>: <xpath: //a[@href]>
- etc
</xpaths>
<decision>
<next_page>
- //a[@href='next_page_url']
</next_page>
</decision>
```
**Input webpage:**
{webpage}
**Target content:**
{prompt}
"""

View File

@@ -0,0 +1,252 @@
from lxml import html, etree
import re
from playwright.async_api import Page
from api.backend.models import CapturedElement
from api.backend.job.scraping.scraping_utils import clean_format_characters
def convert_to_markdown(html_str: str) -> str:
    """Render the content-bearing elements of an HTML document as annotated markdown.

    Each emitted element is preceded by an HTML comment holding its XPath
    (from lxml's ``getpath``). Hidden, layout and decorative elements are
    dropped. Ordinary elements come first; buttons, links and inputs are
    appended after them.

    Args:
        html_str: Raw HTML source of the page.

    Returns:
        The markdown text, with entries separated by blank lines.
    """
    parser = html.HTMLParser()
    tree = html.fromstring(html_str, parser=parser)
    root = tree.getroottree()

    def format_attributes(el: etree._Element) -> str:
        """Convert element attributes into a string."""
        return " ".join(f'{k}="{v}"' for k, v in el.attrib.items())

    def is_visible(el: etree._Element) -> bool:
        """Heuristically decide whether the element would be shown to a user."""
        style = el.attrib.get("style", "").lower()
        class_ = el.attrib.get("class", "").lower()

        # Check for visibility styles
        if "display: none" in style or "visibility: hidden" in style:
            return False
        if "opacity: 0" in style or "opacity:0" in style:
            return False
        if "height: 0" in style or "width: 0" in style:
            return False

        # Check for common hidden classes
        if any(
            hidden in class_
            for hidden in ("hidden", "invisible", "truncate", "collapse")
        ):
            return False

        # Check for hidden attributes
        if el.attrib.get("hidden") is not None:
            return False
        if el.attrib.get("aria-hidden") == "true":
            return False

        # Elements with neither text nor children carry no content. Void
        # elements are exempt: an <input> never has text or children, so
        # without this exemption the input branch below could never run.
        if el.tag.lower() not in ("input", "img"):
            if not el.text and len(el) == 0:
                return False

        return True

    def is_layout_or_decorative(el: etree._Element) -> bool:
        """Tell whether the element is page scaffolding rather than content."""
        tag = el.tag.lower()

        # Layout elements
        if tag in {"nav", "footer", "header", "aside", "main", "section"}:
            return True

        # Decorative elements
        if tag in {"svg", "path", "circle", "rect", "line", "polygon", "polyline"}:
            return True

        # Check id and class for layout/decorative keywords
        id_class = " ".join(
            [el.attrib.get("id", ""), el.attrib.get("class", "")]
        ).lower()

        layout_keywords = {
            "sidebar",
            "nav",
            "header",
            "footer",
            "menu",
            "advert",
            "ads",
            "breadcrumb",
            "container",
            "wrapper",
            "layout",
            "grid",
            "flex",
            "row",
            "column",
            "section",
            "banner",
            "hero",
            "card",
            "modal",
            "popup",
            "tooltip",
            "dropdown",
            "overlay",
        }

        return any(keyword in id_class for keyword in layout_keywords)

    # Tags that are allowed into the final markdown output; all others are skipped.
    included_tags = {
        "div",
        "span",
        "a",
        "p",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "img",
        "button",
        "input",
        "textarea",
        "ul",
        "ol",
        "li",
        "table",
        "tr",
        "td",
        "th",
        "select",
        "option",
        "optgroup",
        "fieldset",
        "legend",
    }

    special_elements: list[str] = []
    normal_elements: list[str] = []

    for el in tree.iter():
        # Comments, processing instructions and entities expose a callable
        # ``tag`` rather than a string; they are not elements, so skip them.
        if not isinstance(el.tag, str):
            continue

        tag = el.tag.lower()

        if tag not in included_tags:
            continue

        if not is_visible(el):
            continue

        if is_layout_or_decorative(el):
            continue

        path = root.getpath(el)
        attrs = format_attributes(el)
        attrs_str = f" {attrs}" if attrs else ""
        text = el.text.strip() if el.text else ""

        if not text and not attrs:
            continue

        # Interactive elements are collected separately and listed last.
        if tag == "button":
            prefix = "🔘 **<button>**"
            special_elements.append(f"<!-- {path} -->\n{prefix} {text}")

        elif tag == "a":
            href = el.attrib.get("href", "")
            prefix = f"🔗 **<a href='{href}'>**"
            special_elements.append(f"<!-- {path} -->\n{prefix} {text}")

        elif tag == "input":
            input_type = el.attrib.get("type", "text")
            prefix = f"📝 **<input type='{input_type}'>**"
            special_elements.append(f"<!-- {path} -->\n{prefix}")

        else:
            prefix = f"**<{tag}{attrs_str}>**"

            # Ordinary elements are only worth showing when they have text.
            if text:
                normal_elements.append(f"<!-- {path} -->\n{prefix} {text}")

    return "\n\n".join(normal_elements + special_elements)
def parse_response(text: str) -> list[dict[str, str]]:
    """Extract the named XPaths from the first ``<xpaths>...</xpaths>`` section.

    Lines starting with ``-`` are expected to look like
    ``- <name: NAME>: <xpath: XPATH>``. Any other non-empty line is taken
    verbatim as an unnamed XPath.

    Args:
        text: Raw model response.

    Returns:
        A list of ``{"name": ..., "xpath": ...}`` dicts, empty when the
        response has no ``<xpaths>`` section.
    """
    sections = re.findall(r"<xpaths>(.*?)</xpaths>", text, re.DOTALL)
    results: list[dict[str, str]] = []

    if not sections:
        return results

    for raw_line in sections[0].strip().splitlines():
        line = raw_line.strip()

        # Blank lines would otherwise become entries with an empty XPath.
        if not line:
            continue

        if line.startswith("-"):
            xpath_match = re.search(r"<xpath: (.*?)>", line)

            # Filler bullets such as "- etc" carry no XPath; skip them
            # instead of failing on a missing match.
            if xpath_match is None:
                continue

            name_match = re.search(r"<name: (.*?)>", line)
            results.append(
                {
                    "name": name_match.group(1) if name_match else "",
                    "xpath": xpath_match.group(1),
                }
            )
        else:
            results.append({"name": "", "xpath": line})

    return results
def parse_next_page(text: str) -> str | None:
next_page = re.findall(r"<next_page>(.*?)</next_page>", text, re.DOTALL)
if next_page:
lines = next_page[0].strip().splitlines()
next_page = [
line.strip().lstrip("-").strip()
for line in lines
if line.strip().startswith("-")
]
return next_page[0] if next_page else None
async def capture_elements(
    page: Page, xpaths: list[dict[str, str]]
) -> list[CapturedElement]:
    """Collect the text (and link target, if any) of every element matched by the XPaths.

    Each entry of ``xpaths`` supplies an ``"xpath"`` and a ``"name"``. Elements
    whose cleaned text was already captured are skipped. An error while
    processing one XPath is printed and the remaining XPaths are still processed.
    """
    results = []
    already_seen = set()

    for entry in xpaths:
        try:
            matches = page.locator(f"xpath={entry['xpath']}")
            total = await matches.count()

            for index in range(total):
                handle = await matches.nth(index).element_handle()
                if not handle:
                    continue

                href = await handle.get_attribute("href") or ""
                content = await handle.text_content()

                # Text first, then the link target in parentheses.
                combined = content if content else ""
                if href:
                    combined += f" ({href})"

                cleaned = clean_format_characters(combined)
                if cleaned in already_seen:
                    continue
                already_seen.add(cleaned)

                captured = CapturedElement(
                    name=entry["name"],
                    text=cleaned,
                    xpath=entry["xpath"],
                )
                results.append(captured)

        except Exception as e:
            print(f"Error processing xpath {entry}: {e}")

    return results

View File

@@ -1,32 +1,29 @@
# STL
import os
import logging
from collections.abc import Iterable, AsyncGenerator
# PDM
from openai import OpenAI
from fastapi import APIRouter
from fastapi.responses import JSONResponse, StreamingResponse
from openai.types.chat import ChatCompletionMessageParam
# LOCAL
from ollama import Message, AsyncClient
from ollama import Message
from api.backend.models import AI
from api.backend.ai.clients import (
llama_client,
llama_model,
openai_client,
open_ai_model,
open_ai_key,
)
LOG = logging.getLogger(__name__)

# Router carrying the AI endpoints defined in this module.
ai_router = APIRouter()

# The OpenAI/Ollama keys, model names and client objects are imported from
# api.backend.ai.clients above. They are deliberately not re-created here, so
# there is a single place where the environment is read and clients are built.
async def llama_chat(chat_messages: list[Message]) -> AsyncGenerator[str, None]:
if llama_client and llama_model:
@@ -43,6 +40,14 @@ async def llama_chat(chat_messages: list[Message]) -> AsyncGenerator[str, None]:
async def openai_chat(
chat_messages: Iterable[ChatCompletionMessageParam],
) -> AsyncGenerator[str, None]:
if openai_client and not open_ai_model:
LOG.error("OpenAI model is not set")
yield "An error occurred while processing your request."
if not openai_client:
LOG.error("OpenAI client is not set")
yield "An error occurred while processing your request."
if openai_client and open_ai_model:
try:
response = openai_client.chat.completions.create(
@@ -67,4 +72,4 @@ async def ai(c: AI):
@ai_router.get("/ai/check")
async def check():
    """Report whether an AI backend is configured.

    Returns a JSON object ``{"ai_enabled": bool}`` that is true when either an
    OpenAI key or an Ollama model name is set.
    """
    return JSONResponse(content={"ai_enabled": bool(open_ai_key or llama_model)})

38
api/backend/ai/clients.py Normal file
View File

@@ -0,0 +1,38 @@
import os
from openai import OpenAI
from ollama import AsyncClient
# Settings read from the process environment; each is None when unset.
open_ai_key = os.environ.get("OPENAI_KEY")
open_ai_model = os.environ.get("OPENAI_MODEL")
llama_url = os.environ.get("OLLAMA_URL")
llama_model = os.environ.get("OLLAMA_MODEL")

# A client is only built when its key/URL is configured; otherwise it stays None.
openai_client = None
if open_ai_key:
    openai_client = OpenAI(api_key=open_ai_key)

llama_client = None
if llama_url:
    llama_client = AsyncClient(host=llama_url)
async def ask_open_ai(prompt: str) -> str:
    """Send a single user prompt to OpenAI and return the reply text.

    Args:
        prompt: The user message.

    Returns:
        The content of the first choice, or ``""`` when it has no content.

    Raises:
        ValueError: If the OpenAI client was not initialised.
    """
    import asyncio

    if not openai_client:
        raise ValueError("OpenAI client not initialized")

    # The client is synchronous; run the request in a worker thread so this
    # coroutine does not block the event loop while waiting for the reply.
    response = await asyncio.to_thread(
        openai_client.chat.completions.create,
        model=open_ai_model or "gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content or ""
async def ask_ollama(prompt: str) -> str:
    """Send a single user prompt to Ollama and return the reply text.

    Raises ``ValueError`` when the Ollama client was not initialised. Returns
    ``""`` when the reply has no content.
    """
    if llama_client is None or not llama_client:
        raise ValueError("Ollama client not initialized")

    conversation = [{"role": "user", "content": prompt}]
    model_name = llama_model or ""
    reply = await llama_client.chat(model=model_name, messages=conversation)

    return reply.message.content or ""

View File

@@ -1,9 +1,14 @@
# STL
import os
import logging
import apscheduler # type: ignore
from contextlib import asynccontextmanager
# PDM
from fastapi import FastAPI
import apscheduler.schedulers
import apscheduler.schedulers.background
from fastapi import FastAPI, Request, status
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
# LOCAL
@@ -11,8 +16,12 @@ from api.backend.ai.ai_router import ai_router
from api.backend.auth.auth_router import auth_router
from api.backend.utils import get_log_level
from api.backend.routers.job_router import job_router
from api.backend.routers.log_router import log_router
from api.backend.routers.stats_router import stats_router
from api.backend.database.startup import init_database
from fastapi.responses import JSONResponse
from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler
from api.backend.scheduler import scheduler
log_level = os.getenv("LOG_LEVEL")
LOG_LEVEL = get_log_level(log_level)
@@ -25,7 +34,30 @@ logging.basicConfig(
LOG = logging.getLogger(__name__)
app = FastAPI(title="api")
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: set up the database and scheduler, then tear them down.

    Startup initialises the database, registers the stored cron jobs and starts
    the scheduler. Shutdown stops the scheduler without waiting for running jobs.
    """
    # Startup
    LOG.info("Starting application...")

    init_database()

    LOG.info("Starting cron scheduler...")
    start_cron_scheduler(scheduler)
    scheduler.start()
    LOG.info("Cron scheduler started successfully")

    try:
        yield
    finally:
        # Shutdown runs in ``finally`` so the scheduler is also stopped when an
        # exception is raised into the generator at the ``yield``.
        LOG.info("Shutting down application...")
        LOG.info("Stopping cron scheduler...")
        scheduler.shutdown(wait=False)  # Set wait=False to not block shutdown
        LOG.info("Cron scheduler stopped")
        LOG.info("Application shutdown complete")
app = FastAPI(title="api", root_path="/api", lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
@@ -35,9 +67,17 @@ app.add_middleware(
allow_headers=["*"],
)
app.include_router(auth_router)
app.include_router(ai_router)
app.include_router(job_router)
app.include_router(log_router)
app.include_router(stats_router)
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
    """Log a request-validation failure and answer with a 422 JSON body.

    The body has the shape ``{"status_code": 10422, "message": ..., "data": None}``
    where ``message`` is the validation error flattened onto one line.
    """
    # Collapse newlines and runs of whitespace so the error fits on a single
    # log line; the previous space-for-space replace changed nothing.
    exc_str = " ".join(f"{exc}".split())
    LOG.error(f"{request}: {exc_str}")

    content = {"status_code": 10422, "message": exc_str, "data": None}
    return JSONResponse(
        content=content, status_code=status.HTTP_422_UNPROCESSABLE_ENTITY
    )

View File

@@ -1,5 +1,6 @@
# STL
from datetime import timedelta
import os
# PDM
from fastapi import Depends, APIRouter, HTTPException, status
@@ -7,7 +8,6 @@ from fastapi.security import OAuth2PasswordRequestForm
# LOCAL
from api.backend.schemas import User, Token, UserCreate
from api.backend.database import get_user_collection
from api.backend.auth.auth_utils import (
ACCESS_TOKEN_EXPIRE_MINUTES,
get_current_user,
@@ -15,9 +15,14 @@ from api.backend.auth.auth_utils import (
get_password_hash,
create_access_token,
)
import logging
from api.backend.database.common import update
auth_router = APIRouter()
LOG = logging.getLogger("auth_router")
@auth_router.post("/auth/token", response_model=Token)
async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
@@ -43,15 +48,26 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(
@auth_router.post("/auth/signup", response_model=User)
async def create_user(user: UserCreate):
    """Register a new user and return the stored fields.

    The plain password is replaced by its hash before anything is persisted.

    Raises:
        HTTPException: 400 when the row could not be inserted.
    """
    hashed_password = get_password_hash(user.password)
    user_dict = user.model_dump()
    user_dict["hashed_password"] = hashed_password
    del user_dict["password"]

    # Users are stored through the SQL helper only; no second write to a
    # separate user collection.
    query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)"
    inserted = update(
        query, (user_dict["email"], hashed_password, user_dict["full_name"])
    )

    # ``update`` returns the affected row count, so a falsy value means the
    # insert did not take effect and must not be reported as a success.
    if not inserted:
        LOG.error("Failed to create user")
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="User could not be created",
        )

    return user_dict
@auth_router.get("/auth/users/me", response_model=User)
async def read_users_me(current_user: User = Depends(get_current_user)):
    """Return the user resolved by the ``get_current_user`` dependency."""
    me = current_user
    return me
@auth_router.get("/auth/check")
async def check_auth():
    """Expose the feature flags read from the environment.

    Returns a dict with ``registration`` and ``recordings_enabled`` booleans.
    Both flags default to enabled and are compared case-insensitively, so
    ``true``/``True``/``TRUE`` all enable them.
    """

    def flag_enabled(name: str) -> bool:
        """Read a boolean environment flag that defaults to enabled."""
        return os.environ.get(name, "true").lower() == "true"

    return {
        "registration": flag_enabled("REGISTRATION_ENABLED"),
        "recordings_enabled": flag_enabled("RECORDINGS_ENABLED"),
    }

View File

@@ -1,7 +1,5 @@
# STL
import os
from gc import disable
from queue import Empty
from typing import Any, Optional
from datetime import datetime, timedelta
import logging
@@ -15,15 +13,16 @@ from fastapi.security import OAuth2PasswordBearer
# LOCAL
from api.backend.schemas import User, UserInDB, TokenData
from api.backend.database import get_user_collection
from api.backend.database.common import query
LOG = logging.getLogger(__name__)
_ = load_dotenv()
# Token settings, each overridable through the environment.
# NOTE(review): "secret" is only a fallback signing key; set SECRET_KEY in any
# real deployment.
SECRET_KEY = os.getenv("SECRET_KEY") or "secret"
ALGORITHM = os.getenv("ALGORITHM") or "HS256"
# Always an int: environment values arrive as strings, the fallback is 600.
ACCESS_TOKEN_EXPIRE_MINUTES = int(os.getenv("ACCESS_TOKEN_EXPIRE_MINUTES") or 600)
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="auth/token")
@@ -40,8 +39,8 @@ def get_password_hash(password: str):
async def get_user(email: str):
user_collection = get_user_collection()
user = await user_collection.find_one({"email": email})
user_query = "SELECT * FROM users WHERE email = ?"
user = query(user_query, (email,))[0]
if not user:
return
@@ -77,27 +76,42 @@ def create_access_token(
async def get_current_user(token: str = Depends(oauth2_scheme)):
LOG.info(f"Getting current user with token: {token}")
LOG.debug(f"Getting current user with token: {token}")
if not token:
LOG.debug("No token provided")
return EMPTY_USER
if len(token.split(".")) != 3:
LOG.error(f"Malformed token: {token}")
return EMPTY_USER
try:
LOG.debug(
f"Decoding token: {token} with secret key: {SECRET_KEY} and algorithm: {ALGORITHM}"
)
if token.startswith("Bearer "):
token = token.split(" ")[1]
payload: Optional[dict[str, Any]] = jwt.decode(
token, SECRET_KEY, algorithms=[ALGORITHM]
)
if not payload:
LOG.error("No payload found in token")
return EMPTY_USER
email = payload.get("sub")
if email is None:
LOG.error("No email found in payload")
return EMPTY_USER
token_data = TokenData(email=email)
except JWTError:
except JWTError as e:
LOG.error(f"JWTError occurred: {e}")
return EMPTY_USER
except Exception as e:
@@ -105,7 +119,6 @@ async def get_current_user(token: str = Depends(oauth2_scheme)):
return EMPTY_USER
user = await get_user(email=token_data.email)
if user is None:
return EMPTY_USER

16
api/backend/constants.py Normal file
View File

@@ -0,0 +1,16 @@
from pathlib import Path
import os
# Location of the database file.
DATABASE_PATH = "data/database.db"

# Media root and the recordings folder beneath it.
MEDIA_DIR = Path("media")
RECORDINGS_DIR = MEDIA_DIR / "recordings"

# Recordings are on unless RECORDINGS_ENABLED is set to something other than "true".
RECORDINGS_ENABLED = os.environ.get("RECORDINGS_ENABLED", "true").lower() == "true"

# Media categories.
MEDIA_TYPES = [
    "audio",
    "documents",
    "images",
    "pdfs",
    "presentations",
    "spreadsheets",
    "videos",
]

View File

@@ -1,23 +0,0 @@
# STL
import os
from typing import Any
# PDM
from dotenv import load_dotenv
from motor.motor_asyncio import AsyncIOMotorClient
_ = load_dotenv()
MONGODB_URI = os.getenv("MONGODB_URI")
def get_user_collection():
    """Return the ``users`` collection of the ``scrape`` database, using a new client."""
    mongo: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(MONGODB_URI)
    return mongo["scrape"]["users"]
def get_job_collection():
    """Return the ``jobs`` collection of the ``scrape`` database, using a new client."""
    mongo: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(MONGODB_URI)
    return mongo["scrape"]["jobs"]

View File

@@ -0,0 +1,3 @@
from .common import insert, QUERIES, update
__all__ = ["insert", "QUERIES", "update"]

View File

@@ -0,0 +1,92 @@
import sqlite3
from typing import Any, Optional
from api.backend.constants import DATABASE_PATH
from api.backend.utils import format_json, format_sql_row_to_python
from api.backend.database.schema import INIT_QUERY
from api.backend.database.queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
import logging
LOG = logging.getLogger(__name__)
def connect():
    """Open the database and return a cursor on it.

    Every statement run on the connection is echoed through ``print``.
    """
    db = sqlite3.connect(DATABASE_PATH)
    db.set_trace_callback(print)
    return db.cursor()
def insert(query: str, values: tuple[Any, ...]):
    """Execute a write statement with ``values`` and commit it.

    The values are copied into a list and passed through ``format_json``
    before execution. SQLite errors are logged rather than raised.
    """
    db = sqlite3.connect(DATABASE_PATH)
    cur = db.cursor()

    params = [*values]
    format_json(params)

    try:
        cur.execute(query, params)
        db.commit()
    except sqlite3.Error as e:
        LOG.error(f"An error occurred: {e}")
    finally:
        cur.close()
        db.close()
def query(query: str, values: Optional[tuple[Any, ...]] = None):
    """Run a read statement and return its rows as formatted dicts.

    Each row is converted to a plain ``dict`` and passed through
    ``format_sql_row_to_python``. The connection is always closed.
    """
    db = sqlite3.connect(DATABASE_PATH)
    db.row_factory = sqlite3.Row
    cur = db.cursor()

    fetched = []
    try:
        # Bind parameters only when some were supplied.
        if values:
            cur.execute(query, values)
        else:
            cur.execute(query)
        fetched = cur.fetchall()
    finally:
        cur.close()
        db.close()

    result: list[dict[str, Any]] = [
        format_sql_row_to_python(dict(record)) for record in fetched
    ]
    return result
def update(query: str, values: Optional[tuple[Any, ...]] = None):
    """Execute a write statement, commit it and return the affected row count.

    When values are given they are copied into a list and passed through
    ``format_json`` first. On an SQLite error the error is logged and ``0``
    is returned.
    """
    db = sqlite3.connect(DATABASE_PATH)
    cur = db.cursor()

    params = None
    if values:
        params = list(values)
        format_json(params)

    try:
        outcome = cur.execute(query, params) if params else cur.execute(query)
        db.commit()
        return outcome.rowcount
    except sqlite3.Error as e:
        LOG.error(f"An error occurred: {e}")
    finally:
        cur.close()
        db.close()

    # Only reached after a logged error.
    return 0
# Registry of the predefined SQL statements, looked up by short name.
QUERIES = dict(
    init=INIT_QUERY,
    insert_job=JOB_INSERT_QUERY,
    delete_job=DELETE_JOB_QUERY,
)

View File

@@ -0,0 +1,3 @@
from .queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
__all__ = ["JOB_INSERT_QUERY", "DELETE_JOB_QUERY"]

View File

@@ -0,0 +1,9 @@
# Parameterised INSERT for one row of the jobs table: the eleven "?"
# placeholders correspond, in order, to the eleven listed columns.
JOB_INSERT_QUERY = """
INSERT INTO jobs
(id, url, elements, user, time_created, result, status, chat, job_options, agent_mode, prompt)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
# DELETE on the jobs table keyed by id.
# NOTE(review): the IN () list is empty as written, so this names no ids —
# presumably a template the caller completes; verify before use.
DELETE_JOB_QUERY = """
DELETE FROM jobs WHERE id IN ()
"""

View File

@@ -0,0 +1,3 @@
from .schema import INIT_QUERY
__all__ = ["INIT_QUERY"]

View File

@@ -0,0 +1,33 @@
# Schema bootstrap script: one string holding several ";"-terminated statements.
# The three CREATE TABLE statements use IF NOT EXISTS (jobs, users, cron_jobs);
# the two trailing ALTER TABLE statements add agent_mode and prompt to jobs.
# NOTE(review): columns are declared STRING; under SQLite's type-affinity rules
# that presumably gives NUMERIC rather than TEXT affinity — confirm intended.
INIT_QUERY = """
CREATE TABLE IF NOT EXISTS jobs (
id STRING PRIMARY KEY NOT NULL,
url STRING NOT NULL,
elements JSON NOT NULL,
user STRING,
time_created DATETIME NOT NULL,
result JSON NOT NULL,
status STRING NOT NULL,
chat JSON,
job_options JSON
);
CREATE TABLE IF NOT EXISTS users (
email STRING PRIMARY KEY NOT NULL,
hashed_password STRING NOT NULL,
full_name STRING,
disabled BOOLEAN
);
CREATE TABLE IF NOT EXISTS cron_jobs (
id STRING PRIMARY KEY NOT NULL,
user_email STRING NOT NULL,
job_id STRING NOT NULL,
cron_expression STRING NOT NULL,
time_created DATETIME NOT NULL,
time_updated DATETIME NOT NULL,
FOREIGN KEY (job_id) REFERENCES jobs(id)
);
ALTER TABLE jobs ADD COLUMN agent_mode BOOLEAN NOT NULL DEFAULT FALSE;
ALTER TABLE jobs ADD COLUMN prompt STRING;
"""

View File

@@ -0,0 +1,55 @@
import os
from api.backend.database.common import connect, QUERIES, insert
import logging
import sqlite3
from api.backend.auth.auth_utils import get_password_hash
LOG = logging.getLogger(__name__)
def init_database():
    """Create the schema and, when registration is disabled, the default user.

    The init script is split on ``;`` and run statement by statement.
    "duplicate column name" errors are tolerated so the ``ALTER TABLE``
    statements can be re-run on every start; any other operational error is
    re-raised.

    When ``REGISTRATION_ENABLED`` is ``false`` the default user described by
    ``DEFAULT_USER_EMAIL``, ``DEFAULT_USER_PASSWORD`` and
    ``DEFAULT_USER_FULL_NAME`` is inserted.

    Raises:
        SystemExit: With status 1 when registration is disabled and one of
            the default-user variables is missing.
        sqlite3.OperationalError: When a schema statement fails for any
            reason other than a duplicate column.
    """
    cursor = connect()

    try:
        for statement in QUERIES["init"].strip().split(";"):
            statement = statement.strip()

            if not statement:
                continue

            try:
                LOG.info(f"Executing query: {statement}")
                _ = cursor.execute(statement)

            except sqlite3.OperationalError as e:
                if "duplicate column name" in str(e).lower():
                    LOG.warning(f"Skipping duplicate column error: {e}")
                    continue

                LOG.error(f"Error executing query: {statement}")
                raise

        cursor.connection.commit()

        if os.environ.get("REGISTRATION_ENABLED", "true").lower() == "false":
            default_user_email = os.environ.get("DEFAULT_USER_EMAIL")
            default_user_password = os.environ.get("DEFAULT_USER_PASSWORD")
            default_user_full_name = os.environ.get("DEFAULT_USER_FULL_NAME")

            if (
                not default_user_email
                or not default_user_password
                or not default_user_full_name
            ):
                LOG.error(
                    "DEFAULT_USER_EMAIL, DEFAULT_USER_PASSWORD, or DEFAULT_USER_FULL_NAME is not set!"
                )
                # Same effect as exit(1), without relying on the site builtin.
                raise SystemExit(1)

            query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)"
            _ = insert(
                query,
                (
                    default_user_email,
                    get_password_hash(default_user_password),
                    default_user_full_name,
                ),
            )
    finally:
        # Release the cursor and its connection on every path, including
        # errors and the SystemExit above.
        connection = cursor.connection
        cursor.close()
        connection.close()

View File

@@ -1,119 +0,0 @@
# STL
import logging
from typing import Any, Optional
# PDM
from pymongo import DESCENDING
# LOCAL
from api.backend.models import FetchOptions
from api.backend.database import get_job_collection
LOG = logging.getLogger(__name__)
async def insert(item: dict[str, Any]) -> None:
    """Insert ``item`` into the job collection and log the driver result."""
    outcome = await get_job_collection().insert_one(item)
    LOG.info(f"Inserted item: {outcome}")
async def get_queued_job():
    """Return one job with status ``"Queued"``, sorted by ``created_at`` descending."""
    jobs = get_job_collection()
    ordering = [("created_at", DESCENDING)]
    queued = await jobs.find_one({"status": "Queued"}, sort=ordering)
    return queued
async def query(
    filter: dict[str, Any], fetch_options: Optional[FetchOptions] = None
) -> list[dict[str, Any]]:
    """Return the job documents matching ``filter``.

    The ``_id`` field is removed from every document. The ``chat`` field is
    removed too when ``fetch_options`` is given with a falsy ``chat`` flag.
    """
    found: list[dict[str, Any]] = []

    async for doc in get_job_collection().find(filter):
        doc.pop("_id")

        drop_chat = fetch_options and not fetch_options.chat
        if drop_chat and doc.get("chat"):
            doc.pop("chat")

        found.append(doc)

    return found
async def update_job(ids: list[str], field: str, value: Any):
    """Set ``field`` to ``value`` on every job whose ``id`` is in ``ids``, one at a time."""
    jobs = get_job_collection()

    for job_id in ids:
        selector = {"id": job_id}
        change = {"$set": {field: value}}
        await jobs.update_one(selector, change)
async def delete_jobs(jobs: list[str]):
    """Delete the jobs with the given ids; return whether any document was removed."""
    outcome = await get_job_collection().delete_many({"id": {"$in": jobs}})
    LOG.info(f"{outcome.deleted_count} documents deleted")

    return outcome.deleted_count > 0
async def average_elements_per_link(user: str):
    """Per day, the average element count and number of the user's completed jobs.

    Returns dicts with ``date``, ``average_elements`` and ``count`` keys,
    sorted by date ascending.
    """
    match_stage = {"$match": {"status": "Completed", "user": user}}
    project_stage = {
        "$project": {
            "date": {
                "$dateToString": {"format": "%Y-%m-%d", "date": "$time_created"}
            },
            "num_elements": {"$size": "$elements"},
        }
    }
    group_stage = {
        "$group": {
            "_id": "$date",
            "average_elements": {"$avg": "$num_elements"},
            "count": {"$sum": 1},
        }
    }
    sort_stage = {"$sort": {"_id": 1}}

    cursor = get_job_collection().aggregate(
        [match_stage, project_stage, group_stage, sort_stage]
    )

    summary: list[dict[str, Any]] = [
        {
            "date": doc["_id"],
            "average_elements": doc["average_elements"],
            "count": doc["count"],
        }
        async for doc in cursor
    ]
    return summary
async def get_jobs_per_day(user: str):
    """Per day, the number of the user's completed jobs.

    Returns dicts with ``date`` and ``job_count`` keys, sorted by date ascending.
    """
    match_stage = {"$match": {"status": "Completed", "user": user}}
    project_stage = {
        "$project": {
            "date": {
                "$dateToString": {"format": "%Y-%m-%d", "date": "$time_created"}
            }
        }
    }
    group_stage = {"$group": {"_id": "$date", "job_count": {"$sum": 1}}}
    sort_stage = {"$sort": {"_id": 1}}

    cursor = get_job_collection().aggregate(
        [match_stage, project_stage, group_stage, sort_stage]
    )

    daily: list[dict[str, Any]] = [
        {"date": doc["_id"], "job_count": doc["job_count"]} async for doc in cursor
    ]
    return daily

View File

@@ -0,0 +1,17 @@
from .job import (
insert,
update_job,
delete_jobs,
get_jobs_per_day,
get_queued_job,
average_elements_per_link,
)
__all__ = [
"insert",
"update_job",
"delete_jobs",
"get_jobs_per_day",
"get_queued_job",
"average_elements_per_link",
]

View File

@@ -0,0 +1,100 @@
import datetime
from typing import Any
import uuid
from api.backend.database.common import insert, query
from api.backend.models import CronJob
from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
from apscheduler.triggers.cron import CronTrigger # type: ignore
from api.backend.job import insert as insert_job
import logging
LOG = logging.getLogger("Cron Scheduler")
def insert_cron_job(cron_job: CronJob):
    """Store ``cron_job`` in the ``cron_jobs`` table; always returns ``True``."""
    statement = """
INSERT INTO cron_jobs (id, user_email, job_id, cron_expression, time_created, time_updated)
VALUES (?, ?, ?, ?, ?, ?)
"""
    # Same order as the column list in the statement.
    row = (
        cron_job.id,
        cron_job.user_email,
        cron_job.job_id,
        cron_job.cron_expression,
        cron_job.time_created,
        cron_job.time_updated,
    )

    insert(statement, row)
    return True
def delete_cron_job(id: str, user_email: str):
    """Delete the cron job with this id owned by ``user_email``; always returns ``True``."""
    statement = """
DELETE FROM cron_jobs
WHERE id = ? AND user_email = ?
"""
    # The generic write helper is reused to run the DELETE.
    insert(statement, (id, user_email))
    return True
def get_cron_jobs(user_email: str):
    """Return all cron job rows belonging to ``user_email``."""
    owner = (user_email,)
    return query("SELECT * FROM cron_jobs WHERE user_email = ?", owner)
def get_all_cron_jobs():
    """Return every row of the ``cron_jobs`` table."""
    return query("SELECT * FROM cron_jobs")
def insert_job_from_cron_job(job: dict[str, Any]):
    """Queue a fresh copy of ``job``.

    The copy gets a new id, status ``"Queued"``, an empty result, no chat and
    current timestamps; all other fields are taken from ``job``.
    """
    overrides = {
        "id": uuid.uuid4().hex,
        "status": "Queued",
        "result": "",
        "chat": None,
        "time_created": datetime.datetime.now(),
        "time_updated": datetime.datetime.now(),
    }

    insert_job({**job, **overrides})
def get_cron_job_trigger(cron_expression: str):
    """Build a ``CronTrigger`` from a five-field cron expression.

    The whitespace-separated fields are minute, hour, day, month and day of
    week. Any other field count prints a message and yields ``None``.
    """
    fields = cron_expression.split()

    if len(fields) == 5:
        field_names = ("minute", "hour", "day", "month", "day_of_week")
        return CronTrigger(**dict(zip(field_names, fields)))

    print(f"Invalid cron expression: {cron_expression}")
    return None
def start_cron_scheduler(scheduler: BackgroundScheduler):
    """Register every stored cron job with ``scheduler``.

    Each cron row is resolved to its job row, which is handed to
    ``insert_job_from_cron_job`` whenever the trigger fires. Rows whose job no
    longer exists or whose cron expression is invalid are skipped with a
    warning, so one bad row cannot stop the remaining jobs from being scheduled.

    Args:
        scheduler: The scheduler to add the jobs to. It is not started here.
    """
    cron_jobs = get_all_cron_jobs()

    LOG.info(f"Cron jobs: {cron_jobs}")

    for job in cron_jobs:
        queried_job = query("SELECT * FROM jobs WHERE id = ?", (job["job_id"],))

        # The referenced job may be gone; indexing the empty result would raise.
        if not queried_job:
            LOG.warning(
                f"Skipping cron job {job['id']}: job {job['job_id']} not found"
            )
            continue

        # Never hand a missing trigger to add_job; that would not be a cron schedule.
        trigger = get_cron_job_trigger(job["cron_expression"])
        if trigger is None:
            LOG.warning(
                f"Skipping cron job {job['id']}: invalid cron expression "
                f"{job['cron_expression']}"
            )
            continue

        LOG.info(f"Adding job: {queried_job}")

        scheduler.add_job(
            insert_job_from_cron_job,
            trigger,
            id=job["id"],
            args=[queried_job[0]],
        )

99
api/backend/job/job.py Normal file
View File

@@ -0,0 +1,99 @@
# STL
import logging
from typing import Any
# LOCAL
from api.backend.utils import format_list_for_query
from api.backend.database.common import (
insert as common_insert,
query as common_query,
QUERIES,
update as common_update,
)
LOG = logging.getLogger(__name__)
def insert(item: dict[str, Any]) -> None:
    """Insert a job row built from ``item`` and log it.

    ``item`` must contain every key listed below; they are bound to the
    insert statement in this order.
    """
    ordered_keys = (
        "id",
        "url",
        "elements",
        "user",
        "time_created",
        "result",
        "status",
        "chat",
        "job_options",
        "agent_mode",
        "prompt",
    )
    row = tuple(item[key] for key in ordered_keys)

    common_insert(QUERIES["insert_job"], row)
    LOG.info(f"Inserted item: {item}")
async def get_queued_job():
    """Return the most recently created job with status ``Queued``, or ``None``."""
    rows = common_query(
        "SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
    )
    LOG.info(f"Got queued job: {rows}")

    if not rows:
        return None
    return rows[0]
async def update_job(ids: list[str], field: str, value: Any):
    """Set one column to ``value`` on every job whose id is in ``ids``.

    Args:
        ids: Ids of the jobs to update; nothing is done when empty.
        field: Column name. It is interpolated into the SQL text, so it must
            be a plain identifier.
        value: New value, bound as a statement parameter.

    Raises:
        ValueError: If ``field`` is not a plain identifier.
    """
    # Column names cannot be bound as parameters, so reject anything that
    # could smuggle extra SQL into the statement.
    if not field.isidentifier():
        raise ValueError(f"Invalid job field name: {field!r}")

    if not ids:
        LOG.info("No jobs to update.")
        return

    query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
    res = common_update(query, tuple([value] + ids))
    LOG.info(f"Updated job: {res}")
async def delete_jobs(jobs: list[str]):
    """Delete the jobs with the given ids; return whether any row was removed.

    An empty list is logged and reported as ``False`` without running a statement.
    """
    if len(jobs) == 0:
        LOG.info("No jobs to delete.")
        return False

    statement = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}"
    removed = common_update(statement, tuple(jobs))

    return removed > 0
async def average_elements_per_link(user: str):
    """Per day, the average element count and number of the user's completed jobs."""
    # Columns returned: date, average_elements, count — ordered by date ascending.
    statement = """
SELECT
DATE(time_created) AS date,
AVG(json_array_length(elements)) AS average_elements,
COUNT(*) AS count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
    return common_query(statement, (user,))
async def get_jobs_per_day(user: str):
    """Per day, the number of the user's completed jobs."""
    # Columns returned: date, job_count — ordered by date ascending.
    statement = """
SELECT
DATE(time_created) AS date,
COUNT(*) AS job_count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
    return common_query(statement, (user,))

View File

@@ -0,0 +1,3 @@
from .job_options import JobOptions
__all__ = ["JobOptions"]

View File

@@ -0,0 +1,16 @@
from pydantic import BaseModel
from typing import Any, Optional
from api.backend.job.models.site_map import SiteMap
class FetchOptions(BaseModel):
    """Options for fetching jobs."""

    # Optional flag; unset (None) by default.
    chat: Optional[bool] = None
class JobOptions(BaseModel):
    """Per-job settings; every field has a default."""

    # Off unless requested.
    multi_page_scrape: bool = False
    # Header name to value.
    custom_headers: dict[str, Any] = {}
    # Proxy entries as strings.
    proxies: list[str] = []
    # Optional site map; none by default.
    site_map: Optional[SiteMap] = None
    # Off unless requested.
    collect_media: bool = False
    # Cookie dicts.
    custom_cookies: list[dict[str, Any]] = []

View File

@@ -0,0 +1,14 @@
from pydantic import BaseModel
from typing import Literal
class Action(BaseModel):
    """One step of a site map: a click or an input on the element at ``xpath``."""

    # Kind of step.
    type: Literal["click", "input"]
    # XPath of the target element.
    xpath: str
    # Label of the step.
    name: str
    # Text for "input" steps; empty by default.
    input: str = ""
    # True by default.
    do_once: bool = True
class SiteMap(BaseModel):
    """A list of actions."""

    actions: list[Action]

View File

@@ -0,0 +1,48 @@
from typing import Any, Optional
from urllib.parse import urlparse
from playwright.async_api import Page, BrowserContext
import logging
LOG = logging.getLogger(__name__)
async def add_custom_cookies(
    custom_cookies: list[dict[str, Any]],
    url: str,
    context: BrowserContext,
) -> None:
    """Add the given cookies to ``context``, scoped to the host of ``url``.

    Args:
        custom_cookies: Cookie dicts; ``name`` and ``value`` are read and
            fall back to ``default_name`` / ``default_value`` when missing.
        url: URL whose host becomes the cookie domain.
        context: Browser context that receives the cookies.
    """
    parsed_url = urlparse(url)
    # A cookie domain is a bare host name. ``netloc`` would also carry any
    # port or credentials from the URL, so prefer ``hostname`` and fall back
    # to ``netloc`` only when no host could be parsed.
    domain = parsed_url.hostname or parsed_url.netloc

    for cookie in custom_cookies:
        cookie_dict = {
            "name": cookie.get("name", "default_name"),
            "value": cookie.get("value", "default_value"),
            "domain": domain,
            "path": "/",
        }

        LOG.info(f"Adding cookie: {cookie_dict}")
        await context.add_cookies([cookie_dict])  # type: ignore
async def add_custom_headers(
    custom_headers: dict[str, Any],
    page: Page,
) -> None:
    """Set ``custom_headers`` as the page's extra HTTP headers."""
    extra_headers = custom_headers
    await page.set_extra_http_headers(extra_headers)
async def add_custom_items(
    url: str,
    page: Page,
    cookies: Optional[list[dict[str, Any]]] = None,
    headers: Optional[dict[str, Any]] = None,
) -> None:
    """Apply optional cookies and headers to ``page``.

    Cookies are applied before headers; an empty or missing argument is skipped.
    """
    has_cookies = bool(cookies)
    has_headers = bool(headers)

    if has_cookies:
        await add_custom_cookies(cookies, url, page.context)

    if has_headers:
        await add_custom_headers(headers, page)

View File

@@ -0,0 +1,110 @@
import os
from pathlib import Path
import re
from urllib.parse import urljoin, urlparse
from typing import Dict, List
import aiohttp
from playwright.async_api import Page
from api.backend.utils import LOG
async def collect_media(id: str, page: Page) -> dict[str, list[dict[str, str]]]:
    """Download the media referenced by ``page`` into ``media/<type>/<id>/``.

    For each media type the matching elements are looked up, their URL is
    resolved against the page URL, and http(s) targets are streamed to disk.
    A failed download is logged and skipped. A summary of all downloads is
    written to ``media/download_summary.txt``.

    Args:
        id: Identifier used as the per-job sub-directory name.
        page: The loaded page to inspect.

    Returns:
        A mapping from media type to a list of
        ``{"url": ..., "local_path": ...}`` entries for the files saved.
    """
    media_types = {
        "images": "img",
        "videos": "video",
        "audio": "audio",
        "pdfs": 'a[href$=".pdf"], a[href*=".pdf#page="]',
        "documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]',
        "presentations": 'a[href$=".ppt"], a[href$=".pptx"]',
        "spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]',
    }

    # Extension appended when the file name taken from the URL has none.
    default_extensions = {
        "images": ".jpg",
        "videos": ".mp4",
        "audio": ".mp3",
        "pdfs": ".pdf",
        "documents": ".doc",
        "presentations": ".ppt",
        "spreadsheets": ".xls",
    }

    base_dir = Path("media")
    base_dir.mkdir(exist_ok=True)

    media_urls = {}

    async with aiohttp.ClientSession() as session:
        for media_type, selector in media_types.items():
            elements = await page.query_selector_all(selector)
            urls: List[Dict[str, str]] = []

            media_dir = base_dir / media_type
            media_dir.mkdir(exist_ok=True)

            for element in elements:
                if media_type == "images":
                    url = await element.get_attribute("src")
                elif media_type in ("videos", "audio"):
                    # <video> and <audio> carry their source in ``src``;
                    # they have no ``href`` attribute.
                    url = await element.get_attribute(
                        "src"
                    ) or await element.get_attribute("data-src")
                else:
                    url = await element.get_attribute("href")

                if not url:
                    continue

                # Resolve root-relative, path-relative and protocol-relative
                # references against the page URL. Absolute URLs pass through.
                url = urljoin(page.url, url)

                # Skip anything that is not downloadable over http(s).
                if not url.startswith(("http://", "https://")):
                    continue

                try:
                    parsed = urlparse(url)
                    filename = (
                        os.path.basename(parsed.path) or f"{media_type}_{len(urls)}"
                    )

                    if "." not in filename:
                        filename += default_extensions.get(media_type, "")

                    target_dir = media_dir / id
                    target_dir.mkdir(parents=True, exist_ok=True)
                    file_path = target_dir / f"{filename}"

                    async with session.get(url) as response:
                        response.raise_for_status()

                        # Stream to disk in chunks rather than buffering the
                        # whole file in memory.
                        with open(file_path, "wb") as f:
                            while True:
                                chunk = await response.content.read(8192)
                                if not chunk:
                                    break
                                f.write(chunk)

                    urls.append({"url": url, "local_path": str(file_path)})
                    LOG.info(f"Downloaded {filename} to {file_path}")

                except Exception as e:
                    LOG.error(f"Error downloading {url}: {str(e)}")
                    continue

            media_urls[media_type] = urls

    # Write summary
    with open(base_dir / "download_summary.txt", "w") as f:
        for media_type, downloads in media_urls.items():
            if downloads:
                f.write(f"\n=== {media_type.upper()} ===\n")
                for download in downloads:
                    f.write(f"URL: {download['url']}\n")
                    f.write(f"Saved to: {download['local_path']}\n\n")

    return media_urls

View File

@@ -0,0 +1,45 @@
import asyncio
from typing import Set, Tuple
from playwright.async_api import Page
from api.backend.utils import LOG
from api.backend.job.scraping.collect_media import collect_media as collect_media_utils
async def scrape_content(
    id: str, page: Page, pages: Set[Tuple[str, str]], collect_media: bool
) -> str:
    """Scroll the page to the bottom, record its HTML and optionally collect media.

    Scrolling repeats, with a three-second pause each time, until the document
    height stops changing. The ``(html, url)`` pair is added to ``pages`` and
    the HTML is returned.
    """
    known_height = await page.evaluate("document.body.scrollHeight")

    reached_bottom = False
    while not reached_bottom:
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        await asyncio.sleep(3)

        current_height = await page.evaluate("document.body.scrollHeight")
        # An unchanged height means the last scroll loaded nothing new.
        reached_bottom = current_height == known_height
        known_height = current_height

    html = await page.content()
    pages.add((html, page.url))

    if collect_media:
        LOG.info("Collecting media")
        await collect_media_utils(id, page)

    return html
def clean_format_characters(text: str) -> str:
    """Trim ``text`` and replace each formatting/control character with a space.

    Outer whitespace is stripped first. Then every newline, tab, carriage
    return, form feed, vertical tab, backspace and bell character becomes a
    single space.
    """
    cleaned = text.strip()

    for control_char in ("\n", "\t", "\r", "\f", "\v", "\b", "\a"):
        cleaned = cleaned.replace(control_char, " ")

    return cleaned

View File

View File

@@ -0,0 +1,77 @@
import logging
import asyncio
from copy import deepcopy
from typing import Any
from playwright.async_api import Page
from api.backend.job.models.site_map import Action, SiteMap
from api.backend.job.scraping.scraping_utils import scrape_content
LOG = logging.getLogger(__name__)
def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]:
    """Return a deep copy of ``site_map`` without its ``do_once`` actions.

    The input dictionary is left untouched.
    """
    remaining = deepcopy(site_map)
    remaining["actions"] = list(
        filter(lambda step: not step["do_once"], remaining["actions"])
    )
    return remaining
async def handle_input(action: Action, page: Page) -> bool:
    """Fill the element located by ``action.xpath`` with ``action.input``.

    Returns:
        True when the fill succeeds. Any exception is logged as a warning and
        reported as False.
    """
    try:
        target = page.locator(f"xpath={action.xpath}")
        LOG.info(f"Sending keys: {action.input} to element: {action.xpath}")
        await target.fill(action.input)
    except Exception as exc:
        LOG.warning(f"Error handling input for xpath '{action.xpath}': {exc}")
        return False

    return True
async def handle_click(action: Action, page: Page) -> bool:
    """Click the element located by ``action.xpath``.

    Returns:
        True when the click succeeds. Any exception is logged as a warning and
        reported as False.
    """
    try:
        target = page.locator(f"xpath={action.xpath}")
        LOG.info(f"Clicking element: {action.xpath}")
        await target.click()
    except Exception as exc:
        LOG.warning(f"Error clicking element at xpath '{action.xpath}': {exc}")
        return False

    return True
# Dispatch table from an action's `type` string to the coroutine that performs
# it on the page. Each handler takes (action, page) and returns True on success.
ACTION_MAP = {
    "click": handle_click,
    "input": handle_input,
}
async def handle_site_mapping(
    id: str,
    site_map_dict: dict[str, Any],
    page: Page,
    pages: set[tuple[str, str]],
    collect_media: bool = False,
):
    """Run the site map's actions on ``page`` and scrape the resulting content.

    Every action is dispatched through ``ACTION_MAP``. If any action fails the
    function returns immediately without scraping. After a successful pass the
    page is scraped, ``do_once`` actions are dropped, and the remaining actions
    (if any) are run again recursively.
    """
    for step in SiteMap(**site_map_dict).actions:
        handler = ACTION_MAP[step.type]
        if not await handler(step, page):
            return

        # Give the page a moment to react before the next action.
        await asyncio.sleep(2)

    await scrape_content(id, page, pages, collect_media=collect_media)

    remaining_site_map = clear_done_actions(site_map_dict)
    if not remaining_site_map["actions"]:
        return

    await handle_site_mapping(
        id, remaining_site_map, page, pages, collect_media=collect_media
    )

View File

@@ -0,0 +1,36 @@
from typing import Any
from api.backend.utils import clean_text
def clean_job_format(jobs: list[dict[str, Any]]) -> dict[str, Any]:
    """
    Flatten the results of one or more jobs into a table-like dictionary.

    Each captured element with non-empty text (after ``clean_text`` and
    ``strip``) becomes one row.

    Args:
        jobs: Job dictionaries. ``result`` is expected to be a list of
            ``{url: {element_name: [captured_value, ...]}}`` mappings; a job
            with a missing or empty ``result`` contributes no rows.

    Returns:
        ``{"headers": [...], "rows": [...]}`` where each row is a dict keyed by
        the header names.
    """
    headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]

    cleaned_rows: list[dict[str, Any]] = []

    for job in jobs:
        # Tolerate jobs without results (matches the Markdown exporter, which
        # also reads `result` with .get) instead of raising KeyError.
        for res in job.get("result") or []:
            for url, elements in res.items():
                for element_name, values in elements.items():
                    for value in values:
                        text = clean_text(value.get("text", "")).strip()
                        if not text:
                            continue

                        cleaned_rows.append(
                            {
                                "id": job.get("id", ""),
                                "url": url,
                                "element_name": element_name,
                                "xpath": value.get("xpath", ""),
                                "text": text,
                                "user": job.get("user", ""),
                                "time_created": job.get("time_created", ""),
                            }
                        )

    return {
        "headers": headers,
        "rows": cleaned_rows,
    }

View File

@@ -0,0 +1,24 @@
from typing import Any
from api.backend.utils import clean_text
def stream_md_from_job_results(jobs: list[dict[str, Any]]):
    """Yield a Markdown report of job results, chunk by chunk.

    The stream starts with the document title, followed for each job by a
    ``## Job #n`` heading, its metadata, the extracted elements grouped by URL,
    and a horizontal rule.

    Args:
        jobs: Job dictionaries. ``result`` is expected to be a list of
            ``{url: {element_name: [captured_value, ...]}}`` mappings; a
            missing or empty ``result`` yields no element lines.

    Yields:
        Markdown text fragments, in document order.
    """
    # The title and job headings must be yielded; building them into a local
    # string would silently drop them from the streamed document.
    yield "# Job Results Summary\n\n"

    for i, job in enumerate(jobs, start=1):
        yield f"## Job #{i}\n"
        yield f"- **Job URL:** {job.get('url', 'N/A')}\n"
        yield f"- **Timestamp:** {job.get('time_created', 'N/A')}\n"
        yield f"- **ID:** {job.get('id', 'N/A')}\n"
        yield "### Extracted Results:\n"

        for res in job.get("result") or []:
            for url, elements in res.items():
                yield f"\n#### URL: {url}\n"
                for element_name, values in elements.items():
                    for value in values:
                        text = clean_text(value.get("text", "")).strip()
                        if text:
                            yield f"- **Element:** `{element_name}`\n"
                            # Two-space indent nests this bullet under the element bullet.
                            yield f"  - **Text:** {text}\n"

        yield "\n---\n"

View File

@@ -1,15 +1,14 @@
# STL
from typing import Any, Optional, Union
from typing import Any, Literal, Optional, Union
from datetime import datetime
# LOCAL
from api.backend.job.models.job_options import JobOptions
# PDM
import pydantic
class FetchOptions(pydantic.BaseModel):
chat: Optional[bool] = None
class Element(pydantic.BaseModel):
name: str
xpath: str
@@ -22,18 +21,13 @@ class CapturedElement(pydantic.BaseModel):
name: str
class JobOptions(pydantic.BaseModel):
multi_page_scrape: bool = False
custom_headers: Optional[dict[str, Any]] = {}
proxies: Optional[list[str]] = []
class RetrieveScrapeJobs(pydantic.BaseModel):
user: str
class DownloadJob(pydantic.BaseModel):
ids: list[str]
job_format: Literal["csv", "md"]
class DeleteScrapeJobs(pydantic.BaseModel):
@@ -64,3 +58,19 @@ class Job(pydantic.BaseModel):
job_options: JobOptions
status: str = "Queued"
chat: Optional[str] = None
agent_mode: bool = False
prompt: Optional[str] = None
class CronJob(pydantic.BaseModel):
    """Request/record model for a scheduled (cron) run of an existing job."""

    # Cron job identifier; the scheduling endpoint generates one when it is empty.
    id: Optional[str] = None
    user_email: str
    # Id of the job row that the schedule refers to.
    job_id: str
    # Cron expression that is turned into the scheduler trigger.
    cron_expression: str
    # Timestamps; the scheduling endpoint fills them with the current time when unset.
    time_created: Optional[Union[datetime, str]] = None
    time_updated: Optional[Union[datetime, str]] = None
class DeleteCronJob(pydantic.BaseModel):
    """Request body for deleting a cron job."""

    # Id of the cron job to delete; also used as the scheduler job id.
    id: str
    user_email: str

View File

@@ -1,4 +1,5 @@
# STL
import datetime
import uuid
import traceback
from io import StringIO
@@ -9,25 +10,39 @@ import random
# PDM
from fastapi import Depends, APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse, StreamingResponse
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from api.backend.scheduler import scheduler
from apscheduler.triggers.cron import CronTrigger # type: ignore
# LOCAL
from api.backend.job import (
query,
insert,
update_job,
delete_jobs,
)
from api.backend.job import insert, update_job, delete_jobs
from api.backend.models import (
DeleteCronJob,
UpdateJobs,
DownloadJob,
FetchOptions,
DeleteScrapeJobs,
Job,
CronJob,
)
from api.backend.schemas import User
from api.backend.auth.auth_utils import get_current_user
from api.backend.utils import clean_text
from api.backend.utils import clean_text, format_list_for_query
from api.backend.job.models.job_options import FetchOptions
from api.backend.database.common import query
from api.backend.job.cron_scheduling.cron_scheduling import (
delete_cron_job,
get_cron_job_trigger,
insert_cron_job,
get_cron_jobs,
insert_job_from_cron_job,
)
from api.backend.job.utils.clean_job_format import clean_job_format
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results
from api.backend.constants import MEDIA_DIR, MEDIA_TYPES, RECORDINGS_DIR
LOG = logging.getLogger(__name__)
@@ -47,10 +62,11 @@ async def submit_scrape_job(job: Job):
job.id = uuid.uuid4().hex
job_dict = job.model_dump()
await insert(job_dict)
insert(job_dict)
return JSONResponse(content={"id": job.id})
except Exception as e:
LOG.error(f"Exception occurred: {traceback.format_exc()}")
return JSONResponse(content={"error": str(e)}, status_code=500)
@@ -59,8 +75,11 @@ async def retrieve_scrape_jobs(
fetch_options: FetchOptions, user: User = Depends(get_current_user)
):
LOG.info(f"Retrieving jobs for account: {user.email}")
ATTRIBUTES = "chat" if fetch_options.chat else "*"
try:
results = await query({"user": user.email}, fetch_options=fetch_options)
job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?"
results = query(job_query, (user.email,))
return JSONResponse(content=jsonable_encoder(results[::-1]))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
@@ -72,8 +91,8 @@ async def job(id: str, user: User = Depends(get_current_user)):
LOG.info(f"Retrieving jobs for account: {user.email}")
try:
filter = {"user": user.email, "id": id}
results = await query(filter)
job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?"
results = query(job_query, (user.email, id))
return JSONResponse(content=jsonable_encoder(results))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
@@ -85,43 +104,77 @@ async def download(download_job: DownloadJob):
LOG.info(f"Downloading job with ids: {download_job.ids}")
try:
results = await query({"id": {"$in": download_job.ids}})
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]
csv_writer.writerow(headers)
for result in results:
for res in result["result"]:
for url, elements in res.items():
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", "")).strip()
if text:
csv_writer.writerow(
[
result.get("id", "")
+ "-"
+ str(random.randint(0, 1000000)),
url,
element_name,
value.get("xpath", ""),
text,
result.get("user", ""),
result.get("time_created", ""),
]
)
_ = csv_buffer.seek(0)
response = StreamingResponse(
csv_buffer,
media_type="text/csv",
job_query = (
f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}"
)
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response
results = query(job_query, tuple(download_job.ids))
if download_job.job_format == "csv":
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
headers = [
"id",
"url",
"element_name",
"xpath",
"text",
"user",
"time_created",
]
csv_writer.writerow(headers)
for result in results:
for res in result["result"]:
for url, elements in res.items():
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", "")).strip()
if text:
csv_writer.writerow(
[
result.get("id", "")
+ "-"
+ str(random.randint(0, 1000000)),
url,
element_name,
value.get("xpath", ""),
text,
result.get("user", ""),
result.get("time_created", ""),
]
)
_ = csv_buffer.seek(0)
response = StreamingResponse(
csv_buffer,
media_type="text/csv",
)
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response
elif download_job.job_format == "md":
response = StreamingResponse(
stream_md_from_job_results(results),
media_type="text/markdown",
)
response.headers["Content-Disposition"] = "attachment; filename=export.md"
return response
except Exception as e:
LOG.error(f"Exception occurred: {e}")
traceback.print_exc()
return {"error": str(e)}
@job_router.get("/job/{id}/convert-to-csv")
async def convert_to_csv(id: str):
try:
job_query = f"SELECT * FROM jobs WHERE id = ?"
results = query(job_query, (id,))
return JSONResponse(content=clean_job_format(results))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
traceback.print_exc()
@@ -136,3 +189,85 @@ async def delete(delete_scrape_jobs: DeleteScrapeJobs):
if result
else JSONResponse({"error": "Jobs not deleted."})
)
@job_router.post("/schedule-cron-job")
async def schedule_cron_job(cron_job: CronJob):
    """Persist a cron job and register it with the scheduler.

    Missing ``id``, ``time_created`` and ``time_updated`` values are filled in
    before the cron job is stored.

    Returns:
        A success message, or a 404 JSON error when ``cron_job.job_id`` does not
        match any stored job.
    """
    # Look the job up first so nothing is stored or scheduled for a job that
    # does not exist (indexing an empty result would raise IndexError).
    queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,))

    if not queried_job:
        return JSONResponse(content={"error": "Job not found."}, status_code=404)

    if not cron_job.id:
        cron_job.id = uuid.uuid4().hex

    if not cron_job.time_created:
        cron_job.time_created = datetime.datetime.now()

    if not cron_job.time_updated:
        cron_job.time_updated = datetime.datetime.now()

    insert_cron_job(cron_job)

    scheduler.add_job(
        insert_job_from_cron_job,
        get_cron_job_trigger(cron_job.cron_expression),
        id=cron_job.id,
        args=[queried_job[0]],
    )

    return JSONResponse(content={"message": "Cron job scheduled successfully."})
@job_router.post("/delete-cron-job")
async def delete_cron_job_request(request: DeleteCronJob):
    """Delete a stored cron job and remove it from the scheduler.

    Returns a 400 JSON error when the request carries an empty id.
    """
    if request.id:
        delete_cron_job(request.id, request.user_email)
        scheduler.remove_job(request.id)
        return JSONResponse(content={"message": "Cron job deleted successfully."})

    return JSONResponse(
        content={"error": "Cron job id is required."}, status_code=400
    )
@job_router.get("/cron-jobs")
async def get_cron_jobs_request(user: User = Depends(get_current_user)):
    """Return the cron jobs looked up for the authenticated user's email, as JSON."""
    return JSONResponse(content=jsonable_encoder(get_cron_jobs(user.email)))
@job_router.get("/recordings/{id}")
async def get_recording(id: str):
    """Serve ``<id>.mp4`` from the recordings directory, or a 404 JSON error."""
    recording = RECORDINGS_DIR / f"{id}.mp4"

    if recording.exists():
        return FileResponse(
            recording, headers={"Content-Type": "video/mp4", "Accept-Ranges": "bytes"}
        )

    return JSONResponse(content={"error": "Recording not found."}, status_code=404)
@job_router.get("/get-media")
async def get_media(id: str):
    """List the media file names stored for a job, grouped by media type.

    Args:
        id: Job id; used as a directory name under each media-type directory.

    Returns:
        ``{"files": {media_type: [file_name, ...]}}`` on success, a 400 JSON
        error when ``id`` does not resolve to a directory strictly inside the
        media-type directory, or a 500 JSON error on unexpected failures.
    """
    try:
        files: dict[str, list[str]] = {}

        for media_type in MEDIA_TYPES:
            type_dir = (MEDIA_DIR / media_type).resolve()
            path = (type_dir / f"{id}").resolve()

            # `id` comes straight from the query string: refuse values such as
            # "" or "../.." that would list a directory outside this job's folder.
            if type_dir not in path.parents:
                return JSONResponse(
                    content={"error": "Invalid job id."}, status_code=400
                )

            files[media_type] = [file.name for file in path.glob("*")]

        return JSONResponse(content={"files": files})
    except Exception as e:
        LOG.error(f"Exception occurred: {e}")
        traceback.print_exc()
        return JSONResponse(content={"error": str(e)}, status_code=500)
@job_router.get("/media")
async def get_media_file(id: str, type: str, file: str):
    """Serve one collected media file.

    Args:
        id: Job id (directory under the media-type directory).
        type: Media type; must be one of ``MEDIA_TYPES``.
        file: File name inside the job's directory.

    Returns:
        The file, or a 404 JSON error when the type is unknown, the resolved
        path escapes the media-type directory, or no such file exists.
    """
    not_found = JSONResponse(
        content={"error": "Media file not found."}, status_code=404
    )

    # All three parameters are untrusted query-string values. Validate the type
    # against the known list and make sure the resolved path stays inside that
    # type's directory, so ".." segments cannot be used to read arbitrary files.
    if type not in MEDIA_TYPES:
        return not_found

    type_dir = (MEDIA_DIR / type).resolve()
    path = (type_dir / f"{id}" / file).resolve()

    if type_dir not in path.parents or not path.is_file():
        return not_found

    return FileResponse(path)

View File

@@ -1,46 +0,0 @@
# STL
import logging
import docker
# PDM
from fastapi import APIRouter, HTTPException
from fastapi.responses import JSONResponse, StreamingResponse
LOG = logging.getLogger(__name__)
log_router = APIRouter()
client = docker.from_env()
@log_router.get("/initial_logs")
async def get_initial_logs():
    """Return the current log output of the ``scraperr_api`` container as JSON.

    Raises:
        HTTPException: 500 with the error text when the container or its logs
            cannot be read.
    """
    try:
        raw_logs = client.containers.get("scraperr_api").logs(stream=False)
        return JSONResponse(content={"logs": raw_logs.decode("utf-8")})
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Unexpected error: {exc}")
@log_router.get("/logs")
async def get_own_logs():
    """Stream the ``scraperr_api`` container's logs as server-sent events.

    Raises:
        HTTPException: 500 when the container or its log stream cannot be opened.
    """
    try:
        followed_logs = client.containers.get("scraperr_api").logs(
            stream=True, follow=True
        )

        def log_generator():
            # Errors raised while iterating are reported to the client as one
            # more event instead of breaking the response.
            try:
                for entry in followed_logs:
                    yield f"data: {entry.decode('utf-8')}\n\n"
            except Exception as stream_error:
                yield f"data: {str(stream_error)}\n\n"

        return StreamingResponse(log_generator(), media_type="text/event-stream")
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))

3
api/backend/scheduler.py Normal file
View File

@@ -0,0 +1,3 @@
from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
scheduler = BackgroundScheduler()

View File

@@ -1,26 +1,27 @@
import logging
from typing import Any, Optional
import time
import random
from typing import Any, Optional, cast
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from lxml import etree
from seleniumwire import webdriver
from lxml.etree import _Element # type: ignore [reportPrivateImport]
from fake_useragent import UserAgent
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options as ChromeOptions
from camoufox import AsyncCamoufox
from playwright.async_api import Page
from urllib.parse import urlparse, urljoin
from api.backend.models import Element, CapturedElement
from api.backend.job.scraping.scraping_utils import (
clean_format_characters,
scrape_content,
)
from api.backend.job.site_mapping.site_mapping import handle_site_mapping
from api.backend.job.scraping.add_custom import add_custom_items
from api.backend.constants import RECORDINGS_ENABLED
LOG = logging.getLogger(__name__)
class HtmlElement(_Element): ...
def is_same_domain(url: str, original_url: str) -> bool:
parsed_url = urlparse(url)
parsed_original_url = urlparse(original_url)
@@ -29,184 +30,164 @@ def is_same_domain(url: str, original_url: str) -> bool:
def clean_xpath(xpath: str) -> str:
parts = xpath.split("/")
clean_parts: list[str] = []
for part in parts:
if part == "":
clean_parts.append("/")
else:
clean_parts.append(part)
clean_xpath = "//".join(clean_parts).replace("////", "//")
clean_xpath = clean_xpath.replace("'", "\\'")
clean_parts = ["/" if part == "" else part for part in parts]
clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'")
LOG.info(f"Cleaned xpath: {clean_xpath}")
return clean_xpath
def sxpath(context: _Element, xpath: str) -> list[HtmlElement]:
return context.xpath(xpath) # pyright: ignore [reportReturnType]
def interceptor(headers: dict[str, Any]):
    """Build a request hook that applies ``headers`` to each outgoing request.

    The returned callable overwrites every listed header on the request and
    then rewrites ``sec-ch-ua`` (when present) so that ``HeadlessChrome`` reads
    as ``Chrome``.
    """

    def _interceptor(request: Any):
        for name, value in headers.items():
            # Remove any existing value first so the header is replaced, not duplicated.
            if request.headers.get(name):
                del request.headers[name]
            request.headers[name] = value

        if "sec-ch-ua" in request.headers:
            client_hint = request.headers["sec-ch-ua"]
            del request.headers["sec-ch-ua"]
            request.headers["sec-ch-ua"] = client_hint.replace(
                "HeadlessChrome", "Chrome"
            )

    return _interceptor
def create_driver(proxies: Optional[list[str]] = []):
    """Create a headless Chrome driver with a random user agent.

    When ``proxies`` is non-empty, one entry is picked at random and used for
    both HTTP and HTTPS traffic.
    """
    ua = UserAgent()
    chrome_options = ChromeOptions()
    for argument in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        f"user-agent={ua.random}",
    ):
        chrome_options.add_argument(argument)

    sw_options = {}
    if proxies:
        chosen = proxies[random.randrange(len(proxies))]
        LOG.info(f"Using proxy: {chosen}")
        sw_options = {
            "proxy": {
                "https": f"https://{chosen}",
                "http": f"http://{chosen}",
            }
        }

    return webdriver.Chrome(
        options=chrome_options,
        seleniumwire_options=sw_options,
    )
def sxpath(context: etree._Element, xpath: str):
return context.xpath(xpath)
async def make_site_request(
id: str,
url: str,
headers: Optional[dict[str, Any]],
multi_page_scrape: bool = False,
visited_urls: set[str] = set(),
pages: set[tuple[str, str]] = set(),
original_url: str = "",
proxies: Optional[list[str]] = [],
) -> None:
"""Make basic `GET` request to site using Selenium."""
# Check if URL has already been visited
proxies: Optional[list[str]] = None,
site_map: Optional[dict[str, Any]] = None,
collect_media: bool = False,
custom_cookies: Optional[list[dict[str, Any]]] = None,
):
if url in visited_urls:
return
driver = create_driver(proxies)
driver.implicitly_wait(10)
proxy = None
if headers:
driver.request_interceptor = interceptor(headers)
if proxies:
proxy = random.choice(proxies)
LOG.info(f"Using proxy: {proxy}")
async with AsyncCamoufox(headless=not RECORDINGS_ENABLED, proxy=proxy) as browser:
page: Page = await browser.new_page()
await page.set_viewport_size({"width": 1920, "height": 1080})
# Add cookies and headers
await add_custom_items(url, page, custom_cookies, headers)
try:
LOG.info(f"Visiting URL: {url}")
driver.get(url)
final_url = driver.current_url
visited_urls.add(url)
visited_urls.add(final_url)
_ = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "body"))
)
try:
await page.goto(url, timeout=60000)
await page.wait_for_load_state("networkidle")
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
final_url = page.url
time.sleep(3) # Wait for the page to load
new_height = driver.execute_script("return document.body.scrollHeight")
visited_urls.add(url)
visited_urls.add(final_url)
if new_height == last_height:
break
html_content = await scrape_content(id, page, pages, collect_media)
last_height = new_height
html_content = await page.content()
pages.add((html_content, final_url))
final_height = driver.execute_script("return document.body.scrollHeight")
if site_map:
await handle_site_mapping(
id, site_map, page, pages, collect_media=collect_media
)
page_source = driver.page_source
LOG.debug(f"Page source for url: {url}\n{page_source}")
pages.add((page_source, final_url))
finally:
driver.quit()
finally:
await page.close()
await browser.close()
if not multi_page_scrape:
return
soup = BeautifulSoup(page_source, "html.parser")
soup = BeautifulSoup(html_content, "html.parser")
for a_tag in soup.find_all("a"):
link = a_tag.get("href")
if not isinstance(a_tag, Tag):
continue
if link:
if not urlparse(link).netloc:
base_url = "{0.scheme}://{0.netloc}".format(urlparse(final_url))
link = urljoin(base_url, link)
link = cast(str, a_tag.get("href", ""))
if link not in visited_urls and is_same_domain(link, original_url):
await make_site_request(
link,
headers=headers,
multi_page_scrape=multi_page_scrape,
visited_urls=visited_urls,
pages=pages,
original_url=original_url,
)
if not link:
continue
if not urlparse(link).netloc:
base_url = "{0.scheme}://{0.netloc}".format(urlparse(final_url))
link = urljoin(base_url, link)
if link not in visited_urls and is_same_domain(link, original_url):
await make_site_request(
id,
link,
headers=headers,
multi_page_scrape=multi_page_scrape,
visited_urls=visited_urls,
pages=pages,
original_url=original_url,
proxies=proxies,
site_map=site_map,
collect_media=collect_media,
custom_cookies=custom_cookies,
)
async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element]):
soup = BeautifulSoup(page[0], "lxml")
root = etree.HTML(str(soup))
elements: dict[str, list[CapturedElement]] = dict()
elements: dict[str, list[CapturedElement]] = {}
for elem in xpaths:
el = sxpath(root, elem.xpath)
for e in el:
text = "\t".join(str(t) for t in e.itertext())
for e in el: # type: ignore
text = (
" ".join(str(t) for t in e.itertext())
if isinstance(e, etree._Element)
else str(e) # type: ignore
)
text = clean_format_characters(text)
captured_element = CapturedElement(
xpath=elem.xpath, text=text, name=elem.name
)
if elem.name in elements:
elements[elem.name].append(captured_element)
continue
elements[elem.name] = [captured_element]
else:
elements[elem.name] = [captured_element]
return {page[1]: elements}
async def scrape(
id: str,
url: str,
xpaths: list[Element],
headers: Optional[dict[str, Any]],
headers: Optional[dict[str, Any]] = None,
multi_page_scrape: bool = False,
proxies: Optional[list[str]] = [],
proxies: Optional[list[str]] = None,
site_map: Optional[dict[str, Any]] = None,
collect_media: bool = False,
custom_cookies: Optional[list[dict[str, Any]]] = None,
):
visited_urls: set[str] = set()
pages: set[tuple[str, str]] = set()
_ = await make_site_request(
await make_site_request(
id,
url,
headers,
headers=headers,
multi_page_scrape=multi_page_scrape,
visited_urls=visited_urls,
pages=pages,
original_url=url,
proxies=proxies,
site_map=site_map,
collect_media=collect_media,
custom_cookies=custom_cookies,
)
elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()
elements: list[dict[str, dict[str, list[CapturedElement]]]] = []
for page in pages:
elements.append(await collect_scraped_elements(page, xpaths))

View File

@@ -21,7 +21,7 @@ async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
mock_randint.return_value = mocked_random_int
# Create a DownloadJob instance
download_job = DownloadJob(ids=[mocked_job["id"]])
download_job = DownloadJob(ids=[mocked_job["id"]], job_format="csv")
# Make a POST request to the /download endpoint
response = client.post("/download", json=download_job.model_dump())

View File

@@ -1,33 +1,53 @@
import pytest
from unittest.mock import AsyncMock, patch, MagicMock
from api.backend.tests.factories.job_factory import create_job
from api.backend.models import JobOptions
from api.backend.scraping import create_driver
import logging
from typing import Dict
from playwright.async_api import async_playwright, Cookie, Route
from api.backend.job.scraping.add_custom import add_custom_items
mocked_job = create_job(
job_options=JobOptions(
multi_page_scrape=False, custom_headers={}, proxies=["127.0.0.1:8080"]
)
).model_dump()
logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger(__name__)
@pytest.mark.asyncio
@patch("seleniumwire.webdriver.Chrome.get")
async def test_proxy(mock_get: AsyncMock):
# Mock the response of the requests.get call
mock_response = MagicMock()
mock_get.return_value = mock_response
async def test_add_custom_items():
test_cookies = [{"name": "big", "value": "cookie"}]
test_headers = {"User-Agent": "test-agent", "Accept": "application/json"}
driver = create_driver(proxies=["127.0.0.1:8080"])
assert driver is not None
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
# Simulate a request
driver.get("http://example.com")
response = driver.last_request
# Set up request interception
captured_headers: Dict[str, str] = {}
# Check if the proxy header is set correctly
if response:
assert response.headers["Proxy"] == "127.0.0.1:8080"
async def handle_route(route: Route) -> None:
nonlocal captured_headers
captured_headers = route.request.headers
await route.continue_()
driver.quit()
await page.route("**/*", handle_route)
await add_custom_items(
url="http://example.com",
page=page,
cookies=test_cookies,
headers=test_headers,
)
# Navigate to example.com
await page.goto("http://example.com")
# Verify cookies were added
cookies: list[Cookie] = await page.context.cookies()
test_cookie = next((c for c in cookies if c.get("name") == "big"), None)
assert test_cookie is not None
assert test_cookie.get("value") == "cookie"
assert test_cookie.get("path") == "/" # Default path should be set
assert test_cookie.get("sameSite") == "Lax" # Default sameSite should be set
# Verify headers were added
assert captured_headers.get("user-agent") == "test-agent"
await browser.close()

View File

@@ -1,5 +1,8 @@
from typing import Optional
from typing import Any, Optional
import logging
import json
LOG = logging.getLogger(__name__)
def clean_text(text: str):
@@ -17,3 +20,30 @@ def get_log_level(level_name: Optional[str]) -> int:
level = getattr(logging, level_name, logging.INFO)
return level
def format_list_for_query(ids: list[str]):
    """Return a parenthesised list of ``?`` placeholders, one per id, e.g. ``"(?,?,?)"``."""
    placeholders = ",".join("?" * len(ids))
    return "(" + placeholders + ")"
def format_sql_row_to_python(row: dict[str, Any]):
    """Return a copy of ``row`` with JSON-encoded string values decoded.

    String values that parse as JSON are replaced by the parsed value; strings
    that do not parse, and all non-string values, are kept unchanged.
    """

    def _decode(value: Any) -> Any:
        if not isinstance(value, str):
            return value
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            return value

    return {column: _decode(value) for column, value in row.items()}
def format_json(items: list[Any]):
    """Replace every dict or list entry of ``items`` with its JSON string, in place.

    Other entries are left untouched. Nothing is returned.
    """
    for position in range(len(items)):
        entry = items[position]
        if isinstance(entry, (dict, list)):
            items[position] = json.dumps(entry)

View File

@@ -1,30 +1,97 @@
import os
import json
from pathlib import Path
from api.backend.job import get_queued_job, update_job
from api.backend.scraping import scrape
from api.backend.models import Element
from fastapi.encoders import jsonable_encoder
import subprocess
import asyncio
import logging
import sys
import traceback
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
LOG = logging.getLogger(__name__)
from api.backend.database.startup import init_database
from api.backend.worker.post_job_complete.post_job_complete import post_job_complete
from api.backend.worker.logger import LOG
from api.backend.ai.agent.agent import scrape_with_agent
NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "")
NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "")
SCRAPERR_FRONTEND_URL = os.getenv("SCRAPERR_FRONTEND_URL", "")
EMAIL = os.getenv("EMAIL", "")
TO = os.getenv("TO", "")
SMTP_HOST = os.getenv("SMTP_HOST", "")
SMTP_PORT = int(os.getenv("SMTP_PORT", 587))
SMTP_USER = os.getenv("SMTP_USER", "")
SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
USE_TLS = os.getenv("USE_TLS", "false").lower() == "true"
RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
RECORDINGS_DIR = Path("/project/app/media/recordings")
async def process_job():
job = await get_queued_job()
ffmpeg_proc = None
status = "Queued"
if job:
LOG.info(f"Beginning processing job: {job}.")
try:
output_path = RECORDINGS_DIR / f"{job['id']}.mp4"
if RECORDINGS_ENABLED:
ffmpeg_proc = subprocess.Popen(
[
"ffmpeg",
"-y",
"-video_size",
"1280x1024",
"-framerate",
"15",
"-f",
"x11grab",
"-i",
":99",
"-codec:v",
"libx264",
"-preset",
"ultrafast",
output_path,
]
)
_ = await update_job([job["id"]], field="status", value="Scraping")
scraped = await scrape(
job["url"],
[Element(**j) for j in job["elements"]],
job["job_options"]["custom_headers"],
job["job_options"]["multi_page_scrape"],
job["job_options"]["proxies"],
)
proxies = job["job_options"]["proxies"]
if proxies and isinstance(proxies[0], str) and proxies[0].startswith("{"):
try:
proxies = [json.loads(p) for p in proxies]
except json.JSONDecodeError:
LOG.error(f"Failed to parse proxy JSON: {proxies}")
proxies = []
if job["agent_mode"]:
scraped = await scrape_with_agent(job)
else:
scraped = await scrape(
job["id"],
job["url"],
[Element(**j) for j in job["elements"]],
job["job_options"]["custom_headers"],
job["job_options"]["multi_page_scrape"],
proxies,
job["job_options"]["site_map"],
job["job_options"]["collect_media"],
job["job_options"]["custom_cookies"],
)
LOG.info(
f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
)
@@ -32,14 +99,43 @@ async def process_job():
[job["id"]], field="result", value=jsonable_encoder(scraped)
)
_ = await update_job([job["id"]], field="status", value="Completed")
status = "Completed"
except Exception as e:
_ = await update_job([job["id"]], field="status", value="Failed")
_ = await update_job([job["id"]], field="result", value=e)
LOG.error(f"Exception as occured: {e}\n{traceback.print_exc()}")
status = "Failed"
finally:
job["status"] = status
await post_job_complete(
job,
{
"channel": NOTIFICATION_CHANNEL,
"webhook_url": NOTIFICATION_WEBHOOK_URL,
"scraperr_frontend_url": SCRAPERR_FRONTEND_URL,
"email": EMAIL,
"to": TO,
"smtp_host": SMTP_HOST,
"smtp_port": SMTP_PORT,
"smtp_user": SMTP_USER,
"smtp_password": SMTP_PASSWORD,
"use_tls": USE_TLS,
},
)
if ffmpeg_proc:
ffmpeg_proc.terminate()
ffmpeg_proc.wait()
async def main():
    """Worker entry point: prepare storage, then poll for jobs forever.

    Initialises the database, makes sure the recordings directory exists, and
    then processes jobs one at a time with a pause between polls.
    """
    LOG.info("Starting job worker...")

    init_database()
    RECORDINGS_DIR.mkdir(parents=True, exist_ok=True)

    poll_interval_seconds = 5
    while True:
        await process_job()
        await asyncio.sleep(poll_interval_seconds)

View File

@@ -0,0 +1,12 @@
import logging
import os
from api.backend.utils import get_log_level
logging.basicConfig(
level=get_log_level(os.getenv("LOG_LEVEL")),
format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
handlers=[logging.StreamHandler()],
)
LOG = logging.getLogger(__name__)

View File

@@ -0,0 +1,56 @@
import json
from typing import Any
import requests
from api.backend.worker.logger import LOG
from api.backend.worker.post_job_complete.models import (
PostJobCompleteOptions,
JOB_COLOR_MAP,
)
def discord_notification(job: dict[str, Any], options: PostJobCompleteOptions):
    """Post a job-finished embed to the configured Discord webhook.

    Args:
        job: Job dictionary; ``status``, ``url``, ``id`` and ``job_options``
            are read.
        options: Notification settings; ``webhook_url`` and
            ``scraperr_frontend_url`` are read.

    Delivery failures are logged rather than raised, matching the email
    notifier, so a broken webhook cannot take down the caller.
    """
    webhook_url = options["webhook_url"]
    scraperr_frontend_url = options["scraperr_frontend_url"]
    status = job["status"]

    LOG.info(f"Sending discord notification to {webhook_url}")

    embed = {
        "title": "Job Completed",
        "description": "Scraping job has been completed.",
        # Fall back to gray for a status without a configured colour (same
        # default as the email notifier) instead of raising KeyError.
        "color": JOB_COLOR_MAP.get(status, 0x808080),
        "url": f"{scraperr_frontend_url}/jobs?search={job['id']}&type=id",
        "image": {
            "url": "https://github.com/jaypyles/Scraperr/raw/master/docs/logo_picture.png",
        },
        "author": {
            "name": "Scraperr",
            "url": "https://github.com/jaypyles/Scraperr",
        },
        "fields": [
            {
                "name": "Status",
                # Report the job's real status; failed jobs are notified too.
                "value": status,
                "inline": True,
            },
            {
                "name": "URL",
                "value": job["url"],
                "inline": True,
            },
            {
                "name": "ID",
                "value": job["id"],
                "inline": False,
            },
            {
                "name": "Options",
                "value": f"```json\n{json.dumps(job['job_options'], indent=4)}\n```",
                "inline": False,
            },
        ],
    }

    payload = {"embeds": [embed]}

    try:
        # Without a timeout a stalled webhook would block the worker indefinitely.
        response = requests.post(webhook_url, json=payload, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        LOG.error(f"Failed to send discord notification: {e}")

View File

@@ -0,0 +1,97 @@
import smtplib
import ssl
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import json
from typing import Any
from api.backend.worker.logger import LOG
from api.backend.worker.post_job_complete.models import (
JOB_COLOR_MAP,
PostJobCompleteOptions,
)
def send_job_complete_email(
    job: dict[str, Any],
    options: PostJobCompleteOptions,
):
    """Send an HTML email reporting that a job has finished.

    Args:
        job: Job dictionary; ``status``, ``url``, ``id`` and ``job_options``
            are read.
        options: Notification settings; the sender/recipient addresses, SMTP
            host, port, credentials, ``use_tls`` and ``scraperr_frontend_url``
            are read.

    With ``use_tls`` set, a plain SMTP connection is upgraded with STARTTLS;
    otherwise an SMTP-over-SSL connection is used. Failures are logged, not raised.
    """
    # Imported locally so this block does not depend on a new module-level import.
    from html import escape

    status = job["status"]
    status_color = JOB_COLOR_MAP.get(status, 0x808080)
    succeeded = status == "Completed"

    # Everything interpolated into the HTML body is escaped: the job URL and
    # options come from user input and must not be able to inject markup.
    safe_status = escape(str(status))
    job_url = escape(str(job["url"]))
    job_id = escape(str(job["id"]))
    job_options_json = escape(json.dumps(job["job_options"], indent=4))
    frontend_url = escape(str(options["scraperr_frontend_url"]))

    # Do not claim success for a job that finished in another state.
    if succeeded:
        headline = "✅ Job Completed"
        summary = "Scraping job has been completed successfully."
    else:
        headline = f"❌ Job {safe_status}"
        summary = f"Scraping job finished with status: {safe_status}."

    subject = "📦 Job Completed - Scraperr Notification"

    html_body = f"""
<html>
<body style="font-family: Arial, sans-serif;">
<h2 style="color: #{status_color:06x};">{headline}</h2>
<p>{summary}</p>
<a href="{frontend_url}/jobs?search={job_id}&type=id" target="_blank">
<img src="https://github.com/jaypyles/Scraperr/raw/master/docs/logo_picture.png" alt="Scraperr Logo" width="200">
</a>
<h3>Job Info:</h3>
<ul>
<li><strong>Status:</strong> {safe_status}</li>
<li><strong>Job URL:</strong> <a href="{job_url}">{job_url}</a></li>
<li><strong>Job ID:</strong> {job_id}</li>
</ul>
<h3>Options:</h3>
<pre style="background-color:#f4f4f4; padding:10px; border-radius:5px;">
{job_options_json}
</pre>
<h3>View your job here:</h3>
<a href="{frontend_url}/jobs?search={job_id}&type=id">Scraperr Job</a>
<p style="font-size: 12px; color: gray;">
Sent by <a href="https://github.com/jaypyles/Scraperr" target="_blank">Scraperr</a>
</p>
</body>
</html>
"""

    # Create email
    message = MIMEMultipart("alternative")
    message["From"] = options["email"]
    message["To"] = options["to"]
    message["Subject"] = subject

    message.attach(
        MIMEText(
            "Job completed. View this email in HTML format for full details.", "plain"
        )
    )
    message.attach(MIMEText(html_body, "html"))

    context = ssl.create_default_context()

    try:
        if options["use_tls"]:
            # Plain connection upgraded to TLS with STARTTLS.
            with smtplib.SMTP(options["smtp_host"], options["smtp_port"]) as server:
                server.starttls(context=context)
                server.login(options["smtp_user"], options["smtp_password"])
                server.sendmail(
                    from_addr=options["email"],
                    to_addrs=options["to"],
                    msg=message.as_string(),
                )
        else:
            # Connection that is TLS-wrapped from the start.
            with smtplib.SMTP_SSL(
                options["smtp_host"], options["smtp_port"], context=context
            ) as server:
                server.login(options["smtp_user"], options["smtp_password"])
                server.sendmail(
                    from_addr=options["email"],
                    to_addrs=options["to"],
                    msg=message.as_string(),
                )

        LOG.info("✅ Email sent successfully!")
    except Exception as e:
        LOG.error(f"❌ Failed to send email: {e}")

View File

@@ -0,0 +1,22 @@
from typing import TypedDict
# Settings for the notification that is sent once a scraping job finishes.
# One flat mapping carries the options for every channel; which keys are
# actually read depends on the value of "channel".
class PostJobCompleteOptions(TypedDict):
# Notification channel. post_job_complete dispatches on "discord" and
# "email", returns without doing anything for "", and raises ValueError
# for any other value.
channel: str
# NOTE(review): presumably the Discord webhook endpoint; its consumer is not
# visible from here, confirm against discord_notification.
webhook_url: str
# Base URL of the Scraperr frontend; the email body appends
# "/jobs?search=<job_id>&type=id" to it to link back to the job.
scraperr_frontend_url: str
# Sender address: used as the "From" header and as the SMTP envelope sender.
email: str
# Recipient address: used as the "To" header and as the SMTP envelope recipient.
to: str
# Host and port handed to smtplib when opening the connection.
smtp_host: str
smtp_port: int
# Credentials passed to server.login() before the message is sent.
smtp_user: str
smtp_password: str
# True: connect with smtplib.SMTP and upgrade via starttls().
# False: connect with smtplib.SMTP_SSL instead.
use_tls: bool
# Maps a job status string to an integer colour code written as six hex digits.
# NOTE(review): the email template formats a status colour with ":06x" into a
# CSS "#rrggbb" value, so these are presumably 0xRRGGBB values looked up from
# this map; confirm at the call sites.
JOB_COLOR_MAP = {
"Queued": 0x0000FF,
"Scraping": 0x0000FF,
"Completed": 0x00FF00,
"Failed": 0xFF0000,
}

View File

@@ -0,0 +1,24 @@
from typing import Any
from api.backend.worker.post_job_complete.models import PostJobCompleteOptions
from api.backend.worker.post_job_complete.email_notifcation import (
send_job_complete_email,
)
from api.backend.worker.post_job_complete.discord_notification import (
discord_notification,
)
async def post_job_complete(job: dict[str, Any], options: PostJobCompleteOptions):
    """Send the job-completion notification over the configured channel.

    Args:
        job: The finished job record; passed through unchanged to the notifier.
        options: Notification settings. ``options["channel"]`` selects the
            notifier: ``"discord"`` or ``"email"``.

    Returns:
        None. Nothing is sent when ``options`` is empty or the channel is ``""``.

    Raises:
        ValueError: If the channel is set to anything other than ``""``,
            ``"discord"`` or ``"email"``.
    """
    # Guard against an empty mapping *before* indexing it. Previously the
    # emptiness check ran after ``options["channel"]``, so an empty mapping
    # raised KeyError and the guard could never take effect.
    if not options:
        return

    channel = options["channel"]

    # A blank channel means notifications are not configured.
    if channel == "":
        return

    # Both notifiers are plain synchronous calls (they are not awaited).
    if channel == "discord":
        discord_notification(job, options)
    elif channel == "email":
        send_job_complete_email(job, options)
    else:
        raise ValueError(f"Invalid channel: {channel}")

View File

@@ -0,0 +1,60 @@
// End-to-end checks for the sign-up and login forms. Each test aliases the
// backend request with cy.intercept so it can wait for the real response and
// assert on its HTTP status.
describe("Authentication", () => {
// Registers a new account through the UI and expects POST /api/signup to
// answer with HTTP 200.
it("should register", () => {
cy.intercept("POST", "/api/signup").as("signup");
cy.visit("/").then(() => {
cy.get("button").contains("Login").click();
cy.url().should("include", "/login");
cy.get("form").should("be.visible");
// The login form is shown first; this button switches it to sign-up mode.
cy.get("button")
.contains("No Account? Sign up")
.should("be.visible")
.click();
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("input[name='fullName']").type("John Doe");
cy.get("button[type='submit']").contains("Signup").click();
cy.wait("@signup").then((interception) => {
// Fail with an explicit message when the request never got a response.
if (!interception.response) {
cy.log("No response received!");
throw new Error("signup request did not return a response");
}
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
expect(interception.response.statusCode).to.eq(200);
});
});
});
// Logs in through the UI and expects POST /api/token to answer with HTTP 200.
// NOTE(review): uses the same credentials typed in "should register", so it
// presumably relies on that test having created the account first; confirm.
it("should login", () => {
cy.intercept("POST", "/api/token").as("token");
cy.visit("/").then(() => {
cy.get("button")
.contains("Login")
.click()
.then(() => {
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("button[type='submit']").contains("Login").click();
cy.wait("@token").then((interception) => {
// Fail with an explicit message when the request never got a response.
if (!interception.response) {
cy.log("No response received!");
throw new Error("token request did not return a response");
}
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
expect(interception.response.statusCode).to.eq(200);
});
});
});
});
});

View File

@@ -1,19 +1,88 @@
describe("Job", () => {
describe.only("Job", () => {
it("should create a job", () => {
cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
cy.visit("/");
const input = cy.get('[data-cy="url-input"]');
input.type("https://example.com");
cy.get('[data-cy="url-input"]').type("https://example.com");
cy.get('[data-cy="name-field"]').type("example");
cy.get('[data-cy="xpath-field"]').type("//body");
cy.get('[data-cy="add-button"]').click();
const nameField = cy.get('[data-cy="name-field"]');
const xPathField = cy.get('[data-cy="xpath-field"]');
const addButton = cy.get('[data-cy="add-button"]');
cy.contains("Submit").click();
nameField.type("example");
xPathField.type("//body");
addButton.click();
cy.wait("@submitScrapeJob").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
cy.log("Request body: " + JSON.stringify(interception.request?.body));
throw new Error("submitScrapeJob request did not return a response");
}
const submit = cy.contains("Submit");
submit.click();
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
expect(interception.response.statusCode).to.eq(200);
});
cy.get("li").contains("Jobs").click();
cy.contains("div", "https://example.com", { timeout: 10000 }).should(
"exist"
);
cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
cy.get("tbody tr")
.first()
.within(() => {
cy.get('input[type="checkbox"]').click();
});
cy.get("[data-testid='DeleteIcon']").click();
cy.contains("div", "https://example.com", { timeout: 10000 }).should(
"not.exist"
);
});
it("should create a job with advanced options (media)", () => {
cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
cy.visit("/");
cy.get("button").contains("Advanced Job Options").click();
cy.get('[data-cy="collect-media-checkbox"]').click();
cy.get("body").type("{esc}");
cy.get('[data-cy="url-input"]').type("https://books.toscrape.com");
cy.get('[data-cy="name-field"]').type("example");
cy.get('[data-cy="xpath-field"]').type("//body");
cy.get('[data-cy="add-button"]').click();
cy.get("button").contains("Submit").click();
cy.get("li").contains("Jobs").click();
cy.contains("div", "https://books.toscrape.com", { timeout: 10000 }).should(
"exist"
);
cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
cy.get("li").contains("Media").click();
cy.get("div[id='select-job']").click();
cy.get("li[role='option']").click();
cy.get("[data-testid='media-grid']", { timeout: 10000 }).should("exist");
cy.get("li").contains("Jobs").click();
cy.get("tbody tr")
.first()
.within(() => {
cy.get('input[type="checkbox"]').click();
});
cy.get("[data-testid='DeleteIcon']").click();
});
});

View File

@@ -34,4 +34,4 @@
// visit(originalFn: CommandOriginalFn, url: string, options: Partial<VisitOptions>): Chainable<Element>
// }
// }
// }
// }

View File

@@ -1,13 +1,10 @@
version: "3"
services:
scraperr:
build:
context: .
dockerfile: docker/frontend/Dockerfile
command: ["npm", "run", "dev"]
labels:
- "traefik.enable=true"
- "traefik.http.routers.scraperr.rule=Host(`localhost`)"
- "traefik.http.routers.scraperr.entrypoints=web"
- "traefik.http.services.scraperr.loadbalancer.server.port=3000"
- "traefik.http.routers.scraperr.tls=false"
volumes:
- "$PWD/src:/app/src"
- "$PWD/public:/app/public"
@@ -16,7 +13,12 @@ services:
- "$PWD/package-lock.json:/app/package-lock.json"
- "$PWD/tsconfig.json:/app/tsconfig.json"
scraperr_api:
ports:
- "8000:8000"
build:
context: .
dockerfile: docker/api/Dockerfile
environment:
- LOG_LEVEL=INFO
volumes:
- "$PWD/api:/project/api"
- "$PWD/api:/project/app/api"
ports:
- "5900:5900"

View File

@@ -1,66 +1,28 @@
services:
scraperr:
image: jpyles0524/scraperr:latest
build:
context: .
dockerfile: docker/frontend/Dockerfile
container_name: scraperr
command: ["npm", "run", "start"]
labels:
- "traefik.enable=true"
- "traefik.http.routers.scraperr.rule=Host(`localhost`)" # change this to your domain, if not running on localhost
- "traefik.http.routers.scraperr.entrypoints=web" # websecure if using https
- "traefik.http.services.scraperr.loadbalancer.server.port=3000"
environment:
- NEXT_PUBLIC_API_URL=http://scraperr_api:8000 # your API URL
- SERVER_URL=http://scraperr_api:8000 # your docker container API URL
ports:
- 80:3000
networks:
- web
scraperr_api:
init: True
image: jpyles0524/scraperr_api:latest
build:
context: .
dockerfile: docker/api/Dockerfile
environment:
- LOG_LEVEL=INFO
- OLLAMA_URL=http://ollama:11434
- OLLAMA_MODEL=phi3
- MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB
- SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string)
- ALGORITHM=HS256 # authentication encoding algorithm
- ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token expire minutes
container_name: scraperr_api
volumes:
- /var/run/docker.sock:/var/run/docker.sock
labels:
- "traefik.enable=true"
- "traefik.http.routers.scraperr_api.rule=Host(`localhost`) && PathPrefix(`/api`)" # change this to your domain, if not running on localhost
- "traefik.http.routers.scraperr_api.entrypoints=web" # websecure if using https
- "traefik.http.middlewares.api-stripprefix.stripprefix.prefixes=/api"
- "traefik.http.routers.scraperr_api.middlewares=api-stripprefix"
- "traefik.http.services.scraperr_api.loadbalancer.server.port=8000"
networks:
- web
traefik:
image: traefik:latest
container_name: traefik
command:
- "--providers.docker=true"
- "--entrypoints.web.address=:80"
- "--entrypoints.websecure.address=:443"
ports:
- 80:80
- 443:443
- 8000:8000
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro"
networks:
- web
mongo:
container_name: webscrape-mongo
image: mongo
restart: always
environment:
MONGO_INITDB_ROOT_USERNAME: root
MONGO_INITDB_ROOT_PASSWORD: example
- "$PWD/data:/project/app/data"
- "$PWD/media:/project/app/media"
networks:
- web
networks:
web:

View File

@@ -1,36 +1,42 @@
# Build python dependencies
FROM python:3.10.12-slim as pybuilder
RUN apt update && apt install -y uvicorn
RUN apt-get update && \
apt-get install -y curl && \
apt-get install -y x11vnc xvfb uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 ffmpeg && \
curl -LsSf https://astral.sh/uv/install.sh | sh && \
apt-get remove -y curl && \
apt-get autoremove -y && \
rm -rf /var/lib/apt/lists/*
RUN python -m pip --no-cache-dir install pdm
RUN pdm config python.use_venv false
WORKDIR /project/app
COPY pyproject.toml pdm.lock /project/app/
RUN pdm install
RUN pdm install -v --frozen-lockfile
RUN pdm run playwright install --with-deps
RUN pdm run camoufox fetch
COPY ./api/ /project/app/api
# Create final image
FROM python:3.10.12-slim
RUN apt-get update
RUN apt-get install -y wget gnupg supervisor
RUN wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add -
RUN sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list'
RUN apt-get update
RUN apt-get install -y google-chrome-stable
ENV PYTHONPATH=/project/pkgs
COPY --from=pybuilder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
COPY --from=pybuilder /usr/local/bin /usr/local/bin
COPY --from=pybuilder /project/app /project/
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
EXPOSE 8000
WORKDIR /project/
WORKDIR /project/app
RUN mkdir -p /project/app/media
RUN mkdir -p /project/app/data
RUN touch /project/app/data/database.db
EXPOSE 5900
COPY start.sh /project/app/start.sh
CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ]

View File

@@ -1,10 +1,14 @@
# Build next dependencies
FROM node:latest
FROM node:23.1-slim
WORKDIR /app
COPY package*.json ./
RUN npm install
# Copy package files first to leverage Docker cache
COPY package.json yarn.lock ./
# Install dependencies in a separate layer
RUN yarn install --frozen-lockfile
# Copy the rest of the application
COPY tsconfig.json /app/tsconfig.json
COPY tailwind.config.js /app/tailwind.config.js
COPY next.config.mjs /app/next.config.mjs
@@ -13,8 +17,7 @@ COPY postcss.config.js /app/postcss.config.js
COPY public /app/public
COPY src /app/src
RUN npm run build
# Build the application
RUN yarn build
EXPOSE 3000
# CMD [ "npm", "run" ]
EXPOSE 3000

Binary file not shown.

Before

Width:  |  Height:  |  Size: 46 KiB

After

Width:  |  Height:  |  Size: 48 KiB

View File

@@ -1,4 +0,0 @@
tls:
certificates:
- certFile: /etc/certs/ssl-cert.pem
keyFile: /etc/certs/ssl-cert.key

23
helm/.helmignore Normal file
View File

@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/

24
helm/Chart.yaml Normal file
View File

@@ -0,0 +1,24 @@
apiVersion: v2
name: scraperr
description: A Helm chart for Kubernetes
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.1.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.16.0"

View File

@@ -0,0 +1,56 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: scraperr
spec:
replicas: {{ .Values.replicaCount }}
selector:
matchLabels:
app: scraperr
template:
metadata:
labels:
app: scraperr
spec:
containers:
- name: scraperr
{{ if .Values.scraperr.image.repository }}
image: "{{ .Values.scraperr.image.repository }}:{{ .Values.scraperr.image.tag }}"
{{ else }}
image: "{{ .Chart.Name }}:{{ .Chart.Version }}"
{{ end }}
imagePullPolicy: {{ .Values.scraperr.image.pullPolicy }}
command: {{ .Values.scraperr.containerCommand | toJson }}
ports:
- containerPort: {{ .Values.scraperr.containerPort }}
env: {{ toYaml .Values.scraperr.env | nindent 12 }}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: scraperr-api
spec:
replicas: {{ .Values.replicaCount }}
selector:
matchLabels:
app: scraperr-api
template:
metadata:
labels:
app: scraperr-api
spec:
containers:
- name: scraperr-api
{{ if .Values.scraperrApi.image.repository }}
image: "{{ .Values.scraperrApi.image.repository }}:{{ .Values.scraperrApi.image.tag }}"
{{ else }}
image: "{{ .Chart.Name }}:{{ .Chart.Version }}"
{{ end }}
imagePullPolicy: {{ .Values.scraperrApi.image.pullPolicy }}
ports:
- containerPort: {{ .Values.scraperrApi.containerPort }}
env: {{ toYaml .Values.scraperrApi.env | nindent 12 }}
volumeMounts: {{ toYaml .Values.scraperrApi.volumeMounts | nindent 12 }}
volumes: {{ toYaml .Values.scraperrApi.volumes | nindent 12 }}

View File

@@ -0,0 +1,37 @@
---
apiVersion: v1
kind: Service
metadata:
name: scraperr
spec:
type: {{ .Values.scraperr.serviceType }}
selector:
app: scraperr
ports:
{{- range .Values.scraperr.ports }}
- port: {{ .port }}
targetPort: {{ .targetPort }}
{{- if .nodePort }}
nodePort: {{ .nodePort }}
{{- end }}
protocol: {{ .protocol | default "TCP" }}
{{- end }}
---
apiVersion: v1
kind: Service
metadata:
name: scraperr-api
spec:
type: {{ .Values.scraperrApi.serviceType }}
selector:
app: scraperr-api
ports:
{{- range .Values.scraperrApi.ports }}
- port: {{ .port }}
targetPort: {{ .targetPort }}
{{- if .nodePort }}
nodePort: {{ .nodePort }}
{{- end }}
protocol: {{ .protocol | default "TCP" }}
{{- end }}

47
helm/values.yaml Normal file
View File

@@ -0,0 +1,47 @@
scraperr:
image:
repository: jpyles0524/scraperr
tag: latest
pullPolicy: IfNotPresent
containerCommand: ["npm", "run","start"]
containerPort: 3000
serviceType: NodePort
ports:
- port: 80
targetPort: 3000
nodePort: 32300
protocol: TCP
env:
- name: NEXT_PUBLIC_API_URL
value: "http://scraperr-api:8000"
- name: SERVER_URL
value: "http://scraperr-api:8000"
scraperrApi:
image:
repository: jpyles0524/scraperr_api
tag: latest
pullPolicy: IfNotPresent
containerPort: 8000
serviceType: ClusterIP
ports:
- port: 8000
targetPort: 8000
protocol: TCP
env:
- name: LOG_LEVEL
value: "INFO"
volumeMounts:
- name: data
mountPath: /project/app/data
- name: media
mountPath: /project/app/media
volumes:
- name: data
hostPath:
path: /data/scraperr/data
type: DirectoryOrCreate
- name: media
hostPath:
path: /data/scraperr/media
replicaCount: 1

View File

@@ -1,37 +0,0 @@
# STL
import os
# PDM
import boto3
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
def test_insert_and_delete():
# Get environment variables
region_name = os.getenv("AWS_REGION")
# Initialize DynamoDB resource
dynamodb = boto3.resource("dynamodb", region_name=region_name)
table = dynamodb.Table("scrape")
# Item to insert
item = {
"id": "123", # Replace with the appropriate id value
"attribute1": "value1",
"attribute2": "value2",
# Add more attributes as needed
}
# Insert the item
table.put_item(Item=item)
print(f"Inserted item: {item}")
# Delete the item
table.delete_item(Key={"id": "123"}) # Replace with the appropriate id value
print(f"Deleted item with id: {item['id']}")
if __name__ == "__main__":
test_insert_and_delete()

23271
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -12,13 +12,16 @@
"@minchat/react-chat-ui": "^0.16.2",
"@mui/icons-material": "^5.15.3",
"@mui/material": "^5.16.0",
"@reduxjs/toolkit": "^2.8.2",
"@testing-library/jest-dom": "^5.16.5",
"@testing-library/react": "^13.4.0",
"@testing-library/user-event": "^13.5.0",
"@types/react": "^18.3.21",
"axios": "^1.7.2",
"bootstrap": "^5.3.0",
"chart.js": "^4.4.3",
"cookie": "^0.6.0",
"dotenv": "^16.5.0",
"framer-motion": "^4.1.17",
"js-cookie": "^3.0.5",
"next": "^14.2.4",
@@ -29,17 +32,18 @@
"react-dom": "^18.3.1",
"react-markdown": "^9.0.0",
"react-modal-image": "^2.6.0",
"react-redux": "^9.2.0",
"react-router": "^6.14.1",
"react-router-dom": "^6.14.1",
"react-scripts": "^5.0.1",
"react-spinners": "^0.14.1",
"redux-persist": "^6.0.0",
"typescript": "^4.9.5",
"web-vitals": "^2.1.4"
},
"scripts": {
"dev": "next dev",
"build": "next build",
"start": "next start",
"dev": "yarn next dev",
"build": "yarn next build",
"start": "yarn next start",
"serve": "serve -s ./dist",
"cy:open": "cypress open",
"cy:run": "cypress run"
@@ -63,12 +67,18 @@
]
},
"devDependencies": {
"@types/cypress": "^0.1.6",
"@types/cypress": "^1.1.6",
"@types/js-cookie": "^3.0.6",
"cypress": "^13.15.0",
"autoprefixer": "^10.4.21",
"cypress": "^13.17.0",
"eslint": "^9.26.0",
"postcss": "^8.5.3",
"tailwindcss": "^3.3.5"
},
"overrides": {
"react-refresh": "0.11.0"
},
"resolutions": {
"postcss": "^8.4.31"
}
}

2637
pdm.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -2,9 +2,7 @@
name = "web-scrape"
version = "0.1.0"
description = ""
authors = [
{name = "Jayden Pyles", email = "jpylesbuisness@gmail.com"},
]
authors = [{ name = "Jayden Pyles", email = "jpylesbuisness@gmail.com" }]
dependencies = [
"uvicorn>=0.30.1",
"fastapi>=0.111.0",
@@ -18,7 +16,6 @@ dependencies = [
"lxml-stubs>=0.5.1",
"fake-useragent>=1.5.1",
"requests-html>=0.10.0",
"selenium>=4.22.0",
"webdriver-manager>=4.0.1",
"pydantic[email]>=2.9.2",
"pandas>=2.2.2",
@@ -39,20 +36,22 @@ dependencies = [
"exceptiongroup>=1.2.2",
"Faker>=30.6.0",
"pytest-asyncio>=0.24.0",
"python-multipart>=0.0.12",
"python-multipart>=0.0.1",
"bcrypt==4.0.1",
"apscheduler>=3.11.0",
"playwright>=1.52.0",
"camoufox>=0.4.11",
"html2text>=2025.4.15",
]
requires-python = ">=3.10"
readme = "README.md"
license = {text = "MIT"}
license = { text = "MIT" }
[tool.pdm]
distribution = true
[tool.pdm.dev-dependencies]
dev = [
"ipython>=8.26.0",
"pytest>=8.3.3",
]
dev = ["ipython>=8.26.0", "pytest>=8.3.3"]
[tool.pyright]
include = ["./api/backend/"]
exclude = ["**/node_modules", "**/__pycache__"]
@@ -60,14 +59,42 @@ ignore = []
defineConstant = { DEBUG = true }
stubPath = ""
reportUnknownMemberType= false
reportMissingImports = true
reportMissingTypeStubs = false
reportAny = false
reportCallInDefaultInitializer = false
# Type checking strictness
typeCheckingMode = "strict" # Enables strict type checking mode
reportPrivateUsage = "none"
reportMissingTypeStubs = "none"
reportUntypedFunctionDecorator = "error"
reportUntypedClassDecorator = "error"
reportUntypedBaseClass = "error"
reportInvalidTypeVarUse = "error"
reportUnnecessaryTypeIgnoreComment = "information"
reportUnknownVariableType = "none"
reportUnknownMemberType = "none"
reportUnknownParameterType = "none"
pythonVersion = "3.9"
pythonPlatform = "Linux"
# Additional checks
reportImplicitStringConcatenation = "error"
reportInvalidStringEscapeSequence = "error"
reportMissingImports = "error"
reportMissingModuleSource = "error"
reportOptionalCall = "error"
reportOptionalIterable = "error"
reportOptionalMemberAccess = "error"
reportOptionalOperand = "error"
reportOptionalSubscript = "error"
reportTypedDictNotRequiredAccess = "error"
# Function return type checking
reportIncompleteStub = "error"
reportIncompatibleMethodOverride = "error"
reportInvalidStubStatement = "error"
reportInconsistentOverload = "error"
# Misc settings
pythonVersion = "3.10" # Matches your Python version from pyproject.toml
strictListInference = true
strictDictionaryInference = true
strictSetInference = true
[tool.isort]

View File

@@ -1,17 +1,23 @@
import React, { useState, useEffect, Dispatch, useRef } from "react";
import React, { useState, Dispatch, useEffect } from "react";
import { Job } from "../../types";
import { fetchJobs } from "../../lib";
import Box from "@mui/material/Box";
import InputLabel from "@mui/material/InputLabel";
import FormControl from "@mui/material/FormControl";
import Select from "@mui/material/Select";
import Popover from "@mui/material/Popover";
import { Typography, MenuItem, useTheme } from "@mui/material";
import {
Typography,
MenuItem,
useTheme,
ClickAwayListener,
} from "@mui/material";
import { SxProps } from "@mui/material";
interface Props {
sxProps: SxProps;
setSelectedJob: Dispatch<React.SetStateAction<Job | null>>;
sxProps?: SxProps;
setSelectedJob:
| Dispatch<React.SetStateAction<Job | null>>
| ((job: Job) => void);
selectedJob: Job | null;
setJobs: Dispatch<React.SetStateAction<Job[]>>;
jobs: Job[];
@@ -28,10 +34,6 @@ export const JobSelector = ({
const [popoverJob, setPopoverJob] = useState<Job | null>(null);
const theme = useTheme();
useEffect(() => {
fetchJobs(setJobs, { chat: true });
}, []);
const handlePopoverOpen = (
event: React.MouseEvent<HTMLElement>,
job: Job
@@ -47,6 +49,12 @@ export const JobSelector = ({
const open = Boolean(anchorEl);
useEffect(() => {
if (!open) {
setAnchorEl(null);
}
}, [open]);
return (
<Box sx={sxProps}>
<FormControl fullWidth>
@@ -59,9 +67,11 @@ export const JobSelector = ({
value={selectedJob?.id || ""}
label="Job"
onChange={(e) => {
setSelectedJob(
jobs.find((job) => job.id === e.target.value) || null
);
const job = jobs.find((job) => job.id === e.target.value);
if (job) {
setSelectedJob(job);
}
}}
>
{jobs.map((job) => (
@@ -81,55 +91,63 @@ export const JobSelector = ({
</>
) : null}
</FormControl>
<Popover
id="mouse-over-popover"
sx={{
pointerEvents: "none",
padding: 0,
}}
open={open}
anchorEl={anchorEl}
anchorOrigin={{
vertical: "bottom",
horizontal: "left",
}}
transformOrigin={{
vertical: "top",
horizontal: "left",
}}
onClose={handlePopoverClose}
>
{popoverJob && (
<Box
{open && (
<ClickAwayListener onClickAway={handlePopoverClose}>
<Popover
id="mouse-over-popover"
sx={{
border:
theme.palette.mode === "light"
? "2px solid black"
: "2px solid white",
pointerEvents: "none",
padding: 0,
}}
open={open}
anchorEl={anchorEl}
anchorOrigin={{
vertical: "bottom",
horizontal: "left",
}}
transformOrigin={{
vertical: "top",
horizontal: "left",
}}
onClose={handlePopoverClose}
>
<Typography
variant="body1"
sx={{ paddingLeft: 1, paddingRight: 1 }}
>
{popoverJob.url}
</Typography>
<div className="flex flex-row w-full justify-end mb-1">
<Typography
variant="body2"
{popoverJob && (
<Box
sx={{
paddingLeft: 1,
paddingRight: 1,
color: theme.palette.mode === "dark" ? "#d3d7e6" : "#5b5d63",
fontStyle: "italic",
border:
theme.palette.mode === "light"
? "2px solid black"
: "2px solid white",
}}
>
{new Date(popoverJob.time_created).toLocaleString()}
</Typography>
</div>
</Box>
)}
</Popover>
<Typography
variant="body1"
sx={{ paddingLeft: 1, paddingRight: 1 }}
>
{popoverJob.url}
</Typography>
<div className="flex flex-row w-full justify-end mb-1">
<Typography
variant="body2"
sx={{
paddingLeft: 1,
paddingRight: 1,
color:
theme.palette.mode === "dark" ? "#d3d7e6" : "#5b5d63",
fontStyle: "italic",
}}
>
{popoverJob.time_created
? new Date(popoverJob.time_created).toLocaleString()
: "Unknown"}
</Typography>
</div>
</Box>
)}
</Popover>
</ClickAwayListener>
)}
</Box>
);
};

View File

@@ -0,0 +1,48 @@
import { Box, Link, Typography } from "@mui/material";
import { SetStateAction, Dispatch, useState } from "react";
import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog";
import { RawJobOptions } from "@/types";
/** Props for {@link AdvancedJobOptions}; all three are forwarded to the dialog. */
export type AdvancedJobOptionsProps = {
/** Current job options shown and edited in the dialog. */
jobOptions: RawJobOptions;
/** State setter the dialog uses to update the job options. */
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
/** Forwarded to the dialog; the component defaults it to `true` when omitted. */
multiPageScrapeEnabled?: boolean;
};
/**
 * Renders an "Advanced Job Options" text link and the dialog it opens.
 *
 * The component only owns the open/closed state of the dialog; the job
 * options themselves live in the parent and are passed straight through.
 */
export const AdvancedJobOptions = (props: AdvancedJobOptionsProps) => {
  const { jobOptions, setJobOptions, multiPageScrapeEnabled = true } = props;

  // Whether the advanced options dialog is currently visible.
  const [isDialogOpen, setIsDialogOpen] = useState(false);

  const showDialog = () => {
    setIsDialogOpen(true);
  };

  const hideDialog = () => {
    setIsDialogOpen(false);
  };

  return (
    <Box sx={{ mb: 2 }}>
      {/* Rendered as a <button> so the link is keyboard-activatable. */}
      <Link
        component="button"
        variant="body2"
        onClick={showDialog}
        sx={{
          textDecoration: "none",
          color: "primary.main",
          "&:hover": {
            color: "primary.dark",
            textDecoration: "underline",
          },
          paddingLeft: 1,
          display: "inline-flex",
          alignItems: "center",
          gap: 0.5,
        }}
      >
        <Typography variant="body2">Advanced Job Options</Typography>
      </Link>

      <AdvancedJobOptionsDialog
        open={isDialogOpen}
        onClose={hideDialog}
        jobOptions={jobOptions}
        setJobOptions={setJobOptions}
        multiPageScrapeEnabled={multiPageScrapeEnabled}
      />
    </Box>
  );
};

View File

@@ -0,0 +1,279 @@
import {
Accordion,
AccordionDetails,
AccordionSummary,
Box,
Checkbox,
Dialog,
DialogContent,
DialogTitle,
Divider,
FormControl,
FormControlLabel,
FormGroup,
IconButton,
TextField,
Tooltip,
Typography,
useTheme,
} from "@mui/material";
import {
ExpandMore as ExpandMoreIcon,
InfoOutlined,
Code as CodeIcon,
Settings,
} from "@mui/icons-material";
import { Dispatch, SetStateAction } from "react";
import { RawJobOptions } from "@/types";
import { ExpandedTableInput } from "../../expanded-table-input";
/** Props for {@link AdvancedJobOptionsDialog}. */
export type AdvancedJobOptionsDialogProps = {
/** Whether the dialog is shown. */
open: boolean;
/** Passed to the MUI Dialog as its `onClose` handler. */
onClose: () => void;
/** Current job options; drives the checkbox and proxy field values. */
jobOptions: RawJobOptions;
/** State setter used to write every change back to the parent. */
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
/** When `false`, the "Multi Page Scrape" checkbox is disabled. Defaults to `true`. */
multiPageScrapeEnabled?: boolean;
};
/**
 * Modal dialog for editing the advanced options of a scrape job.
 *
 * It renders two sections: "Collection Options" (the "Multi Page Scrape" and
 * "Collect Media" checkboxes) and "Custom Options" (a proxies text field plus
 * "Custom Headers" and "Custom Cookies" inputs). Every change is written back
 * through `setJobOptions` using a functional update, so the dialog keeps no
 * copy of the options itself.
 */
export const AdvancedJobOptionsDialog = ({
open,
onClose,
jobOptions,
setJobOptions,
multiPageScrapeEnabled = true,
}: AdvancedJobOptionsDialogProps) => {
const theme = useTheme();
// Flips the `multi_page_scrape` flag.
const handleMultiPageScrapeChange = () => {
setJobOptions((prevJobOptions) => ({
...prevJobOptions,
multi_page_scrape: !prevJobOptions.multi_page_scrape,
}));
};
// Stores the text field content as-is in `proxies`; no parsing happens here.
const handleProxiesChange = (e: React.ChangeEvent<HTMLInputElement>) => {
setJobOptions((prevJobOptions) => ({
...prevJobOptions,
proxies: e.target.value,
}));
};
// Flips the `collect_media` flag.
const handleCollectMediaChange = () => {
setJobOptions((prevJobOptions) => ({
...prevJobOptions,
collect_media: !prevJobOptions.collect_media,
}));
};
return (
<Dialog
open={open}
onClose={onClose}
maxWidth="md"
fullWidth
PaperProps={{
sx: {
borderRadius: 2,
boxShadow: "0 8px 32px rgba(0, 0, 0, 0.1)",
},
}}
>
<DialogTitle
sx={{
borderBottom: `1px solid ${theme.palette.divider}`,
backgroundColor: theme.palette.background.default,
color: theme.palette.primary.contrastText,
borderRadius: 2,
display: "flex",
alignItems: "center",
justifyContent: "space-between",
padding: "1rem 2rem",
marginRight: 2,
marginLeft: 2,
}}
>
<Typography variant="h6" component="div">
Advanced Job Options
</Typography>
<Settings
sx={{
color: theme.palette.primary.contrastText,
}}
/>
</DialogTitle>
<DialogContent
sx={{ padding: 3, overflowY: "auto", marginTop: 2, height: "60rem" }}
>
<FormControl fullWidth>
<Box sx={{ mb: 3 }}>
<Typography
variant="subtitle1"
sx={{
mb: 1,
fontWeight: "bold",
color: theme.palette.text.primary,
}}
>
Collection Options
</Typography>
<Divider sx={{ mb: 2, backgroundColor: theme.palette.divider }} />
<FormGroup row sx={{ gap: 4, mb: 1 }}>
<FormControlLabel
control={
<Checkbox
checked={jobOptions.multi_page_scrape}
onChange={handleMultiPageScrapeChange}
disabled={!multiPageScrapeEnabled}
/>
}
label={
<Box sx={{ display: "flex", alignItems: "center" }}>
<Typography>Multi Page Scrape</Typography>
<Tooltip
title={
multiPageScrapeEnabled
? "Enable crawling through multiple pages"
: "Multi page scrape is disabled"
}
>
<IconButton size="small">
<InfoOutlined fontSize="small" />
</IconButton>
</Tooltip>
</Box>
}
/>
<FormControlLabel
control={
<Checkbox
checked={jobOptions.collect_media}
onChange={handleCollectMediaChange}
data-cy="collect-media-checkbox"
/>
}
label={
<Box sx={{ display: "flex", alignItems: "center" }}>
<Typography>Collect Media</Typography>
<Tooltip title="Download images and other media">
<IconButton size="small">
<InfoOutlined fontSize="small" />
</IconButton>
</Tooltip>
</Box>
}
/>
</FormGroup>
</Box>
<Box sx={{ mb: 3 }}>
<Typography
variant="subtitle1"
sx={{
mb: 1,
fontWeight: "bold",
color: theme.palette.text.primary,
}}
>
Custom Options
</Typography>
<Divider sx={{ mb: 2, backgroundColor: theme.palette.divider }} />
{/* Proxies Section */}
<Accordion
defaultExpanded
elevation={0}
sx={{
mb: 2,
border: `1px solid ${theme.palette.divider}`,
"&:before": { display: "none" },
borderRadius: 1,
overflow: "hidden",
padding: 1,
}}
>
<AccordionSummary
expandIcon={<ExpandMoreIcon />}
sx={{
backgroundColor: theme.palette.background.paper,
borderBottom: `1px solid ${theme.palette.divider}`,
"&.Mui-expanded": {
borderBottom: `1px solid ${theme.palette.divider}`,
},
}}
>
<Box sx={{ display: "flex", alignItems: "center" }}>
<div
style={{
display: "flex",
alignItems: "center",
gap: "0.5rem",
}}
>
<Typography
sx={{
fontWeight: 500,
color: theme.palette.text.primary,
}}
>
Proxies
</Typography>
<Tooltip title="Comma separated list of proxies that should follow Playwright proxy format">
<InfoOutlined fontSize="small" />
</Tooltip>
</div>
</Box>
</AccordionSummary>
<AccordionDetails
sx={{ p: 2, backgroundColor: theme.palette.background.default }}
>
<TextField
placeholder='Proxies ([{"server": "proxy.example.com:8080", "username": "username", "password": "password"}])'
fullWidth
variant="outlined"
size="small"
value={jobOptions.proxies}
onChange={handleProxiesChange}
InputProps={{
startAdornment: (
<CodeIcon
sx={{ color: theme.palette.text.secondary, mr: 1 }}
/>
),
}}
/>
</AccordionDetails>
</Accordion>
{/* Custom Headers Section */}
<ExpandedTableInput
label="Custom Headers"
placeholder='{"User-Agent": "CustomAgent", "Accept": "*/*"}'
urlParam="custom_headers"
onChange={(value) => {
setJobOptions((prevJobOptions) => ({
...prevJobOptions,
custom_headers: value,
}));
}}
/>
{/* Custom Cookies Section */}
<ExpandedTableInput
label="Custom Cookies"
placeholder='[{"name": "value", "name2": "value2"}]'
urlParam="custom_cookies"
onChange={(value) => {
setJobOptions((prevJobOptions) => ({
...prevJobOptions,
custom_cookies: value,
}));
}}
/>
</Box>
</FormControl>
</DialogContent>
</Dialog>
);
};

View File

@@ -0,0 +1 @@
export * from "./advanced-job-options-dialog";

View File

@@ -0,0 +1 @@
export * from "./advanced-job-options";

View File

@@ -0,0 +1,166 @@
import React, { useState } from "react";
import {
Table,
TableBody,
TableCell,
TableContainer,
TableHead,
TableRow,
Paper,
Box,
Typography,
useTheme,
alpha,
} from "@mui/material";
/** One CSV record: an object whose string keys map to string cell values. */
export type CsvRow = Record<string, string>;
/** Props for the CsvTable component. */
export type CsvTableProps = {
/** Parsed CSV content to display. */
csv: {
/** Data rows; the table renders one body row per entry. */
rows: CsvRow[];
/** Column titles rendered in the table head. */
headers: string[];
};
/** Optional CSS class applied to the outer container. */
className?: string;
};
/**
 * Displays CSV data in a compact table with a sticky header.
 *
 * Clicking a data row toggles a detail row beneath it that shows the row's
 * `text` field with whitespace collapsed; at most one row is expanded at a
 * time. When `csv.rows` is empty a dashed placeholder panel is shown instead.
 */
export const CsvTable: React.FC<CsvTableProps> = ({ csv, className }) => {
  const [expandedRow, setExpandedRow] = useState<number | null>(null);
  const theme = useTheme();

  // Clicking the open row closes it; clicking any other row opens that one.
  const toggleRow = (index: number) => {
    setExpandedRow((current) => {
      if (current === index) {
        return null;
      }
      return index;
    });
  };

  // Text for the expanded detail panel: the row's `text` value flattened onto
  // a single line, or a fallback message when it is missing/empty.
  const detailText = (record: CsvRow) => {
    if (!record.text) {
      return "No text available";
    }
    const withoutBreaks = record.text.replace(/[\n\t\r]+/g, " ");
    return withoutBreaks.replace(/\s+/g, " ").trim();
  };

  // Primary colour at the given opacity, used for striping and hover states.
  const primaryTint = (opacity: number) =>
    alpha(theme.palette.primary.main, opacity);

  const renderHeaderCells = () => {
    const headerCellSx = {
      fontWeight: "bold",
      cursor: "pointer",
      whiteSpace: "nowrap",
      backgroundColor: theme.palette.background.paper,
      color: theme.palette.text.primary,
      "&:hover": {
        backgroundColor: primaryTint(0.1),
      },
      p: { xs: 1, sm: 2 },
    } as const;

    return csv.headers.map((heading, headingIndex) => (
      <TableCell key={headingIndex} sx={headerCellSx}>
        {heading}
      </TableCell>
    ));
  };

  const renderBodyRows = () => {
    const dataRowSx = {
      "&:nth-of-type(odd)": {
        backgroundColor: primaryTint(0.02),
      },
      "&:hover": {
        backgroundColor: primaryTint(0.04),
      },
      cursor: "pointer",
    } as const;

    const dataCellSx = {
      whiteSpace: "nowrap",
      maxWidth: { xs: "150px", sm: "200px", md: "200px" },
      overflow: "hidden",
      textOverflow: "ellipsis",
      p: { xs: 1, sm: 2 },
    } as const;

    return csv.rows.map((record, position) => (
      <React.Fragment key={position}>
        <TableRow onClick={() => toggleRow(position)} sx={dataRowSx}>
          {Object.values(record).map((cell, cellIndex) => (
            <TableCell key={cellIndex} sx={dataCellSx}>
              {cell}
            </TableCell>
          ))}
        </TableRow>
        {expandedRow === position ? (
          <TableRow>
            <TableCell colSpan={csv.headers.length} sx={{ padding: 2 }}>
              <Paper
                sx={{
                  padding: 2,
                  backgroundColor: alpha(theme.palette.background.paper, 0.5),
                }}
              >
                <Typography variant="body2" color="text.secondary">
                  {detailText(record)}
                </Typography>
              </Paper>
            </TableCell>
          </TableRow>
        ) : null}
      </React.Fragment>
    ));
  };

  const renderTable = () => (
    <TableContainer
      sx={{
        flex: 1,
        overflow: "auto",
        borderRadius: theme.shape.borderRadius,
        boxShadow: theme.shadows[1],
      }}
    >
      <Table stickyHeader size="small" aria-label="csv data table">
        <TableHead>
          <TableRow>{renderHeaderCells()}</TableRow>
        </TableHead>
        <TableBody>{renderBodyRows()}</TableBody>
      </Table>
    </TableContainer>
  );

  const renderPlaceholder = () => (
    <Paper
      sx={{
        p: 4,
        display: "flex",
        justifyContent: "center",
        alignItems: "center",
        height: "100%",
        borderRadius: theme.shape.borderRadius,
        backgroundColor: alpha(theme.palette.background.paper, 0.5),
        border: `1px dashed ${theme.palette.divider}`,
      }}
    >
      <Typography color="text.secondary">No data available</Typography>
    </Paper>
  );

  return (
    <Box
      sx={{
        height: "100%",
        display: "flex",
        flexDirection: "column",
        overflow: "hidden",
        width: "100%",
      }}
      className={className}
    >
      {csv.rows.length === 0 ? renderPlaceholder() : renderTable()}
    </Box>
  );
};

View File

@@ -0,0 +1 @@
export * from "./csv-table";

View File

@@ -0,0 +1,29 @@
import { Box } from "@mui/material";
/** Props accepted by the `Disabled` component. */
export type DisabledProps = {
/** Text rendered inside the centred banner. */
message: string;
};
/**
 * Full-viewport-height screen that centres a single message banner.
 *
 * @param message - Text shown inside the banner.
 */
export const Disabled = ({ message }: DisabledProps) => {
  // Inline styles for the dark, rounded banner that holds the message.
  const bannerStyle = {
    color: "#fff",
    padding: "20px",
    borderRadius: "8px",
    background: "rgba(0, 0, 0, 0.6)",
    boxShadow: "0 4px 8px rgba(0, 0, 0, 0.2)",
  };

  return (
    <Box
      bgcolor="background.default"
      minHeight="100vh"
      display="flex"
      justifyContent="center"
      alignItems="center"
    >
      <h4 style={bannerStyle}>{message}</h4>
    </Box>
  );
};

View File

@@ -0,0 +1 @@
export * from "./disabled";

View File

@@ -0,0 +1,204 @@
import {
Accordion,
AccordionSummary,
TableCell,
TableRow,
Paper,
TableBody,
useTheme,
TextField,
Box,
Typography,
AccordionDetails,
TableHead,
TableContainer,
Table,
} from "@mui/material";
import { useEffect, useState } from "react";
import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
import { parseJsonToEntries } from "@/lib/helpers/parse-json-to-entries";
/** Props accepted by the `ExpandedTableInput` component. */
export type ExpandedTableInputProps = {
/** Title shown in the accordion summary. */
label: string;
/** Called with the parsed JSON value after each edit (and after prefilling from the URL), or with `null` when the input is empty or invalid. */
onChange: (value: any) => void;
/** Placeholder text for the JSON text field. */
placeholder: string;
/** Key looked up inside the `job_options` URL query parameter to prefill the field. */
urlParam: string;
};
/**
 * Accordion containing a JSON text field plus a read-only preview table of
 * the parsed key/value entries.
 *
 * On mount (and whenever `urlParam` changes) the field is prefilled from the
 * `job_options` query parameter of the current URL, using the entry stored
 * under `urlParam`. Every edit is validated: `onChange` receives the parsed
 * JSON value, or `null` when the text is empty or not valid.
 *
 * @param label - Title shown in the accordion summary.
 * @param onChange - Receives the parsed JSON value or `null`.
 * @param placeholder - Placeholder text for the text field.
 * @param urlParam - Key inside the `job_options` query parameter to read.
 */
export const ExpandedTableInput = ({
  label,
  onChange,
  placeholder,
  urlParam,
}: ExpandedTableInputProps) => {
  const theme = useTheme();
  const [value, setValue] = useState("");
  const [parsedHeaders, setParsedHeaders] = useState<[string, string][] | null>(
    null
  );
  const [jsonError, setJsonError] = useState<string | null>(null);

  // Hide both the preview table and any validation message.
  const resetParsedState = () => {
    setParsedHeaders(null);
    setJsonError(null);
  };

  // Validates `val` as JSON, updates the preview/error state, and returns the
  // parsed value (or null when the text is empty or invalid).
  const validateAndParse = (val: string) => {
    if (val.trim() === "") {
      resetParsedState();
      return null;
    }

    try {
      const parsed = JSON.parse(val);
      const entries = parseJsonToEntries(val);

      if (entries === null) {
        setParsedHeaders(null);
        setJsonError("Invalid JSON object");
        return null;
      }

      setParsedHeaders(entries);
      setJsonError(null);
      return parsed;
    } catch {
      setParsedHeaders(null);
      setJsonError("Invalid JSON format");
      return null;
    }
  };

  const handleChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const val = e.target.value;
    setValue(val);

    const parsed = validateAndParse(val);
    onChange(parsed);
  };

  useEffect(() => {
    // Read the query string inside the effect so rendering never touches
    // `window`, which throws where it is undefined (e.g. server-side render).
    const jobOptions = new URLSearchParams(window.location.search).get(
      "job_options"
    );

    if (!jobOptions) {
      resetParsedState();
      return;
    }

    // The query string is user-controlled; a malformed value must not crash
    // the component, so it is treated the same as "no options supplied".
    let jobOptionsObject;
    try {
      jobOptionsObject = JSON.parse(jobOptions);
    } catch {
      resetParsedState();
      return;
    }

    let val = jobOptionsObject?.[urlParam];

    // The key may be absent (undefined) or explicitly null; check that before
    // dereferencing `.length` / `Object.keys`.
    if (val == null || val.length === 0 || Object.keys(val).length === 0) {
      resetParsedState();
      return;
    }

    if (typeof val === "string") {
      try {
        val = JSON.parse(val);
      } catch {
        // Not JSON: keep the raw string so the user sees the validation error.
      }
    }

    const finalVal =
      typeof val === "string" ? val : val != null ? JSON.stringify(val) : "";

    setValue(finalVal);
    const parsed = validateAndParse(finalVal);
    onChange(parsed);
    // NOTE(review): `onChange` is not listed as a dependency, so this effect
    // re-runs only when `urlParam` changes.
  }, [urlParam]);

  return (
    <Accordion
      defaultExpanded
      elevation={0}
      sx={{
        mb: 2,
        border: `1px solid ${theme.palette.divider}`,
        "&:before": { display: "none" },
        borderRadius: 1,
        overflow: "hidden",
        padding: 1,
      }}
    >
      <AccordionSummary
        expandIcon={<ExpandMoreIcon />}
        sx={{
          backgroundColor: theme.palette.background.paper,
          borderBottom: `1px solid ${theme.palette.divider}`,
          "&.Mui-expanded": {
            borderBottom: `1px solid ${theme.palette.divider}`,
          },
        }}
      >
        <Box sx={{ display: "flex", alignItems: "center" }}>
          <Typography
            sx={{ fontWeight: 500, color: theme.palette.text.primary }}
          >
            {label}
          </Typography>
        </Box>
      </AccordionSummary>
      <AccordionDetails
        sx={{ p: 2, backgroundColor: theme.palette.background.default }}
      >
        <TextField
          placeholder={placeholder}
          value={value}
          onChange={handleChange}
          fullWidth
          variant="outlined"
          size="small"
          error={jsonError !== null}
          helperText={jsonError ?? ""}
        />
        {parsedHeaders && parsedHeaders.length > 0 && (
          <Paper
            variant="outlined"
            sx={{
              marginTop: 1,
              border: `1px solid ${theme.palette.divider}`,
              borderRadius: 1,
              overflow: "hidden",
              padding: 0,
            }}
          >
            <TableContainer sx={{ maxHeight: 200 }}>
              <Table size="small" stickyHeader>
                <TableHead>
                  <TableRow
                    sx={{
                      backgroundColor: theme.palette.background.paper,
                    }}
                  >
                    <TableCell sx={{ fontWeight: "bold" }}>Header</TableCell>
                    <TableCell sx={{ fontWeight: "bold" }}>Value</TableCell>
                  </TableRow>
                </TableHead>
                <TableBody>
                  {parsedHeaders.map(([key, val]) => (
                    <TableRow
                      key={key}
                      hover
                      sx={{
                        "&:nth-of-type(odd)": {
                          backgroundColor:
                            theme.palette.mode === "light"
                              ? "rgba(0, 0, 0, 0.02)"
                              : "rgba(255, 255, 255, 0.02)",
                        },
                      }}
                    >
                      <TableCell sx={{ fontWeight: 500 }}>{key}</TableCell>
                      <TableCell>{val}</TableCell>
                    </TableRow>
                  ))}
                </TableBody>
              </Table>
            </TableContainer>
          </Paper>
        )}
      </AccordionDetails>
    </Accordion>
  );
};

View File

@@ -0,0 +1 @@
export * from "./expanded-table-input";

View File

@@ -0,0 +1 @@
export * from "./job-download-dialog";

View File

@@ -0,0 +1,95 @@
import {
Dialog,
DialogTitle,
DialogContent,
DialogActions,
Button,
FormControl,
RadioGroup,
FormControlLabel,
Radio,
FormLabel,
Typography,
Box,
} from "@mui/material";
import { useState } from "react";
/** Props accepted by the `JobDownloadDialog` component. */
export type JobDownloadDialogProps = {
/** Whether the dialog is currently shown. */
open: boolean;
/** Passed to the underlying `Dialog` as its `onClose` handler. */
onClose: () => void;
/** Job ids sent in the download request; the first id is used in the downloaded file's name. */
ids: string[];
};
/**
 * Dialog that lets the user pick an export format (CSV or Markdown) and
 * download the selected jobs.
 *
 * The download is requested with a POST to `/api/download`; the response body
 * is saved through a temporary object URL as `job_<first id>.<format>`.
 *
 * @param open - Whether the dialog is shown.
 * @param onClose - Forwarded to the `Dialog` as its close handler.
 * @param ids - Job ids to include in the download request.
 */
export const JobDownloadDialog = ({
  open,
  onClose,
  ids,
}: JobDownloadDialogProps) => {
  const [jobFormat, setJobFormat] = useState<string>("csv");

  const handleDownload = async () => {
    try {
      const response = await fetch("/api/download", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ data: { ids: ids, job_format: jobFormat } }),
      });

      if (!response.ok) {
        console.error("Failed to download the file.");
        return;
      }

      const blob = await response.blob();
      const url = window.URL.createObjectURL(blob);
      const a = document.createElement("a");
      a.style.display = "none";
      a.href = url;
      a.download = `job_${ids[0]}.${jobFormat}`;
      document.body.appendChild(a);

      try {
        a.click();
      } finally {
        // Always release the object URL and the temporary anchor, even if
        // triggering the click throws.
        window.URL.revokeObjectURL(url);
        document.body.removeChild(a);
      }
    } catch (error) {
      // Network failures reject the fetch promise; this runs from a click
      // handler, so without a catch it would be an unhandled rejection.
      console.error("Failed to download the file.", error);
    }
  };

  return (
    <Dialog open={open} onClose={onClose}>
      <DialogTitle>Download Job</DialogTitle>
      <DialogContent>
        <FormControl>
          <Typography variant="body1">
            You are about to download {ids.length} job(s). Please select the
            format that you would like to download them in.
          </Typography>
          <br />
          <Box
            sx={{
              display: "flex",
              flexDirection: "column",
              backgroundColor: "background.paper",
              padding: 2,
              border: "1px solid",
            }}
          >
            {/* The id gives the RadioGroup's aria-labelledby a real target. */}
            <FormLabel id="job-download-format-radio-buttons">Format</FormLabel>
            <hr style={{ width: "100%", margin: "10px 0" }} />
            <RadioGroup
              aria-labelledby="job-download-format-radio-buttons"
              name="job-download-format-radio-buttons"
              value={jobFormat}
              onChange={(e) => setJobFormat(e.target.value)}
            >
              <FormControlLabel value="csv" control={<Radio />} label="CSV" />
              <FormControlLabel
                value="md"
                control={<Radio />}
                label="Markdown"
              />
            </RadioGroup>
          </Box>
          <br />
          <Button onClick={handleDownload} size="small">
            Download
          </Button>
        </FormControl>
      </DialogContent>
    </Dialog>
  );
};

View File

@@ -0,0 +1,40 @@
import { Box, Typography } from "@mui/material";
/** Props accepted by the `AudioViewer` component. */
interface AudioViewerProps {
/** URL used as the `src` of the audio `<source>` element. */
mediaUrl: string;
/** Text displayed as the heading above the player. */
selectedMedia: string;
/** Attached as the `onError` handler of the `<audio>` element. */
onError: () => void;
}
/**
 * Centred audio player with the selected file's name as a heading.
 *
 * @param mediaUrl - URL of the audio resource to play.
 * @param selectedMedia - Text shown as the heading above the player.
 * @param onError - Invoked when the audio fails to load or play.
 */
export const AudioViewer = ({
  mediaUrl,
  selectedMedia,
  onError,
}: AudioViewerProps) => {
  // Two details of <audio> with a <source> child are handled here:
  //  * Changing `src` on a <source> that is already inserted does not reload
  //    the media, so the element is keyed by URL to remount when the URL
  //    changes.
  //  * A resource that fails to load fires `error` on the <source> element,
  //    and that event does not bubble to <audio>; the handler is therefore
  //    attached to both elements.
  return (
    <Box
      sx={{
        display: "flex",
        justifyContent: "center",
        alignItems: "center",
        flexDirection: "column",
        height: "100%",
        gap: 2,
      }}
    >
      <Typography variant="h6">{selectedMedia}</Typography>
      <audio
        key={mediaUrl}
        controls
        onError={onError}
        style={{
          width: "80%",
          maxWidth: "500px",
        }}
      >
        <source src={mediaUrl} type="audio/mpeg" onError={onError} />
        Your browser does not support the audio element.
      </audio>
    </Box>
  );
};

View File

@@ -0,0 +1 @@
export * from "./audio-viewer";

View File

@@ -0,0 +1,36 @@
import { Box, useTheme } from "@mui/material";
/**
 * Shows an image scaled to fit inside its container, centred both ways.
 *
 * @param mediaUrl - URL used as the image `src`.
 * @param selectedMedia - Used as the image's `alt` text.
 */
export const ImageViewer = ({
  mediaUrl,
  selectedMedia,
}: {
  mediaUrl: string;
  selectedMedia: string;
}) => {
  const theme = useTheme();
  const dropShadow = theme.shadows[4];

  // Centring frame that clips anything overflowing the available area.
  const frameSx = {
    display: "flex",
    justifyContent: "center",
    alignItems: "center",
    height: "100%",
    width: "100%",
    overflow: "hidden",
    position: "relative",
  } as const;

  return (
    <Box sx={frameSx}>
      <img
        src={mediaUrl}
        alt={selectedMedia}
        style={{
          maxHeight: "100%",
          maxWidth: "100%",
          objectFit: "contain",
          borderRadius: "4px",
          boxShadow: dropShadow,
        }}
      />
    </Box>
  );
};

View File

@@ -0,0 +1 @@
export * from "./image-viewer";

View File

@@ -0,0 +1 @@
export * from "./media-viewer";

View File

@@ -0,0 +1,75 @@
import { Box, Typography } from "@mui/material";
import { ImageViewer } from "./image";
import { VideoViewer } from "./video";
import { AudioViewer } from "./audio";
import { PDFViewer } from "./pdf-viewer";
/** Props accepted by the `MediaViewer` component. */
interface MediaViewerProps {
/** Name of the file to display; a falsy value shows the "Select a file to view" placeholder. */
selectedMedia: string;
/** Chooses the viewer: "images", "videos", "audio" or "pdfs"; any other value shows a download notice. */
activeTab: string;
/** Called with `selectedMedia` to obtain the URL handed to the viewer. */
getMediaUrl: (fileName: string) => string;
/** Called with "Error loading video" / "Error loading audio" when the respective viewer reports an error. */
onError: (error: string) => void;
}
/**
 * Picks the viewer matching the active tab and renders the selected file.
 *
 * With no file selected a centred prompt is shown. For tabs other than
 * "images", "videos", "audio" and "pdfs" a centred notice asks the user to
 * download the file instead.
 *
 * @param selectedMedia - Name of the file to show.
 * @param activeTab - Tab identifier that decides which viewer is used.
 * @param getMediaUrl - Resolves a file name to its URL.
 * @param onError - Receives a message when the video or audio viewer errors.
 */
export const MediaViewer = ({
  selectedMedia,
  activeTab,
  getMediaUrl,
  onError,
}: MediaViewerProps) => {
  // Layout shared by the two text-only states (prompt and download notice).
  const centeredSx = {
    display: "flex",
    justifyContent: "center",
    alignItems: "center",
    height: "100%",
  } as const;

  if (!selectedMedia) {
    return (
      <Box sx={centeredSx}>
        <Typography variant="body1" color="textSecondary">
          Select a file to view
        </Typography>
      </Box>
    );
  }

  const mediaUrl = getMediaUrl(selectedMedia);

  if (activeTab === "images") {
    return <ImageViewer mediaUrl={mediaUrl} selectedMedia={selectedMedia} />;
  }

  if (activeTab === "videos") {
    const reportVideoError = () => onError("Error loading video");
    return <VideoViewer mediaUrl={mediaUrl} onError={reportVideoError} />;
  }

  if (activeTab === "audio") {
    const reportAudioError = () => onError("Error loading audio");
    return (
      <AudioViewer
        mediaUrl={mediaUrl}
        selectedMedia={selectedMedia}
        onError={reportAudioError}
      />
    );
  }

  if (activeTab === "pdfs") {
    return <PDFViewer mediaUrl={mediaUrl} selectedMedia={selectedMedia} />;
  }

  // Unknown tab: no inline viewer is available for this file.
  return (
    <Box sx={centeredSx}>
      <Typography variant="body1">
        {selectedMedia} - Download this file to view it
      </Typography>
    </Box>
  );
};

Some files were not shown because too many files have changed in this diff Show More