66 Commits

Author SHA1 Message Date
github-actions[bot]
44ccad1935 chore: bump version to 1.1.6
Some checks failed
Merge / version (push) Has been cancelled
Merge / build-and-deploy (push) Has been cancelled
2025-07-13 02:13:39 +00:00
Jayden Pyles
308759d70c chore: bump version 2025-07-12 21:13:37 -05:00
github-actions[bot]
6bf130dd4b chore: bump version to 1.1.6 2025-07-13 02:12:41 +00:00
Jayden Pyles
875a3684c9 Feat/swap to sqlalchemy (#99)
* chore: wip swap to sqlalchemy

* feat: swap to sqlalchemy

* feat: swap to sqlalchemy

* feat: swap to sqlalchemy

* feat: swap to sqlalchemy
2025-07-12 21:12:33 -05:00
github-actions[bot]
b096fb1b3c chore: bump version to 1.1.5
Some checks failed
Merge / version (push) Has been cancelled
Merge / build-and-deploy (push) Has been cancelled
2025-07-05 16:30:10 +00:00
Jayden Pyles
5f65125882 chore: push for arm64 2025-07-05 11:30:01 -05:00
github-actions[bot]
327db34683 chore: bump version to 1.1.5 2025-07-05 16:03:12 +00:00
Jayden Pyles
8d0f362a70 chore: push for arm64 2025-07-05 10:47:02 -05:00
github-actions[bot]
24f4b57fea chore: bump version to 1.1.4
Some checks failed
Merge / version (push) Has been cancelled
Merge / build-and-deploy (push) Has been cancelled
2025-06-18 23:06:20 +00:00
Gaurav Agnihotri
1c0dec6db6 fix: pin browserforge version to 1.2.1 (#93)
Co-authored-by: gauravagnihotri <gaagniho@mtu.edu>
2025-06-18 18:06:10 -05:00
github-actions[bot]
e9c60f6338 chore: bump version to 1.1.3
Some checks failed
Merge / version (push) Has been cancelled
Merge / build-and-deploy (push) Has been cancelled
2025-06-12 23:06:34 +00:00
Jayden Pyles
5719a85491 chore: update chart version 2025-06-12 18:07:55 -05:00
github-actions[bot]
052d80de07 chore: bump version to 1.1.3 2025-06-12 23:03:26 +00:00
Jayden Pyles
7047a3c0e3 chore: update chart version 2025-06-12 18:04:47 -05:00
github-actions[bot]
71f603fc62 chore: bump version to 1.1.3 2025-06-12 23:01:57 +00:00
Jayden Pyles
86a77a27df chore: update chart version 2025-06-12 18:03:20 -05:00
github-actions[bot]
b11e263b93 chore: bump version to 1.1.3 2025-06-12 23:00:47 +00:00
Jayden Pyles
91dc13348d feat: add import/export for job configurations (#91)
* chore: wip add upload/import

* chore: wip add upload/import

* feat: update job rerunning

* fix: update workflow

* fix: update workflow

* chore: temp disable workflow
2025-06-12 18:00:39 -05:00
github-actions[bot]
93b0c83381 chore: bump version to 1.1.2
Some checks failed
Merge / tests (push) Has been cancelled
Merge / version (push) Has been cancelled
Merge / build-and-deploy (push) Has been cancelled
2025-06-08 23:24:17 +00:00
Jayden Pyles
9381ba9232 chore: update workflow 2025-06-08 18:17:03 -05:00
Jayden Pyles
20dccc5527 feat: edit ui + add return html option (#90)
* fix: restyle the element table

* chore: wip ui

* wip: edit styles

* feat: add html return

* fix: build

* fix: workflow

* fix: workflow

* fix: workflow

* fix: workflow

* fix: workflow

* fix: workflow

* fix: workflow

* fix: cypress test

* chore: update photo [skip ci]
2025-06-08 18:14:02 -05:00
Jayden Pyles
02619eb184 feat: update workflows [no bump]
Some checks failed
Merge / tests (push) Has been cancelled
Merge / version (push) Has been cancelled
Merge / build-and-deploy (push) Has been cancelled
2025-06-05 22:19:41 -05:00
github-actions[bot]
58c6c09fc9 chore: bump version to 1.1.2 2025-06-06 03:18:03 +00:00
Jayden Pyles
bf896b4c6b feat: update workflows [no bump] 2025-06-05 22:09:49 -05:00
Jayden Pyles
e3b9c11ab7 feat: update workflows [no bump] 2025-06-05 21:59:54 -05:00
github-actions[bot]
32da3375b3 chore: bump version to 1.1.1 2025-06-06 02:56:42 +00:00
Jayden Pyles
b5131cbe4c feat: update workflows [no bump] 2025-06-05 21:47:55 -05:00
github-actions[bot]
47c4c9a7d1 chore: bump version to 1.1.1
Some checks failed
Merge / tests (push) Has been cancelled
Merge / version (push) Has been cancelled
Merge / build-and-deploy (push) Has been cancelled
2025-06-05 01:48:00 +00:00
Jayden Pyles
4352988666 feat: update workflows 2025-06-04 20:40:16 -05:00
Jayden Pyles
00759151e6 feat: update workflos 2025-06-04 20:35:06 -05:00
github-actions[bot]
bfae00ca72 chore: bump version to 1.1.1 2025-06-04 23:06:51 +00:00
Jayden Pyles
e810700569 chore: remove deprecated output version 2025-06-04 17:58:49 -05:00
Jayden Pyles
9857fa96e0 fix: message [skip ci] 2025-06-03 17:12:51 -05:00
github-actions[bot]
b52fbc538d chore: bump version to 1.1.1
Some checks failed
Merge / tests (push) Has been cancelled
Merge / version (push) Has been cancelled
Merge / build-and-deploy (push) Has been cancelled
2025-06-03 16:27:54 +00:00
Jayden Pyles
42c0f3ae79 feat: add auto deploy workflow 2025-06-03 11:20:13 -05:00
github-actions[bot]
9aab2f9b4f chore: bump version to 1.1.1 2025-06-03 15:55:42 +00:00
Jayden Pyles
e182d3e4b8 feat: add auto deploy workflow 2025-06-03 10:48:02 -05:00
github-actions[bot]
53f35989f5 chore: bump version to 1.1.1
Some checks failed
Merge / tests (push) Has been cancelled
Merge / version (push) Has been cancelled
Merge / build-and-deploy (push) Has been cancelled
2025-06-03 03:06:41 +00:00
Jayden Pyles
a67ab34cfa feat: add auto deploy workflow 2025-06-02 21:59:06 -05:00
Jayden Pyles
3bf6657191 Merge branch 'master' of github.com:jaypyles/Scraperr 2025-06-02 21:58:04 -05:00
Jayden Pyles
c38d19a0ca feat: add auto deploy workflow 2025-06-02 21:57:44 -05:00
github-actions[bot]
a53e7e1aa1 chore: bump version to 1.1.1 2025-06-03 02:46:40 +00:00
Jayden Pyles
84368b1f6d feat: add auto deploy workflow 2025-06-02 21:38:25 -05:00
github-actions[bot]
ce4c1ceaa7 chore: bump version to 1.1.1 2025-06-03 02:27:38 +00:00
Jayden Pyles
7e1ce58bb8 feat: add auto deploy workflow 2025-06-02 21:19:43 -05:00
github-actions[bot]
175e7d63bf chore: bump version to 1.1.1 2025-06-03 01:52:57 +00:00
Jayden Pyles
d2c06de247 feat: add auto deploy workflow 2025-06-02 20:45:06 -05:00
github-actions[bot]
e0159bf9d4 chore: bump version to 1.1.1 2025-06-03 01:39:05 +00:00
Jayden Pyles
6d574ddfd2 feat: add auto deploy workflow 2025-06-02 20:31:34 -05:00
github-actions[bot]
b089d72786 chore: bump version to 2025-06-03 01:02:11 +00:00
Jayden Pyles
9ee4d577fd feat: add auto deploy workflow 2025-06-02 19:54:33 -05:00
Jayden Pyles
cddce5164d feat: add auto deploy workflow 2025-06-02 19:45:49 -05:00
Jayden Pyles
bf3163bfba feat: add auto deploy workflow 2025-06-02 19:09:57 -05:00
Jayden Pyles
54b513e92c feat: auto deploy 2025-06-02 19:08:18 -05:00
Jayden Pyles
6c56f2f161 Chore: app refactor (#88)
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
Unit Tests / cypress-tests (push) Has been cancelled
Unit Tests / success-message (push) Has been cancelled
* chore: refactor wip

* chore: refactor wip

* chore: refactor wip

* chore: refactor wip

* chore: refactor wip

* chore: refactor wip

* chore: work in progress

* chore: work in progress

* chore: work in progress

* chore: work in progress

* chore: work in progress

* chore: work in progress

* chore: work in progress

* chore: work in progress

* chore: work in progress

* chore: refactor wip

* chore: work in progress

* chore: refactor wip

* chore: refactor wip

* chore: refactor wip

* fix: build

* fix: cypress test

* fix: cypress test

* fix: cypress test

* fix: cypress test

* fix: cypress test

* fix: cypress test

* fix: cypress test

* fix: cypress test

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests

* fix: cypress tests
2025-06-01 15:56:15 -05:00
Jayden Pyles
d4edb9d93e chore: update chart version [skip ci] 2025-05-19 20:46:19 -05:00
Jayden Pyles
5ebd96b62b feat: add agent mode (#81)
* chore: wip agent mode

* wip: add agent mode frontend

* wip: add agent mode frontend

* chore: cleanup code

* chore: cleanup code

* chore: cleanup code
2025-05-19 20:44:41 -05:00
Jayden Pyles
d602d3330a fix: site map
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
Unit Tests / cypress-tests (push) Has been cancelled
Unit Tests / success-message (push) Has been cancelled
2025-05-17 17:05:37 -05:00
Jayden Pyles
6639e8b48f chore: update chart version [skip ci] 2025-05-17 16:33:18 -05:00
Jayden Pyles
263e46ba4d feat: add media viewer + other fixes (#79)
* feat: add media viewer + other fixes

* chore: remove logging [skip ci]

* chore: remove logging [skip ci]

* feat: add unit test for media

* feat: add unit test for media

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* chore: update docs [skip ci]
2025-05-17 16:31:34 -05:00
Jayden Pyles
f815a58efc chore: update docker version [skip ci] 2025-05-16 22:04:46 -05:00
Jayden Pyles
50ec5df657 chore: update chart version [skip ci] 2025-05-16 21:39:04 -05:00
Jayden Pyles
28de0f362c feat: add recording viewer and vnc (#78)
* feat: add recording viewer and vnc

* feat: add recording viewer and vnc

* feat: add recording viewer and vnc

* feat: add recording viewer and vnc

* chore: update gitignore [skip ci]

* chore: update dev compose [skip ci]

* fix: only run manually
2025-05-16 21:37:09 -05:00
Jayden Pyles
6b33723cac feat: update version
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
Unit Tests / cypress-tests (push) Has been cancelled
Unit Tests / success-message (push) Has been cancelled
2025-05-16 14:15:53 -05:00
Jayden Pyles
5c89e4d7d2 feat: allow custom cookies (#77)
* feat: working new advanced job options

* feat: working new advanced job options

* feat: add tests for adding custom cookies/headers
2025-05-16 14:13:58 -05:00
Jayden Pyles
ed0828a585 fix: deployment
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
Unit Tests / cypress-tests (push) Has been cancelled
Unit Tests / success-message (push) Has been cancelled
2025-05-13 21:03:21 -05:00
208 changed files with 13572 additions and 14290 deletions

4
.dockerignore Normal file
View File

@@ -0,0 +1,4 @@
node_modules
npm-debug.log
Dockerfile
.dockerignore

View File

@@ -5,6 +5,9 @@ inputs:
app-repo-token:
required: true
description: "The token for the target repository"
version:
required: true
description: "The version of the Helm chart"
runs:
using: 'composite'
@@ -15,6 +18,11 @@ runs:
- name: Set up Helm
uses: azure/setup-helm@v3
- name: Update Helm chart version
run: |
sed -i "s/^version: .*/version: ${{ inputs.version }}/" helm/Chart.yaml
shell: bash
- name: Package Helm chart
run: |
mkdir -p packaged

View File

@@ -2,6 +2,13 @@ name: Run Cypress Tests
description: Run Cypress tests
inputs:
openai_key:
description: "OpenAI API key"
required: true
default: ""
runs:
using: "composite"
steps:
@@ -13,13 +20,25 @@ runs:
with:
node-version: 22
- name: Setup yarn
shell: bash
run: npm install -g yarn
- name: Install xvfb for headless testing
shell: bash
run: |
sudo apt-get update
sudo apt-get install -y xvfb libnss3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libasound2t64 libpango-1.0-0 libcairo2 libgtk-3-0 libgdk-pixbuf2.0-0 libx11-6 libx11-xcb1 libxcb1 libxss1 libxtst6 libnspr4
- name: Setup Docker project
shell: bash
run: make build up-dev
run: |
export OPENAI_KEY="${{ inputs.openai_key }}"
make build-ci up-ci
- name: Install dependencies
shell: bash
run: npm install
run: yarn install
- name: Wait for frontend to be ready
shell: bash
@@ -54,5 +73,8 @@ runs:
- name: Run Cypress tests
shell: bash
run: npm run cy:run
run: |
set -e
npm run cy:run

31
.github/workflows/cypress-tests.yml vendored Normal file
View File

@@ -0,0 +1,31 @@
name: Cypress Tests
on:
workflow_call:
secrets:
openai_key:
required: true
jobs:
cypress-tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Run Cypress Tests
id: run-tests
uses: ./.github/actions/run-cypress-tests
with:
openai_key: ${{ secrets.openai_key }}
- name: Check container logs on failure
if: steps.run-tests.conclusion == 'failure'
run: |
echo "Cypress tests failed. Dumping container logs..."
docker logs scraperr_api || true
- name: Fail job if Cypress failed
if: steps.run-tests.conclusion == 'failure'
run: exit 1

View File

@@ -1,24 +1,36 @@
name: Docker Image
on:
workflow_run:
workflows: ["Unit Tests"]
types:
- completed
workflow_dispatch:
workflow_call:
inputs:
version:
required: true
type: string
secrets:
dockerhub_username:
required: true
dockerhub_token:
required: true
repo_token:
required: true
discord_webhook_url:
required: true
jobs:
build:
if: ${{ github.event.workflow_run.conclusion == 'success' && github.ref == 'refs/heads/master' }}
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Get version from helm chart
- name: Echo version
run: |
VERSION=$(grep "version:" ./helm/Chart.yaml | cut -d: -f2 | tr -d ' ')
echo "VERSION=$VERSION" >> $GITHUB_ENV
echo "Version is $VERSION"
echo "Version is ${{ inputs.version }}"
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v3
@@ -26,28 +38,27 @@ jobs:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Build and push frontend
- name: Build and push frontend (multi-arch)
uses: docker/build-push-action@v5
with:
context: .
file: ./docker/frontend/Dockerfile
push: true
platforms: linux/amd64,linux/arm64
tags: |
${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:latest
${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:${{ env.VERSION }}
${{ secrets.DOCKERHUB_USERNAME }}/scraperr:latest
${{ secrets.DOCKERHUB_USERNAME }}/scraperr:${{ inputs.version }}
- name: Build and push api
- name: Build and push api (multi-arch)
uses: docker/build-push-action@v5
with:
context: .
file: ./docker/api/Dockerfile
push: true
platforms: linux/amd64,linux/arm64
tags: |
${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:latest
${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:${{ env.VERSION }}
${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest
${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:${{ inputs.version }}
push-helm-chart:
runs-on: ubuntu-latest
@@ -59,7 +70,8 @@ jobs:
- name: Push Helm Chart
uses: ./.github/actions/push-to-helm
with:
app-repo-token: ${{ secrets.GPAT_TOKEN }}
app-repo-token: ${{ secrets.repo_token }}
version: ${{ inputs.version }}
success-message:
runs-on: ubuntu-latest
@@ -71,7 +83,7 @@ jobs:
uses: jaypyles/discord-webhook-action@v1.0.0
with:
webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }}
content: "Scraperr Successfully Built Docker Images"
content: "Scraperr Successfully Built Docker Images (v${{ inputs.version }})"
username: "Scraperr CI"
embed-title: "✅ Deployment Status"
embed-description: "Scraperr successfully built docker images."

35
.github/workflows/merge.yml vendored Normal file
View File

@@ -0,0 +1,35 @@
name: Merge
on:
push:
branches:
- master
pull_request:
types: [closed]
branches:
- master
jobs:
# TODO: Renable once browser forge is fixed for camoufox, or else tests will never pass
# tests:
# uses: ./.github/workflows/tests.yml
# secrets:
# openai_key: ${{ secrets.OPENAI_KEY }}
# discord_webhook_url: ${{ secrets.DISCORD_WEBHOOK_URL }}
version:
uses: ./.github/workflows/version.yml
secrets:
git_token: ${{ secrets.GPAT_TOKEN }}
build-and-deploy:
if: needs.version.outputs.version_bump == 'true'
needs: version
uses: ./.github/workflows/docker-image.yml
secrets:
dockerhub_username: ${{ secrets.DOCKERHUB_USERNAME }}
dockerhub_token: ${{ secrets.DOCKERHUB_TOKEN }}
repo_token: ${{ secrets.GPAT_TOKEN }}
discord_webhook_url: ${{ secrets.DISCORD_WEBHOOK_URL }}
with:
version: ${{ needs.version.outputs.version }}

15
.github/workflows/pr.yml vendored Normal file
View File

@@ -0,0 +1,15 @@
name: PR
on:
pull_request:
branches:
- master
types: [opened, synchronize, reopened]
workflow_dispatch:
jobs:
tests:
uses: ./.github/workflows/tests.yml
secrets:
openai_key: ${{ secrets.OPENAI_KEY }}
discord_webhook_url: ${{ secrets.DISCORD_WEBHOOK_URL }}

29
.github/workflows/pytest.yml vendored Normal file
View File

@@ -0,0 +1,29 @@
name: Pytest
on:
workflow_call:
jobs:
unit-tests:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- uses: actions/setup-node@v3
- name: Set env
run: echo "ENV=test" >> $GITHUB_ENV
- name: Install pdm
run: pip install pdm
- name: Install project dependencies
run: pdm install
- name: Install playwright
run: pdm run playwright install --with-deps
- name: Run tests
run: PYTHONPATH=. pdm run pytest -v -ra api/backend/tests

42
.github/workflows/tests.yml vendored Normal file
View File

@@ -0,0 +1,42 @@
name: Reusable PR Tests
on:
workflow_call:
secrets:
openai_key:
required: true
discord_webhook_url:
required: true
jobs:
pytest:
uses: ./.github/workflows/pytest.yml
cypress-tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Run Cypress Tests
uses: ./.github/actions/run-cypress-tests
with:
openai_key: ${{ secrets.openai_key }}
success-message:
runs-on: ubuntu-latest
needs:
- pytest
- cypress-tests
steps:
- name: Send Discord Message
uses: jaypyles/discord-webhook-action@v1.0.0
with:
webhook-url: ${{ secrets.discord_webhook_url }}
content: "Scraperr Successfully Passed Tests"
username: "Scraperr CI"
embed-title: "✅ Deployment Status"
embed-description: "Scraperr successfully passed all tests."
embed-color: 3066993
embed-footer-text: "Scraperr CI"
embed-timestamp: ${{ github.event.head_commit.timestamp }}

View File

@@ -1,57 +0,0 @@
name: Unit Tests
on:
push:
branches:
- master
pull_request:
types: [opened, synchronize, reopened]
workflow_dispatch:
jobs:
unit-tests:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set env
run: echo "ENV=test" >> $GITHUB_ENV
- name: Install pdm
run: pip install pdm
- name: Install project dependencies
run: pdm install
- name: Install playwright
run: pdm run playwright install
- name: Run tests
run: PYTHONPATH=. pdm run pytest api/backend/tests
cypress-tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/run-cypress-tests
success-message:
runs-on: ubuntu-latest
needs:
- unit-tests
- cypress-tests
steps:
- name: Send Discord Message
uses: jaypyles/discord-webhook-action@v1.0.0
with:
webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }}
content: "Scraperr Successfully Passed Tests"
username: "Scraperr CI"
embed-title: "✅ Deployment Status"
embed-description: "Scraperr successfully passed all tests."
embed-color: 3066993 # Green
embed-footer-text: "Scraperr CI"
embed-timestamp: ${{ github.event.head_commit.timestamp }}

89
.github/workflows/version.yml vendored Normal file
View File

@@ -0,0 +1,89 @@
name: Version
on:
workflow_call:
secrets:
git_token:
required: true
outputs:
version:
description: "The new version number"
value: ${{ jobs.version.outputs.version }}
version_bump:
description: "Whether the version was bumped"
value: ${{ jobs.version.outputs.version_bump }}
jobs:
version:
runs-on: ubuntu-latest
outputs:
version: ${{ steps.set_version.outputs.version }}
version_bump: ${{ steps.check_version_bump.outputs.version_bump }}
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Get version bump
id: get_version_type
run: |
COMMIT_MSG=$(git log -1 --pretty=%B)
if [[ $COMMIT_MSG =~ ^feat\(breaking\) ]]; then
VERSION_TYPE="major"
elif [[ $COMMIT_MSG =~ ^feat! ]]; then
VERSION_TYPE="minor"
elif [[ $COMMIT_MSG =~ ^(feat|fix|chore): ]]; then
VERSION_TYPE="patch"
else
VERSION_TYPE="patch"
fi
echo "VERSION_TYPE=$VERSION_TYPE" >> $GITHUB_ENV
- name: Check for version bump
id: check_version_bump
run: |
COMMIT_MSG=$(git log -1 --pretty=%B)
if [[ $COMMIT_MSG =~ .*\[no\ bump\].* ]]; then
echo "version_bump=false" >> $GITHUB_OUTPUT
else
echo "version_bump=true" >> $GITHUB_OUTPUT
fi
- name: Skip version bump
if: steps.check_version_bump.outputs.version_bump == 'false'
run: |
echo "Skipping version bump as requested"
gh run cancel ${{ github.run_id }}
exit 0
env:
GITHUB_TOKEN: ${{ secrets.git_token }}
- name: Set version
if: steps.check_version_bump.outputs.version_bump != 'false'
id: set_version
run: |
VERSION=$(./scripts/version.sh "$VERSION_TYPE")
echo "VERSION=$VERSION" >> $GITHUB_ENV
echo "Version is $VERSION"
echo "version=$VERSION" >> $GITHUB_OUTPUT
env:
VERSION_TYPE: ${{ env.VERSION_TYPE }}
- name: Update chart file
if: steps.check_version_bump.outputs.version_bump != 'false'
run: |
sed -i "s/^version: .*/version: $VERSION/" helm/Chart.yaml
git config --local user.email "github-actions[bot]@users.noreply.github.com"
git config --local user.name "github-actions[bot]"
git add helm/Chart.yaml
git commit -m "chore: bump version to $VERSION"
git push
env:
VERSION: ${{ env.VERSION }}
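The "Set version" step above shells out to ./scripts/version.sh, which is not included in this diff. A minimal sketch of what such a helper might do, assuming it reads the current version: field from helm/Chart.yaml and bumps the requested semver component (hypothetical, not the repository's actual script):
#!/usr/bin/env bash
# Hypothetical sketch of scripts/version.sh (the real script is not shown in this diff).
# Reads the current chart version and prints the bumped value for the requested component.
set -euo pipefail
BUMP_TYPE="${1:-patch}"   # major | minor | patch
CURRENT=$(grep '^version:' helm/Chart.yaml | awk '{print $2}')
IFS='.' read -r MAJOR MINOR PATCH <<< "$CURRENT"
case "$BUMP_TYPE" in
  major) MAJOR=$((MAJOR + 1)); MINOR=0; PATCH=0 ;;
  minor) MINOR=$((MINOR + 1)); PATCH=0 ;;
  *)     PATCH=$((PATCH + 1)) ;;
esac
echo "${MAJOR}.${MINOR}.${PATCH}"
The workflow itself commits the updated helm/Chart.yaml in the "Update chart file" step, so the script only needs to print the new version.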

16
.gitignore vendored
View File

@@ -188,4 +188,18 @@ postgres_data
.vscode
ollama
data
media
media/images
media/videos
media/audio
media/pdfs
media/spreadsheets
media/presentations
media/documents
media/recordings
media/download_summary.txt
cypress/screenshots
cypress/videos
docker-compose.dev.local.yml

View File

@@ -1,6 +1,6 @@
.DEFAULT_GOAL := help
COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.yml
COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.local.yml
COMPOSE_PROD = docker compose -f docker-compose.yml
.PHONY: help deps build pull up up-dev down setup deploy
@@ -17,6 +17,7 @@ help:
@echo " make down - Stop and remove containers, networks, images, and volumes"
@echo " make setup - Setup server with dependencies and clone repo"
@echo " make deploy - Deploy site onto server"
@echo " make cypress-start - Start Cypress"
@echo ""
logs:
@@ -51,3 +52,15 @@ setup:
deploy:
ansible-playbook -i ./ansible/inventory.yaml ./ansible/deploy_site.yaml -v
build-ci:
docker compose -f docker-compose.yml -f docker-compose.dev.yml build
up-ci:
docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d --force-recreate
cypress-start:
DISPLAY=:0 npx cypress open
cypress-run:
npx cypress run

View File

@@ -13,7 +13,7 @@
## 📋 Overview
Scraperr enables you to extract data from websites with precision using XPath selectors. This self-hosted application provides a clean interface to manage scraping jobs, view results, and export data.
Scrape websites without writing a single line of code.
> 📚 **[Check out the docs](https://scraperr-docs.pages.dev)** for a comprehensive quickstart guide and detailed information.
@@ -29,7 +29,7 @@ Scraperr enables you to extract data from websites with precision using XPath se
- **Custom Headers**: Add JSON headers to your scraping requests
- **Media Downloads**: Automatically download images, videos, and other media
- **Results Visualization**: View scraped data in a structured table format
- **Data Export**: Export your results in various formats
- **Data Export**: Export your results in markdown and csv formats
- **Notifcation Channels**: Send completion notifcations, through various channels
## 🚀 Getting Started

147
alembic.ini Normal file
View File

@@ -0,0 +1,147 @@
# A generic, single database configuration.
[alembic]
# path to migration scripts.
# this is typically a path given in POSIX (e.g. forward slashes)
# format, relative to the token %(here)s which refers to the location of this
# ini file
script_location = %(here)s/alembic
# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# Uncomment the line below if you want the files to be prepended with date and time
# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
# for all available tokens
# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
# sys.path path, will be prepended to sys.path if present.
# defaults to the current working directory. for multiple paths, the path separator
# is defined by "path_separator" below.
prepend_sys_path = .
# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires the python>=3.9 or backports.zoneinfo library and tzdata library.
# Any required deps can installed by adding `alembic[tz]` to the pip requirements
# string value is passed to ZoneInfo()
# leave blank for localtime
# timezone =
# max length of characters to apply to the "slug" field
# truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; This defaults
# to <script_location>/versions. When using multiple version
# directories, initial revisions must be specified with --version-path.
# The path separator used here should be the separator specified by "path_separator"
# below.
# version_locations = %(here)s/bar:%(here)s/bat:%(here)s/alembic/versions
# path_separator; This indicates what character is used to split lists of file
# paths, including version_locations and prepend_sys_path within configparser
# files such as alembic.ini.
# The default rendered in new alembic.ini files is "os", which uses os.pathsep
# to provide os-dependent path splitting.
#
# Note that in order to support legacy alembic.ini files, this default does NOT
# take place if path_separator is not present in alembic.ini. If this
# option is omitted entirely, fallback logic is as follows:
#
# 1. Parsing of the version_locations option falls back to using the legacy
# "version_path_separator" key, which if absent then falls back to the legacy
# behavior of splitting on spaces and/or commas.
# 2. Parsing of the prepend_sys_path option falls back to the legacy
# behavior of splitting on spaces, commas, or colons.
#
# Valid values for path_separator are:
#
# path_separator = :
# path_separator = ;
# path_separator = space
# path_separator = newline
#
# Use os.pathsep. Default configuration used for new projects.
path_separator = os
# set to 'true' to search source files recursively
# in each "version_locations" directory
# new in Alembic version 1.10
# recursive_version_locations = false
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
# database URL. This is consumed by the user-maintained env.py script only.
# other means of configuring database URLs may be customized within the env.py
# file.
sqlalchemy.url = driver://user:pass@localhost/dbname
[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples
# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks = black
# black.type = console_scripts
# black.entrypoint = black
# black.options = -l 79 REVISION_SCRIPT_FILENAME
# lint with attempts to fix using "ruff" - use the module runner, against the "ruff" module
# hooks = ruff
# ruff.type = module
# ruff.module = ruff
# ruff.options = check --fix REVISION_SCRIPT_FILENAME
# Alternatively, use the exec runner to execute a binary found on your PATH
# hooks = ruff
# ruff.type = exec
# ruff.executable = ruff
# ruff.options = check --fix REVISION_SCRIPT_FILENAME
# Logging configuration. This is also consumed by the user-maintained
# env.py script only.
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARNING
handlers = console
qualname =
[logger_sqlalchemy]
level = WARNING
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S

1
alembic/README Normal file
View File

@@ -0,0 +1 @@
Generic single-database configuration.

103
alembic/env.py Normal file
View File

@@ -0,0 +1,103 @@
# STL
import os
import sys
from logging.config import fileConfig
# PDM
from dotenv import load_dotenv
from sqlalchemy import pool, engine_from_config
# LOCAL
from alembic import context
from api.backend.database.base import Base
from api.backend.database.models import Job, User, CronJob # type: ignore
load_dotenv()
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "api")))
# Load the raw async database URL
raw_database_url = os.getenv("DATABASE_URL", "sqlite+aiosqlite:///data/database.db")
# Map async dialects to sync ones
driver_downgrade_map = {
"sqlite+aiosqlite": "sqlite",
"postgresql+asyncpg": "postgresql",
"mysql+aiomysql": "mysql",
}
# Extract scheme and convert if async
for async_driver, sync_driver in driver_downgrade_map.items():
if raw_database_url.startswith(async_driver + "://"):
sync_database_url = raw_database_url.replace(async_driver, sync_driver, 1)
break
else:
# No async driver detected — assume it's already sync
sync_database_url = raw_database_url
# Apply it to Alembic config
config = context.config
config.set_main_option("sqlalchemy.url", sync_database_url)
# Interpret the config file for Python logging.
# This line sets up loggers basically.
if config.config_file_name is not None:
fileConfig(config.config_file_name)
# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
target_metadata = Base.metadata
def run_migrations_offline() -> None:
"""Run migrations in 'offline' mode.
This configures the context with just a URL
and not an Engine, though an Engine is acceptable
here as well. By skipping the Engine creation
we don't even need a DBAPI to be available.
Calls to context.execute() here emit the given string to the
script output.
"""
url = config.get_main_option("sqlalchemy.url")
context.configure(
url=url,
target_metadata=target_metadata,
literal_binds=True,
dialect_opts={"paramstyle": "named"},
)
with context.begin_transaction():
context.run_migrations()
def run_migrations_online() -> None:
"""Run migrations in 'online' mode.
In this scenario we need to create an Engine
and associate a connection with the context.
"""
connectable = engine_from_config(
config.get_section(config.config_ini_section, {}),
prefix="sqlalchemy.",
poolclass=pool.NullPool,
)
with connectable.connect() as connection:
context.configure(connection=connection, target_metadata=target_metadata)
with context.begin_transaction():
context.run_migrations()
if context.is_offline_mode():
run_migrations_offline()
else:
run_migrations_online()
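For reference, with this env.py in place, migrations would typically be generated and applied with the standard Alembic CLI; a short sketch of the usual commands (the project's exact invocation is not shown in this diff; the pdm run prefix is assumed from the pytest workflow above):
# Autogenerate a revision by diffing the SQLAlchemy models against the database
pdm run alembic revision --autogenerate -m "describe the change"
# Apply all pending migrations
pdm run alembic upgrade head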

28
alembic/script.py.mako Normal file
View File

@@ -0,0 +1,28 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision: str = ${repr(up_revision)}
down_revision: Union[str, Sequence[str], None] = ${repr(down_revision)}
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
def upgrade() -> None:
"""Upgrade schema."""
${upgrades if upgrades else "pass"}
def downgrade() -> None:
"""Downgrade schema."""
${downgrades if downgrades else "pass"}

View File

@@ -0,0 +1,67 @@
"""initial revision
Revision ID: 6aa921d2e637
Revises:
Create Date: 2025-07-12 20:17:44.448034
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision: str = '6aa921d2e637'
down_revision: Union[str, Sequence[str], None] = None
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
"""Upgrade schema."""
# ### commands auto generated by Alembic - please adjust! ###
op.create_table('users',
sa.Column('email', sa.String(length=255), nullable=False),
sa.Column('hashed_password', sa.String(length=255), nullable=False),
sa.Column('full_name', sa.String(length=255), nullable=True),
sa.Column('disabled', sa.Boolean(), nullable=True),
sa.PrimaryKeyConstraint('email')
)
op.create_table('jobs',
sa.Column('id', sa.String(length=64), nullable=False),
sa.Column('url', sa.String(length=2048), nullable=False),
sa.Column('elements', sa.JSON(), nullable=False),
sa.Column('user', sa.String(length=255), nullable=True),
sa.Column('time_created', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
sa.Column('result', sa.JSON(), nullable=False),
sa.Column('status', sa.String(length=50), nullable=False),
sa.Column('chat', sa.JSON(), nullable=True),
sa.Column('job_options', sa.JSON(), nullable=True),
sa.Column('agent_mode', sa.Boolean(), nullable=False),
sa.Column('prompt', sa.String(length=1024), nullable=True),
sa.Column('favorite', sa.Boolean(), nullable=False),
sa.ForeignKeyConstraint(['user'], ['users.email'], ),
sa.PrimaryKeyConstraint('id')
)
op.create_table('cron_jobs',
sa.Column('id', sa.String(length=64), nullable=False),
sa.Column('user_email', sa.String(length=255), nullable=False),
sa.Column('job_id', sa.String(length=64), nullable=False),
sa.Column('cron_expression', sa.String(length=255), nullable=False),
sa.Column('time_created', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
sa.Column('time_updated', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
sa.ForeignKeyConstraint(['job_id'], ['jobs.id'], ),
sa.ForeignKeyConstraint(['user_email'], ['users.email'], ),
sa.PrimaryKeyConstraint('id')
)
# ### end Alembic commands ###
def downgrade() -> None:
"""Downgrade schema."""
# ### commands auto generated by Alembic - please adjust! ###
op.drop_table('cron_jobs')
op.drop_table('jobs')
op.drop_table('users')
# ### end Alembic commands ###

View File

@@ -0,0 +1,6 @@
from typing_extensions import TypedDict
class Action(TypedDict):
type: str
url: str

View File

@@ -0,0 +1,96 @@
# STL
import random
from typing import Any
# PDM
from camoufox import AsyncCamoufox
from playwright.async_api import Page
# LOCAL
from api.backend.constants import RECORDINGS_ENABLED
from api.backend.ai.clients import ask_ollama, ask_open_ai, open_ai_key
from api.backend.job.models import CapturedElement
from api.backend.worker.logger import LOG
from api.backend.ai.agent.utils import (
parse_response,
capture_elements,
convert_to_markdown,
)
from api.backend.ai.agent.prompts import (
EXTRACT_ELEMENTS_PROMPT,
ELEMENT_EXTRACTION_PROMPT,
)
from api.backend.job.scraping.add_custom import add_custom_items
from api.backend.job.scraping.collect_media import collect_media
ask_ai = ask_open_ai if open_ai_key else ask_ollama
async def scrape_with_agent(agent_job: dict[str, Any]):
LOG.info(f"Starting work for agent job: {agent_job}")
pages = set()
proxy = None
if agent_job["job_options"]["proxies"]:
proxy = random.choice(agent_job["job_options"]["proxies"])
LOG.info(f"Using proxy: {proxy}")
async with AsyncCamoufox(headless=not RECORDINGS_ENABLED, proxy=proxy) as browser:
page: Page = await browser.new_page()
await add_custom_items(
agent_job["url"],
page,
agent_job["job_options"]["custom_cookies"],
agent_job["job_options"]["custom_headers"],
)
try:
await page.set_viewport_size({"width": 1920, "height": 1080})
await page.goto(agent_job["url"], timeout=60000)
if agent_job["job_options"]["collect_media"]:
await collect_media(agent_job["id"], page)
html_content = await page.content()
markdown_content = convert_to_markdown(html_content)
response = await ask_ai(
ELEMENT_EXTRACTION_PROMPT.format(
extraction_prompt=EXTRACT_ELEMENTS_PROMPT,
webpage=markdown_content,
prompt=agent_job["prompt"],
)
)
xpaths = parse_response(response)
captured_elements = await capture_elements(
page, xpaths, agent_job["job_options"].get("return_html", False)
)
final_url = page.url
pages.add((html_content, final_url))
finally:
await page.close()
await browser.close()
name_to_elements = {}
for page in pages:
for element in captured_elements:
if element.name not in name_to_elements:
name_to_elements[element.name] = []
name_to_elements[element.name].append(element)
scraped_elements: list[dict[str, dict[str, list[CapturedElement]]]] = [
{
page[1]: name_to_elements,
}
for page in pages
]
return scraped_elements

View File

@@ -0,0 +1,58 @@
EXTRACT_ELEMENTS_PROMPT = """
You are an assistant that extracts XPath expressions from webpages.
You will receive HTML content in markdown format.
Each element in the markdown has their xpath shown above them in a path like:
<!-- //div -->
Respond only with a list of general XPath expressions inside `<xpaths>...</xpaths>` tags.
You will also decide the decision of what to do next. If there is no decision available, return nothing for that section.
"""
ELEMENT_EXTRACTION_PROMPT = """
{extraction_prompt}
**Guidelines:**
- Prefer shorter, more general XPaths like `//div[...]` or `//span[...]`.
- Avoid overly specific or deep paths like `//div[3]/ul/li[2]/a`.
- Do **not** chain multiple elements deeply (e.g., `//div/span/a`).
- Use XPaths further down the tree when possible.
- Do not include any extra explanation or text.
- One XPath is acceptable if that's all that's needed.
- Try and limit it down to 1 - 3 xpaths.
- Include a name for each xpath.
<important>
- USE THE MOST SIMPLE XPATHS POSSIBLE.
- USE THE MOST GENERAL XPATHS POSSIBLE.
- USE THE MOST SPECIFIC XPATHS POSSIBLE.
- USE THE MOST GENERAL XPATHS POSSIBLE.
</important>
**Example Format:**
```xml
<xpaths>
- <name: insert_name_here>: <xpath: //div>
- <name: insert_name_here>: <xpath: //span>
- <name: insert_name_here>: <xpath: //span[contains(@text, 'example')]>
- <name: insert_name_here>: <xpath: //div[contains(@text, 'example')]>
- <name: insert_name_here>: <xpath: //a[@href]>
- etc
</xpaths>
<decision>
<next_page>
- //a[@href='next_page_url']
</next_page>
</decision>
```
**Input webpage:**
{webpage}
**Target content:**
{prompt}
"""

View File

@@ -0,0 +1,272 @@
# STL
import re
# PDM
from lxml import html, etree
from playwright.async_api import Page
# LOCAL
from api.backend.job.models import CapturedElement
from api.backend.job.utils.text_utils import clean_text
def convert_to_markdown(html_str: str):
parser = html.HTMLParser()
tree = html.fromstring(html_str, parser=parser)
root = tree.getroottree()
def format_attributes(el: etree._Element) -> str:
"""Convert element attributes into a string."""
return " ".join(f'{k}="{v}"' for k, v in el.attrib.items())
def is_visible(el: etree._Element) -> bool:
style = el.attrib.get("style", "").lower()
class_ = el.attrib.get("class", "").lower()
# Check for visibility styles
if "display: none" in style or "visibility: hidden" in style:
return False
if "opacity: 0" in style or "opacity:0" in style:
return False
if "height: 0" in style or "width: 0" in style:
return False
# Check for common hidden classes
if any(
hidden in class_
for hidden in ["hidden", "invisible", "truncate", "collapse"]
):
return False
# Check for hidden attributes
if el.attrib.get("hidden") is not None:
return False
if el.attrib.get("aria-hidden") == "true":
return False
# Check for empty or whitespace-only content
if not el.text and len(el) == 0:
return False
return True
def is_layout_or_decorative(el: etree._Element) -> bool:
tag = el.tag.lower()
# Layout elements
if tag in {"nav", "footer", "header", "aside", "main", "section"}:
return True
# Decorative elements
if tag in {"svg", "path", "circle", "rect", "line", "polygon", "polyline"}:
return True
# Check id and class for layout/decorative keywords
id_class = " ".join(
[el.attrib.get("id", ""), el.attrib.get("class", "")]
).lower()
layout_keywords = {
"sidebar",
"nav",
"header",
"footer",
"menu",
"advert",
"ads",
"breadcrumb",
"container",
"wrapper",
"layout",
"grid",
"flex",
"row",
"column",
"section",
"banner",
"hero",
"card",
"modal",
"popup",
"tooltip",
"dropdown",
"overlay",
}
return any(keyword in id_class for keyword in layout_keywords)
# Tags to ignore in the final markdown output
included_tags = {
"div",
"span",
"a",
"p",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"img",
"button",
"input",
"textarea",
"ul",
"ol",
"li",
"table",
"tr",
"td",
"th",
"input",
"textarea",
"select",
"option",
"optgroup",
"fieldset",
"legend",
}
special_elements = []
normal_elements = []
for el in tree.iter():
if el.tag is etree.Comment:
continue
tag = el.tag.lower()
if tag not in included_tags:
continue
if not is_visible(el):
continue
if is_layout_or_decorative(el):
continue
path = root.getpath(el)
attrs = format_attributes(el)
attrs_str = f" {attrs}" if attrs else ""
text = el.text.strip() if el.text else ""
if not text and not attrs:
continue
# input elements
if tag == "button":
prefix = "🔘 **<button>**"
special_elements.append(f"<!-- {path} -->\n{prefix} {text}")
elif tag == "a":
href = el.attrib.get("href", "")
prefix = f"🔗 **<a href='{href}'>**"
special_elements.append(f"<!-- {path} -->\n{prefix} {text}")
elif tag == "input":
input_type = el.attrib.get("type", "text")
prefix = f"📝 **<input type='{input_type}'>**"
special_elements.append(f"<!-- {path} -->\n{prefix}")
else:
prefix = f"**<{tag}{attrs_str}>**"
if text:
normal_elements.append(f"<!-- {path} -->\n{prefix} {text}")
return "\n\n".join(normal_elements + special_elements) # type: ignore
def parse_response(text: str) -> list[dict[str, str]]:
xpaths = re.findall(r"<xpaths>(.*?)</xpaths>", text, re.DOTALL)
results = []
if xpaths:
lines = xpaths[0].strip().splitlines()
for line in lines:
if line.strip().startswith("-"):
name = re.findall(r"<name: (.*?)>", line)[0]
xpath = re.findall(r"<xpath: (.*?)>", line)[0]
results.append({"name": name, "xpath": xpath})
else:
results.append({"name": "", "xpath": line.strip()})
return results
def parse_next_page(text: str) -> str | None:
next_page = re.findall(r"<next_page>(.*?)</next_page>", text, re.DOTALL)
if next_page:
lines = next_page[0].strip().splitlines()
next_page = [
line.strip().lstrip("-").strip()
for line in lines
if line.strip().startswith("-")
]
return next_page[0] if next_page else None
async def capture_elements(
page: Page, xpaths: list[dict[str, str]], return_html: bool
) -> list[CapturedElement]:
captured_elements = []
seen_texts = set()
for xpath in xpaths:
try:
locator = page.locator(f"xpath={xpath['xpath']}")
count = await locator.count()
for i in range(count):
if return_html:
element_text = (
await page.locator(f"xpath={xpath['xpath']}")
.nth(i)
.inner_html()
)
seen_texts.add(element_text)
captured_elements.append(
CapturedElement(
name=xpath["name"],
text=element_text,
xpath=xpath["xpath"],
)
)
continue
element_text = ""
element_handle = await locator.nth(i).element_handle()
if not element_handle:
continue
link = await element_handle.get_attribute("href") or ""
text = await element_handle.text_content()
if text:
element_text += text
if link:
element_text += f" ({link})"
cleaned = clean_text(element_text)
if cleaned in seen_texts:
continue
seen_texts.add(cleaned)
captured_elements.append(
CapturedElement(
name=xpath["name"],
text=cleaned,
xpath=xpath["xpath"],
)
)
except Exception as e:
print(f"Error processing xpath {xpath}: {e}")
return captured_elements

View File

@@ -1,32 +1,28 @@
# STL
import os
import logging
from collections.abc import Iterable, AsyncGenerator
# PDM
from openai import OpenAI
from ollama import Message
from fastapi import APIRouter
from fastapi.responses import JSONResponse, StreamingResponse
from openai.types.chat import ChatCompletionMessageParam
# LOCAL
from ollama import Message, AsyncClient
from api.backend.models import AI
from api.backend.ai.clients import (
llama_model,
open_ai_key,
llama_client,
open_ai_model,
openai_client,
)
from api.backend.ai.schemas import AI
from api.backend.routers.handle_exceptions import handle_exceptions
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("AI")
ai_router = APIRouter()
# Load environment variables
open_ai_key = os.getenv("OPENAI_KEY")
open_ai_model = os.getenv("OPENAI_MODEL")
llama_url = os.getenv("OLLAMA_URL")
llama_model = os.getenv("OLLAMA_MODEL")
# Initialize clients
openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None
llama_client = AsyncClient(host=llama_url) if llama_url else None
async def llama_chat(chat_messages: list[Message]) -> AsyncGenerator[str, None]:
if llama_client and llama_model:
@@ -67,6 +63,7 @@ chat_function = llama_chat if llama_client else openai_chat
@ai_router.post("/ai")
@handle_exceptions(logger=LOG)
async def ai(c: AI):
return StreamingResponse(
chat_function(chat_messages=c.messages), media_type="text/plain"
@@ -74,5 +71,6 @@ async def ai(c: AI):
@ai_router.get("/ai/check")
@handle_exceptions(logger=LOG)
async def check():
return JSONResponse(content={"ai_enabled": bool(open_ai_key or llama_model)})

39
api/backend/ai/clients.py Normal file
View File

@@ -0,0 +1,39 @@
# STL
import os
# PDM
from ollama import AsyncClient
from openai import OpenAI
# Load environment variables
open_ai_key = os.getenv("OPENAI_KEY")
open_ai_model = os.getenv("OPENAI_MODEL")
llama_url = os.getenv("OLLAMA_URL")
llama_model = os.getenv("OLLAMA_MODEL")
# Initialize clients
openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None
llama_client = AsyncClient(host=llama_url) if llama_url else None
async def ask_open_ai(prompt: str) -> str:
if not openai_client:
raise ValueError("OpenAI client not initialized")
response = openai_client.chat.completions.create(
model=open_ai_model or "gpt-4.1-mini",
messages=[{"role": "user", "content": prompt}],
)
return response.choices[0].message.content or ""
async def ask_ollama(prompt: str) -> str:
if not llama_client:
raise ValueError("Ollama client not initialized")
response = await llama_client.chat(
model=llama_model or "", messages=[{"role": "user", "content": prompt}]
)
return response.message.content or ""

View File

@@ -0,0 +1,4 @@
# LOCAL
from .ai import AI
__all__ = ["AI"]

View File

@@ -0,0 +1,9 @@
# STL
from typing import Any
# PDM
import pydantic
class AI(pydantic.BaseModel):
messages: list[Any]

View File

@@ -1,39 +1,57 @@
# STL
import os
import logging
import apscheduler # type: ignore
from contextlib import asynccontextmanager
# PDM
import apscheduler.schedulers
import apscheduler.schedulers.background
from fastapi import FastAPI, Request, status
from fastapi.responses import JSONResponse
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
# LOCAL
from api.backend.ai.ai_router import ai_router
from api.backend.auth.auth_router import auth_router
from api.backend.utils import get_log_level
from api.backend.routers.job_router import job_router
from api.backend.routers.stats_router import stats_router
from api.backend.database.startup import init_database
from fastapi.responses import JSONResponse
from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler
from api.backend.scheduler import scheduler
from api.backend.ai.ai_router import ai_router
from api.backend.job.job_router import job_router
from api.backend.auth.auth_router import auth_router
from api.backend.stats.stats_router import stats_router
from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler
log_level = os.getenv("LOG_LEVEL")
LOG_LEVEL = get_log_level(log_level)
logging.basicConfig(
level=LOG_LEVEL,
format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
format="%(levelname)s: %(asctime)s - [%(name)s] - %(message)s",
handlers=[logging.StreamHandler()],
)
LOG = logging.getLogger(__name__)
app = FastAPI(title="api", root_path="/api")
@asynccontextmanager
async def lifespan(_: FastAPI):
# Startup
LOG.info("Starting application...")
LOG.info("Starting cron scheduler...")
await start_cron_scheduler(scheduler)
scheduler.start()
LOG.info("Cron scheduler started successfully")
yield
# Shutdown
LOG.info("Shutting down application...")
LOG.info("Stopping cron scheduler...")
scheduler.shutdown(wait=False) # Set wait=False to not block shutdown
LOG.info("Cron scheduler stopped")
LOG.info("Application shutdown complete")
app = FastAPI(title="api", root_path="/api", lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
@@ -43,28 +61,12 @@ app.add_middleware(
allow_headers=["*"],
)
app.include_router(auth_router)
app.include_router(ai_router)
app.include_router(job_router)
app.include_router(stats_router)
@app.on_event("startup")
async def startup_event():
start_cron_scheduler(scheduler)
scheduler.start()
if os.getenv("ENV") != "test":
init_database()
LOG.info("Starting up...")
@app.on_event("shutdown")
def shutdown_scheduler():
scheduler.shutdown(wait=False) # Set wait=False to not block shutdown
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
exc_str = f"{exc}".replace("\n", " ").replace(" ", " ")

View File

@@ -1,13 +1,16 @@
# STL
from datetime import timedelta
import os
import logging
from datetime import timedelta
# PDM
from fastapi import Depends, APIRouter, HTTPException, status
from fastapi.security import OAuth2PasswordRequestForm
from sqlalchemy.ext.asyncio import AsyncSession
# LOCAL
from api.backend.schemas import User, Token, UserCreate
from api.backend.auth.schemas import User, Token, UserCreate
from api.backend.database.base import AsyncSessionLocal, get_db
from api.backend.auth.auth_utils import (
ACCESS_TOKEN_EXPIRE_MINUTES,
get_current_user,
@@ -15,18 +18,19 @@ from api.backend.auth.auth_utils import (
get_password_hash,
create_access_token,
)
import logging
from api.backend.database.common import update
from api.backend.database.models import User as DatabaseUser
from api.backend.routers.handle_exceptions import handle_exceptions
auth_router = APIRouter()
LOG = logging.getLogger("auth_router")
LOG = logging.getLogger("Auth")
@auth_router.post("/auth/token", response_model=Token)
async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
user = await authenticate_user(form_data.username, form_data.password)
@handle_exceptions(logger=LOG)
async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(), db: AsyncSession = Depends(get_db)):
user = await authenticate_user(form_data.username, form_data.password, db)
if not user:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
@@ -47,23 +51,37 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(
@auth_router.post("/auth/signup", response_model=User)
@handle_exceptions(logger=LOG)
async def create_user(user: UserCreate):
hashed_password = get_password_hash(user.password)
user_dict = user.model_dump()
user_dict["hashed_password"] = hashed_password
del user_dict["password"]
query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)"
_ = update(query, (user_dict["email"], hashed_password, user_dict["full_name"]))
async with AsyncSessionLocal() as session:
new_user = DatabaseUser(
email=user.email,
hashed_password=user_dict["hashed_password"],
full_name=user.full_name,
)
session.add(new_user)
await session.commit()
return user_dict
@auth_router.get("/auth/users/me", response_model=User)
@handle_exceptions(logger=LOG)
async def read_users_me(current_user: User = Depends(get_current_user)):
return current_user
@auth_router.get("/auth/check")
@handle_exceptions(logger=LOG)
async def check_auth():
return {"registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True"}
return {
"registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True",
"recordings_enabled": os.environ.get("RECORDINGS_ENABLED", "true").lower()
== "true",
}

View File

@@ -1,22 +1,24 @@
# STL
import os
import logging
from typing import Any, Optional
from datetime import datetime, timedelta
import logging
# PDM
from jose import JWTError, jwt
from dotenv import load_dotenv
from fastapi import Depends, HTTPException, status
from sqlalchemy import select
from passlib.context import CryptContext
from fastapi.security import OAuth2PasswordBearer
from sqlalchemy.ext.asyncio import AsyncSession
# LOCAL
from api.backend.schemas import User, UserInDB, TokenData
from api.backend.auth.schemas import User, UserInDB, TokenData
from api.backend.database.base import get_db
from api.backend.database.models import User as UserModel
from api.backend.database.common import query
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Auth")
_ = load_dotenv()
@@ -38,18 +40,24 @@ def get_password_hash(password: str):
return pwd_context.hash(password)
async def get_user(email: str):
user_query = "SELECT * FROM users WHERE email = ?"
user = query(user_query, (email,))[0]
async def get_user(session: AsyncSession, email: str) -> UserInDB | None:
stmt = select(UserModel).where(UserModel.email == email)
result = await session.execute(stmt)
user = result.scalars().first()
if not user:
return
return None
return UserInDB(**user)
return UserInDB(
email=str(user.email),
hashed_password=str(user.hashed_password),
full_name=str(user.full_name),
disabled=bool(user.disabled),
)
async def authenticate_user(email: str, password: str):
user = await get_user(email)
async def authenticate_user(email: str, password: str, db: AsyncSession):
user = await get_user(db, email)
if not user:
return False
@@ -75,7 +83,9 @@ def create_access_token(
return encoded_jwt
async def get_current_user(token: str = Depends(oauth2_scheme)):
async def get_current_user(
db: AsyncSession = Depends(get_db), token: str = Depends(oauth2_scheme)
):
LOG.debug(f"Getting current user with token: {token}")
if not token:
@@ -83,7 +93,7 @@ async def get_current_user(token: str = Depends(oauth2_scheme)):
return EMPTY_USER
if len(token.split(".")) != 3:
LOG.error(f"Malformed token: {token}")
LOG.debug(f"Malformed token: {token}")
return EMPTY_USER
try:
@@ -118,14 +128,15 @@ async def get_current_user(token: str = Depends(oauth2_scheme)):
LOG.error(f"Exception occurred: {e}")
return EMPTY_USER
user = await get_user(email=token_data.email)
user = await get_user(db, email=token_data.email or "")
if user is None:
return EMPTY_USER
return user
async def require_user(token: str = Depends(oauth2_scheme)):
async def require_user(db: AsyncSession, token: str = Depends(oauth2_scheme)):
credentials_exception = HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Could not validate credentials",
@@ -136,6 +147,7 @@ async def require_user(token: str = Depends(oauth2_scheme)):
payload: Optional[dict[str, Any]] = jwt.decode(
token, SECRET_KEY, algorithms=[ALGORITHM]
)
if not payload:
raise credentials_exception
@@ -149,7 +161,7 @@ async def require_user(token: str = Depends(oauth2_scheme)):
except JWTError:
raise credentials_exception
user = await get_user(email=token_data.email)
user = await get_user(db, email=token_data.email or "")
if user is None:
raise credentials_exception

View File

@@ -0,0 +1,4 @@
# LOCAL
from .auth import User, Token, UserInDB, TokenData, UserCreate
__all__ = ["User", "Token", "UserInDB", "TokenData", "UserCreate"]

View File

@@ -1 +1,24 @@
DATABASE_PATH = "data/database.db"
# STL
import os
from pathlib import Path
DATABASE_URL = os.getenv("DATABASE_URL", "sqlite+aiosqlite:///data/database.db")
RECORDINGS_DIR = Path("media/recordings")
RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
MEDIA_DIR = Path("media")
MEDIA_TYPES = [
"audio",
"documents",
"images",
"pdfs",
"presentations",
"spreadsheets",
"videos",
]
REGISTRATION_ENABLED = os.getenv("REGISTRATION_ENABLED", "true").lower() == "true"
DEFAULT_USER_EMAIL = os.getenv("DEFAULT_USER_EMAIL")
DEFAULT_USER_PASSWORD = os.getenv("DEFAULT_USER_PASSWORD")
DEFAULT_USER_FULL_NAME = os.getenv("DEFAULT_USER_FULL_NAME")
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

View File

@@ -1,3 +0,0 @@
from .common import insert, QUERIES, update
__all__ = ["insert", "QUERIES", "update"]

View File

@@ -0,0 +1,26 @@
# STL
from typing import AsyncGenerator
# PDM
from sqlalchemy.orm import declarative_base
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
# LOCAL
from api.backend.constants import DATABASE_URL
engine = create_async_engine(DATABASE_URL, echo=False, future=True)
AsyncSessionLocal = async_sessionmaker(
bind=engine,
autoflush=False,
autocommit=False,
expire_on_commit=False,
class_=AsyncSession,
)
Base = declarative_base()
async def get_db() -> AsyncGenerator[AsyncSession, None]:
async with AsyncSessionLocal() as session:
yield session

View File

@@ -1,92 +0,0 @@
import sqlite3
from typing import Any, Optional
from api.backend.constants import DATABASE_PATH
from api.backend.utils import format_json, format_sql_row_to_python
from api.backend.database.schema import INIT_QUERY
from api.backend.database.queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
import logging
LOG = logging.getLogger(__name__)
def connect():
connection = sqlite3.connect(DATABASE_PATH)
connection.set_trace_callback(print)
cursor = connection.cursor()
return cursor
def insert(query: str, values: tuple[Any, ...]):
connection = sqlite3.connect(DATABASE_PATH)
cursor = connection.cursor()
copy = list(values)
format_json(copy)
try:
_ = cursor.execute(query, copy)
connection.commit()
except sqlite3.Error as e:
LOG.error(f"An error occurred: {e}")
finally:
cursor.close()
connection.close()
def query(query: str, values: Optional[tuple[Any, ...]] = None):
connection = sqlite3.connect(DATABASE_PATH)
connection.row_factory = sqlite3.Row
cursor = connection.cursor()
rows = []
try:
if values:
_ = cursor.execute(query, values)
else:
_ = cursor.execute(query)
rows = cursor.fetchall()
finally:
cursor.close()
connection.close()
formatted_rows: list[dict[str, Any]] = []
for row in rows:
row = dict(row)
formatted_row = format_sql_row_to_python(row)
formatted_rows.append(formatted_row)
return formatted_rows
def update(query: str, values: Optional[tuple[Any, ...]] = None):
connection = sqlite3.connect(DATABASE_PATH)
cursor = connection.cursor()
copy = None
if values:
copy = list(values)
format_json(copy)
try:
if copy:
res = cursor.execute(query, copy)
else:
res = cursor.execute(query)
connection.commit()
return res.rowcount
except sqlite3.Error as e:
LOG.error(f"An error occurred: {e}")
finally:
cursor.close()
connection.close()
return 0
QUERIES = {
"init": INIT_QUERY,
"insert_job": JOB_INSERT_QUERY,
"delete_job": DELETE_JOB_QUERY,
}

View File

@@ -0,0 +1,65 @@
# PDM
from sqlalchemy import JSON, Column, String, Boolean, DateTime, ForeignKey, func
from sqlalchemy.orm import relationship
# LOCAL
from api.backend.database.base import Base
class User(Base):
__tablename__ = "users"
email = Column(String(255), primary_key=True, nullable=False)
hashed_password = Column(String(255), nullable=False)
full_name = Column(String(255), nullable=True)
disabled = Column(Boolean, default=False)
jobs = relationship("Job", back_populates="user_obj", cascade="all, delete-orphan")
cron_jobs = relationship(
"CronJob", back_populates="user_obj", cascade="all, delete-orphan"
)
class Job(Base):
__tablename__ = "jobs"
id = Column(String(64), primary_key=True, nullable=False)
url = Column(String(2048), nullable=False)
elements = Column(JSON, nullable=False)
user = Column(String(255), ForeignKey("users.email"), nullable=True)
time_created = Column(
DateTime(timezone=True), server_default=func.now(), nullable=False
)
result = Column(JSON, nullable=False)
status = Column(String(50), nullable=False)
chat = Column(JSON, nullable=True)
job_options = Column(JSON, nullable=True)
agent_mode = Column(Boolean, default=False, nullable=False)
prompt = Column(String(1024), nullable=True)
favorite = Column(Boolean, default=False, nullable=False)
user_obj = relationship("User", back_populates="jobs")
cron_jobs = relationship(
"CronJob", back_populates="job_obj", cascade="all, delete-orphan"
)
class CronJob(Base):
__tablename__ = "cron_jobs"
id = Column(String(64), primary_key=True, nullable=False)
user_email = Column(String(255), ForeignKey("users.email"), nullable=False)
job_id = Column(String(64), ForeignKey("jobs.id"), nullable=False)
cron_expression = Column(String(255), nullable=False)
time_created = Column(
DateTime(timezone=True), server_default=func.now(), nullable=False
)
time_updated = Column(
DateTime(timezone=True),
server_default=func.now(),
onupdate=func.now(),
nullable=False,
)
user_obj = relationship("User", back_populates="cron_jobs")
job_obj = relationship("Job", back_populates="cron_jobs")
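
A hedged sketch of the relationships declared above, eager-loading a user's jobs with selectinload; the helper name and email value are placeholders:

# Illustrative only: load a User together with its Job rows in one round trip.
from sqlalchemy import select
from sqlalchemy.orm import selectinload

from api.backend.database.base import AsyncSessionLocal
from api.backend.database.models import User

async def jobs_for_user(email: str):
    async with AsyncSessionLocal() as session:
        stmt = (
            select(User)
            .options(selectinload(User.jobs))
            .where(User.email == email)
        )
        result = await session.execute(stmt)
        user = result.scalars().first()
        # The jobs collection is already loaded, so it is safe to read after the session closes.
        return list(user.jobs) if user else []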

View File

@@ -1,3 +0,0 @@
from .queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
__all__ = ["JOB_INSERT_QUERY", "DELETE_JOB_QUERY"]

View File

@@ -0,0 +1,72 @@
# STL
import logging
from typing import Any
# PDM
from sqlalchemy import delete as sql_delete
from sqlalchemy import select
from sqlalchemy import update as sql_update
# LOCAL
from api.backend.database.base import AsyncSessionLocal
from api.backend.database.models import Job
LOG = logging.getLogger("Database")
async def insert_job(item: dict[str, Any]) -> None:
async with AsyncSessionLocal() as session:
job = Job(
id=item["id"],
url=item["url"],
elements=item["elements"],
user=item["user"],
time_created=item["time_created"],
result=item["result"],
status=item["status"],
chat=item["chat"],
job_options=item["job_options"],
agent_mode=item["agent_mode"],
prompt=item["prompt"],
)
session.add(job)
await session.commit()
LOG.info(f"Inserted item: {item}")
async def get_queued_job():
async with AsyncSessionLocal() as session:
stmt = (
select(Job)
.where(Job.status == "Queued")
.order_by(Job.time_created.desc())
.limit(1)
)
result = await session.execute(stmt)
job = result.scalars().first()
LOG.info(f"Got queued job: {job}")
return job
async def update_job(ids: list[str], updates: dict[str, Any]):
if not updates:
return
async with AsyncSessionLocal() as session:
stmt = sql_update(Job).where(Job.id.in_(ids)).values(**updates)
result = await session.execute(stmt)
await session.commit()
LOG.debug(f"Updated job count: {result.rowcount}")
async def delete_jobs(jobs: list[str]):
if not jobs:
LOG.info("No jobs to delete.")
return False
async with AsyncSessionLocal() as session:
stmt = sql_delete(Job).where(Job.id.in_(jobs))
result = await session.execute(stmt)
await session.commit()
LOG.info(f"Deleted jobs count: {result.rowcount}")
return result.rowcount
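
A rough sketch of a polling worker built on the helpers above; it is assumed to live in the same module, and the five-second interval and "Scraping" status value are placeholders:

# Illustrative only: claim the newest queued job, mark it, and sleep between polls.
import asyncio

async def worker_loop():
    while True:
        job = await get_queued_job()
        if job is not None:
            # update_job takes a list of ids and a dict of column updates.
            await update_job([job.id], {"status": "Scraping"})
        await asyncio.sleep(5)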

View File

@@ -1,9 +0,0 @@
JOB_INSERT_QUERY = """
INSERT INTO jobs
(id, url, elements, user, time_created, result, status, chat, job_options)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
DELETE_JOB_QUERY = """
DELETE FROM jobs WHERE id IN ()
"""

View File

@@ -0,0 +1,43 @@
# PDM
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession
# LOCAL
from api.backend.database.models import Job
async def average_elements_per_link(session: AsyncSession, user_email: str):
date_func = func.date(Job.time_created)
stmt = (
select(
date_func.label("date"),
func.avg(func.json_array_length(Job.elements)).label("average_elements"),
func.count().label("count"),
)
.where(Job.status == "Completed", Job.user == user_email)
.group_by(date_func)
.order_by("date")
)
result = await session.execute(stmt)
rows = result.all()
return [dict(row._mapping) for row in rows]
async def get_jobs_per_day(session: AsyncSession, user_email: str):
date_func = func.date(Job.time_created)
stmt = (
select(
date_func.label("date"),
func.count().label("job_count"),
)
.where(Job.status == "Completed", Job.user == user_email)
.group_by(date_func)
.order_by("date")
)
result = await session.execute(stmt)
rows = result.all()
return [dict(row._mapping) for row in rows]
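
A usage sketch for these aggregate helpers, assuming the module path used by the statistics router and a placeholder email:

# Illustrative only: both helpers return a list of dicts keyed by the labels in their select().
from api.backend.database.base import AsyncSessionLocal
from api.backend.database.queries.statistics.statistic_queries import (
    get_jobs_per_day,
    average_elements_per_link,
)

async def print_statistics(user_email: str = "user@example.com"):
    async with AsyncSessionLocal() as session:
        print(await average_elements_per_link(session, user_email))
        print(await get_jobs_per_day(session, user_email))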

View File

@@ -1,3 +0,0 @@
from .schema import INIT_QUERY
__all__ = ["INIT_QUERY"]

View File

@@ -1,30 +0,0 @@
INIT_QUERY = """
CREATE TABLE IF NOT EXISTS jobs (
id STRING PRIMARY KEY NOT NULL,
url STRING NOT NULL,
elements JSON NOT NULL,
user STRING,
time_created DATETIME NOT NULL,
result JSON NOT NULL,
status STRING NOT NULL,
chat JSON,
job_options JSON
);
CREATE TABLE IF NOT EXISTS users (
email STRING PRIMARY KEY NOT NULL,
hashed_password STRING NOT NULL,
full_name STRING,
disabled BOOLEAN
);
CREATE TABLE IF NOT EXISTS cron_jobs (
id STRING PRIMARY KEY NOT NULL,
user_email STRING NOT NULL,
job_id STRING NOT NULL,
cron_expression STRING NOT NULL,
time_created DATETIME NOT NULL,
time_updated DATETIME NOT NULL,
FOREIGN KEY (job_id) REFERENCES jobs(id)
);
"""

View File

@@ -1,43 +1,56 @@
import os
from api.backend.database.common import connect, QUERIES, insert
# STL
import logging
# PDM
from sqlalchemy.exc import IntegrityError
# LOCAL
from api.backend.constants import (
DEFAULT_USER_EMAIL,
REGISTRATION_ENABLED,
DEFAULT_USER_PASSWORD,
DEFAULT_USER_FULL_NAME,
)
from api.backend.database.base import Base, AsyncSessionLocal, engine
from api.backend.auth.auth_utils import get_password_hash
from api.backend.database.models import User
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Database")
async def init_database():
LOG.info("Creating database schema...")
def init_database():
cursor = connect()
async with engine.begin() as conn:
await conn.run_sync(Base.metadata.create_all)
for query in QUERIES["init"].strip().split(";"):
if query.strip():
LOG.info(f"Executing query: {query}")
_ = cursor.execute(query)
if not REGISTRATION_ENABLED:
default_user_email = DEFAULT_USER_EMAIL
default_user_password = DEFAULT_USER_PASSWORD
default_user_full_name = DEFAULT_USER_FULL_NAME
if os.environ.get("REGISTRATION_ENABLED", "True") == "False":
default_user_email = os.environ.get("DEFAULT_USER_EMAIL")
default_user_password = os.environ.get("DEFAULT_USER_PASSWORD")
default_user_full_name = os.environ.get("DEFAULT_USER_FULL_NAME")
if (
not default_user_email
or not default_user_password
or not default_user_full_name
):
LOG.error(
"DEFAULT_USER_EMAIL, DEFAULT_USER_PASSWORD, or DEFAULT_USER_FULL_NAME is not set!"
)
if not (default_user_email and default_user_password and default_user_full_name):
LOG.error("DEFAULT_USER_* env vars are not set!")
exit(1)
query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)"
_ = insert(
query,
(
default_user_email,
get_password_hash(default_user_password),
default_user_full_name,
),
)
async with AsyncSessionLocal() as session:
user = await session.get(User, default_user_email)
if user:
LOG.info("Default user already exists. Skipping creation.")
return
LOG.info("Creating default user...")
new_user = User(
email=default_user_email,
hashed_password=get_password_hash(default_user_password),
full_name=default_user_full_name,
disabled=False,
)
try:
session.add(new_user)
await session.commit()
LOG.info(f"Created default user: {default_user_email}")
except IntegrityError as e:
await session.rollback()
LOG.warning(f"Could not create default user (already exists?): {e}")
cursor.close()
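
A minimal sketch of invoking the async initializer from a synchronous entry point, assuming it runs in the same module:

# Illustrative only: create the schema (and the default user when registration is disabled) once at startup.
import asyncio

if __name__ == "__main__":
    asyncio.run(init_database())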

View File

@@ -0,0 +1,37 @@
# STL
import json
from typing import Any
from datetime import datetime
def format_list_for_query(ids: list[str]):
return (
f"({','.join(['?' for _ in ids])})" # Returns placeholders, e.g., "(?, ?, ?)"
)
def format_sql_row_to_python(row: dict[str, Any]):
new_row: dict[str, Any] = {}
for key, value in row.items():
if isinstance(value, str):
try:
new_row[key] = json.loads(value)
except json.JSONDecodeError:
new_row[key] = value
else:
new_row[key] = value
return new_row
def format_json(items: list[Any]):
for idx, item in enumerate(items):
if isinstance(item, (dict, list)):
formatted_item = json.dumps(item)
items[idx] = formatted_item
def parse_datetime(dt_str: str) -> datetime:
if dt_str.endswith("Z"):
dt_str = dt_str.replace("Z", "+00:00") # valid ISO format for UTC
return datetime.fromisoformat(dt_str)
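
A small sketch of parse_datetime handling the trailing "Z" form; the module path api.backend.utils is assumed from the imports elsewhere in this change:

# Illustrative only: "Z" suffixes are normalized to an explicit UTC offset before parsing.
from api.backend.utils import parse_datetime

dt = parse_datetime("2025-07-12T21:12:33Z")
print(dt.isoformat())  # 2025-07-12T21:12:33+00:00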

View File

@@ -1,17 +1,9 @@
from .job import (
insert,
update_job,
delete_jobs,
get_jobs_per_day,
get_queued_job,
average_elements_per_link,
)
# LOCAL
from .job import insert, update_job, delete_jobs, get_queued_job
__all__ = [
"insert",
"update_job",
"delete_jobs",
"get_jobs_per_day",
"get_queued_job",
"average_elements_per_link",
]

View File

@@ -1,78 +1,75 @@
import datetime
from typing import Any
# STL
import uuid
from api.backend.database.common import insert, query
from api.backend.models import CronJob
from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
from apscheduler.triggers.cron import CronTrigger # type: ignore
from api.backend.job import insert as insert_job
import logging
import datetime
from typing import Any, List
LOG = logging.getLogger("Cron Scheduler")
# PDM
from sqlalchemy import select
from apscheduler.triggers.cron import CronTrigger
from apscheduler.schedulers.asyncio import AsyncIOScheduler
# LOCAL
from api.backend.job import insert as insert_job
from api.backend.database.base import AsyncSessionLocal
from api.backend.database.models import Job, CronJob
LOG = logging.getLogger("Cron")
def insert_cron_job(cron_job: CronJob):
query = """
INSERT INTO cron_jobs (id, user_email, job_id, cron_expression, time_created, time_updated)
VALUES (?, ?, ?, ?, ?, ?)
"""
values = (
cron_job.id,
cron_job.user_email,
cron_job.job_id,
cron_job.cron_expression,
cron_job.time_created,
cron_job.time_updated,
)
insert(query, values)
async def insert_cron_job(cron_job: CronJob) -> bool:
async with AsyncSessionLocal() as session:
session.add(cron_job)
await session.commit()
return True
def delete_cron_job(id: str, user_email: str):
query = """
DELETE FROM cron_jobs
WHERE id = ? AND user_email = ?
"""
values = (id, user_email)
insert(query, values)
async def delete_cron_job(id: str, user_email: str) -> bool:
async with AsyncSessionLocal() as session:
stmt = select(CronJob).where(CronJob.id == id, CronJob.user_email == user_email)
result = await session.execute(stmt)
cron_job = result.scalars().first()
if cron_job:
await session.delete(cron_job)
await session.commit()
return True
def get_cron_jobs(user_email: str):
cron_jobs = query("SELECT * FROM cron_jobs WHERE user_email = ?", (user_email,))
return cron_jobs
async def get_cron_jobs(user_email: str) -> List[CronJob]:
async with AsyncSessionLocal() as session:
stmt = select(CronJob).where(CronJob.user_email == user_email)
result = await session.execute(stmt)
return list(result.scalars().all())
def get_all_cron_jobs():
cron_jobs = query("SELECT * FROM cron_jobs")
return cron_jobs
async def get_all_cron_jobs() -> List[CronJob]:
async with AsyncSessionLocal() as session:
stmt = select(CronJob)
result = await session.execute(stmt)
return list(result.scalars().all())
def insert_job_from_cron_job(job: dict[str, Any]):
insert_job(
{
**job,
"id": uuid.uuid4().hex,
"status": "Queued",
"result": "",
"chat": None,
"time_created": datetime.datetime.now(),
"time_updated": datetime.datetime.now(),
}
)
async def insert_job_from_cron_job(job: dict[str, Any]):
async with AsyncSessionLocal() as session:
await insert_job(
{
**job,
"id": uuid.uuid4().hex,
"status": "Queued",
"result": "",
"chat": None,
"time_created": datetime.datetime.now(datetime.timezone.utc),
"time_updated": datetime.datetime.now(datetime.timezone.utc),
},
session,
)
def get_cron_job_trigger(cron_expression: str):
expression_parts = cron_expression.split()
if len(expression_parts) != 5:
print(f"Invalid cron expression: {cron_expression}")
LOG.warning(f"Invalid cron expression: {cron_expression}")
return None
minute, hour, day, month, day_of_week = expression_parts
@@ -82,19 +79,37 @@ def get_cron_job_trigger(cron_expression: str):
)
def start_cron_scheduler(scheduler: BackgroundScheduler):
cron_jobs = get_all_cron_jobs()
async def start_cron_scheduler(scheduler: AsyncIOScheduler):
async with AsyncSessionLocal() as session:
stmt = select(CronJob)
result = await session.execute(stmt)
cron_jobs = result.scalars().all()
LOG.info(f"Cron jobs: {cron_jobs}")
LOG.info(f"Cron jobs: {cron_jobs}")
for job in cron_jobs:
queried_job = query("SELECT * FROM jobs WHERE id = ?", (job["job_id"],))
for cron_job in cron_jobs:
stmt = select(Job).where(Job.id == cron_job.job_id)
result = await session.execute(stmt)
queried_job = result.scalars().first()
LOG.info(f"Adding job: {queried_job}")
LOG.info(f"Adding job: {queried_job}")
scheduler.add_job(
insert_job_from_cron_job,
get_cron_job_trigger(job["cron_expression"]),
id=job["id"],
args=[queried_job[0]],
)
trigger = get_cron_job_trigger(cron_job.cron_expression) # type: ignore
if not trigger:
continue
job_dict = (
{
c.key: getattr(queried_job, c.key)
for c in queried_job.__table__.columns
}
if queried_job
else {}
)
scheduler.add_job(
insert_job_from_cron_job,
trigger,
id=cron_job.id,
args=[job_dict],
)
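
A short sketch of booting the scheduler at startup; the hook name is a placeholder and the import paths follow the ones used elsewhere in this change:

# Illustrative only: register persisted cron jobs, then start the AsyncIO scheduler on the running loop.
from api.backend.scheduler import scheduler
from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler

async def on_startup():
    await start_cron_scheduler(scheduler)
    scheduler.start()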

View File

@@ -1,97 +1,113 @@
# STL
import logging
import datetime
from typing import Any
# PDM
from sqlalchemy import delete as sql_delete
from sqlalchemy import select
from sqlalchemy import update as sql_update
from sqlalchemy.ext.asyncio import AsyncSession
# LOCAL
from api.backend.utils import format_list_for_query
from api.backend.database.common import (
insert as common_insert,
query as common_query,
QUERIES,
update as common_update,
)
from api.backend.database.base import AsyncSessionLocal
from api.backend.database.models import Job
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Job")
def insert(item: dict[str, Any]) -> None:
common_insert(
QUERIES["insert_job"],
(
async def insert(item: dict[str, Any], db: AsyncSession) -> None:
existing = await db.get(Job, item["id"])
if existing:
await multi_field_update_job(
item["id"],
item["url"],
item["elements"],
item["user"],
item["time_created"],
item["result"],
item["status"],
item["chat"],
item["job_options"],
),
{
"agent_mode": item["agent_mode"],
"prompt": item["prompt"],
"job_options": item["job_options"],
"elements": item["elements"],
"status": "Queued",
"result": [],
"time_created": datetime.datetime.now(datetime.timezone.utc),
"chat": None,
},
db,
)
return
job = Job(
id=item["id"],
url=item["url"],
elements=item["elements"],
user=item["user"],
time_created=datetime.datetime.now(datetime.timezone.utc),
result=item["result"],
status=item["status"],
chat=item["chat"],
job_options=item["job_options"],
agent_mode=item["agent_mode"],
prompt=item["prompt"],
)
LOG.info(f"Inserted item: {item}")
db.add(job)
await db.commit()
LOG.debug(f"Inserted item: {item}")
async def check_for_job_completion(id: str) -> dict[str, Any]:
async with AsyncSessionLocal() as session:
job = await session.get(Job, id)
return job.__dict__ if job else {}
async def get_queued_job():
query = (
"SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
)
res = common_query(query)
LOG.info(f"Got queued job: {res}")
return res[0] if res else None
async with AsyncSessionLocal() as session:
stmt = (
select(Job)
.where(Job.status == "Queued")
.order_by(Job.time_created.desc())
.limit(1)
)
result = await session.execute(stmt)
job = result.scalars().first()
LOG.debug(f"Got queued job: {job}")
return job.__dict__ if job else None
async def update_job(ids: list[str], field: str, value: Any):
query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
res = common_update(query, tuple([value] + ids))
LOG.info(f"Updated job: {res}")
async with AsyncSessionLocal() as session:
stmt = sql_update(Job).where(Job.id.in_(ids)).values({field: value})
res = await session.execute(stmt)
await session.commit()
LOG.debug(f"Updated job count: {res.rowcount}")
async def multi_field_update_job(
id: str, fields: dict[str, Any], session: AsyncSession | None = None
):
close_session = False
if not session:
session = AsyncSessionLocal()
close_session = True
try:
stmt = sql_update(Job).where(Job.id == id).values(**fields)
await session.execute(stmt)
await session.commit()
LOG.debug(f"Updated job {id} fields: {fields}")
finally:
if close_session:
await session.close()
async def delete_jobs(jobs: list[str]):
if not jobs:
LOG.info("No jobs to delete.")
LOG.debug("No jobs to delete.")
return False
query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}"
res = common_update(query, tuple(jobs))
return res > 0
async def average_elements_per_link(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
AVG(json_array_length(elements)) AS average_elements,
COUNT(*) AS count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = common_query(job_query, (user,))
return results
async def get_jobs_per_day(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
COUNT(*) AS job_count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = common_query(job_query, (user,))
return results
async with AsyncSessionLocal() as session:
stmt = sql_delete(Job).where(Job.id.in_(jobs))
res = await session.execute(stmt)
await session.commit()
LOG.debug(f"Deleted jobs: {res.rowcount}")
return res.rowcount > 0

View File

@@ -0,0 +1,280 @@
# STL
import csv
import uuid
import random
import logging
import datetime
from io import StringIO
# PDM
from fastapi import Depends, APIRouter
from sqlalchemy import select
from fastapi.encoders import jsonable_encoder
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from sqlalchemy.ext.asyncio import AsyncSession
from apscheduler.triggers.cron import CronTrigger # type: ignore
# LOCAL
from api.backend.job import insert, update_job, delete_jobs
from api.backend.constants import MEDIA_DIR, MEDIA_TYPES, RECORDINGS_DIR
from api.backend.scheduler import scheduler
from api.backend.schemas.job import Job, UpdateJobs, DownloadJob, DeleteScrapeJobs
from api.backend.auth.schemas import User
from api.backend.schemas.cron import CronJob as PydanticCronJob
from api.backend.schemas.cron import DeleteCronJob
from api.backend.database.base import get_db
from api.backend.auth.auth_utils import get_current_user
from api.backend.database.models import Job as DatabaseJob
from api.backend.database.models import CronJob
from api.backend.job.utils.text_utils import clean_text
from api.backend.job.models.job_options import FetchOptions
from api.backend.routers.handle_exceptions import handle_exceptions
from api.backend.job.utils.clean_job_format import clean_job_format
from api.backend.job.cron_scheduling.cron_scheduling import (
get_cron_jobs,
delete_cron_job,
insert_cron_job,
get_cron_job_trigger,
insert_job_from_cron_job,
)
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results
LOG = logging.getLogger("Job")
job_router = APIRouter()
@job_router.post("/update")
@handle_exceptions(logger=LOG)
async def update(update_jobs: UpdateJobs, _: User = Depends(get_current_user)):
await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)
return {"message": "Jobs updated successfully"}
@job_router.post("/submit-scrape-job")
@handle_exceptions(logger=LOG)
async def submit_scrape_job(job: Job, db: AsyncSession = Depends(get_db)):
LOG.info(f"Recieved job: {job}")
if not job.id:
job.id = uuid.uuid4().hex
job_dict = job.model_dump()
await insert(job_dict, db)
return JSONResponse(
content={"id": job.id, "message": "Job submitted successfully."}
)
@job_router.post("/retrieve-scrape-jobs")
@handle_exceptions(logger=LOG)
async def retrieve_scrape_jobs(
fetch_options: FetchOptions,
user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
):
LOG.info(
f"Retrieving jobs for account: {user.email if user.email else 'Guest User'}"
)
if fetch_options.chat:
stmt = select(DatabaseJob.chat).filter(DatabaseJob.user == user.email)
else:
stmt = select(DatabaseJob).filter(DatabaseJob.user == user.email)
results = await db.execute(stmt)
rows = results.all() if fetch_options.chat else results.scalars().all()
return JSONResponse(content=jsonable_encoder(rows[::-1]))
@job_router.get("/job/{id}")
@handle_exceptions(logger=LOG)
async def job(
id: str, user: User = Depends(get_current_user), db: AsyncSession = Depends(get_db)
):
LOG.info(f"Retrieving jobs for account: {user.email}")
stmt = select(DatabaseJob).filter(
DatabaseJob.user == user.email, DatabaseJob.id == id
)
results = await db.execute(stmt)
return JSONResponse(
content=jsonable_encoder([job.__dict__ for job in results.scalars().all()])
)
@job_router.post("/download")
@handle_exceptions(logger=LOG)
async def download(download_job: DownloadJob, db: AsyncSession = Depends(get_db)):
LOG.info(f"Downloading job with ids: {download_job.ids}")
stmt = select(DatabaseJob).where(DatabaseJob.id.in_(download_job.ids))
result = await db.execute(stmt)
results = [job.__dict__ for job in result.scalars().all()]
if download_job.job_format == "csv":
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
headers = [
"id",
"url",
"element_name",
"xpath",
"text",
"user",
"time_created",
]
csv_writer.writerow(headers)
for result in results:
for res in result["result"]:
for url, elements in res.items():
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", "")).strip()
if text:
csv_writer.writerow(
[
result.get("id", "")
+ "-"
+ str(random.randint(0, 1000000)),
url,
element_name,
value.get("xpath", ""),
text,
result.get("user", ""),
result.get("time_created", ""),
]
)
_ = csv_buffer.seek(0)
response = StreamingResponse(
csv_buffer,
media_type="text/csv",
)
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response
elif download_job.job_format == "md":
response = StreamingResponse(
stream_md_from_job_results(results),
media_type="text/markdown",
)
response.headers["Content-Disposition"] = "attachment; filename=export.md"
return response
@job_router.get("/job/{id}/convert-to-csv")
@handle_exceptions(logger=LOG)
async def convert_to_csv(id: str, db: AsyncSession = Depends(get_db)):
stmt = select(DatabaseJob).filter(DatabaseJob.id == id)
results = await db.execute(stmt)
jobs = results.scalars().all()
return JSONResponse(content=clean_job_format([job.__dict__ for job in jobs]))
@job_router.post("/delete-scrape-jobs")
@handle_exceptions(logger=LOG)
async def delete(delete_scrape_jobs: DeleteScrapeJobs):
result = await delete_jobs(delete_scrape_jobs.ids)
return (
JSONResponse(content={"message": "Jobs successfully deleted."})
if result
else JSONResponse(content={"error": "Jobs not deleted."})
)
@job_router.post("/schedule-cron-job")
@handle_exceptions(logger=LOG)
async def schedule_cron_job(
cron_job: PydanticCronJob,
db: AsyncSession = Depends(get_db),
):
if not cron_job.id:
cron_job.id = uuid.uuid4().hex
now = datetime.datetime.now()
if not cron_job.time_created:
cron_job.time_created = now
if not cron_job.time_updated:
cron_job.time_updated = now
await insert_cron_job(CronJob(**cron_job.model_dump()))
stmt = select(DatabaseJob).where(DatabaseJob.id == cron_job.job_id)
result = await db.execute(stmt)
queried_job = result.scalars().first()
if not queried_job:
return JSONResponse(status_code=404, content={"error": "Related job not found"})
scheduler.add_job(
insert_job_from_cron_job,
get_cron_job_trigger(cron_job.cron_expression),
id=cron_job.id,
args=[queried_job],
)
return JSONResponse(content={"message": "Cron job scheduled successfully."})
@job_router.post("/delete-cron-job")
@handle_exceptions(logger=LOG)
async def delete_cron_job_request(request: DeleteCronJob):
if not request.id:
return JSONResponse(
content={"error": "Cron job id is required."}, status_code=400
)
await delete_cron_job(request.id, request.user_email)
scheduler.remove_job(request.id)
return JSONResponse(content={"message": "Cron job deleted successfully."})
@job_router.get("/cron-jobs")
@handle_exceptions(logger=LOG)
async def get_cron_jobs_request(user: User = Depends(get_current_user)):
cron_jobs = await get_cron_jobs(user.email)
return JSONResponse(content=jsonable_encoder(cron_jobs))
@job_router.get("/recordings/{id}")
@handle_exceptions(logger=LOG)
async def get_recording(id: str):
path = RECORDINGS_DIR / f"{id}.mp4"
if not path.exists():
return JSONResponse(content={"error": "Recording not found."}, status_code=404)
return FileResponse(
path, headers={"Content-Type": "video/mp4", "Accept-Ranges": "bytes"}
)
@job_router.get("/get-media")
@handle_exceptions(logger=LOG)
async def get_media(id: str):
files: dict[str, list[str]] = {}
for media_type in MEDIA_TYPES:
path = MEDIA_DIR / media_type / f"{id}"
files[media_type] = [file.name for file in path.glob("*")]
return JSONResponse(content={"files": files})
@job_router.get("/media")
@handle_exceptions(logger=LOG)
async def get_media_file(id: str, type: str, file: str):
path = MEDIA_DIR / type / f"{id}" / file
if not path.exists():
return JSONResponse(content={"error": "Media file not found."}, status_code=404)
return FileResponse(path)
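
A hedged sketch of exercising the submit endpoint above with httpx; the base URL assumes the router is mounted without a prefix, and the payload values are placeholders:

# Illustrative only: JobOptions fields all have defaults, so an empty job_options object is accepted.
import asyncio

import httpx

async def submit_example():
    payload = {
        "url": "https://example.com",
        "elements": [{"name": "title", "xpath": "//h1"}],
        "job_options": {},
    }
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        response = await client.post("/submit-scrape-job", json=payload)
        print(response.json())  # {"id": "...", "message": "Job submitted successfully."}

asyncio.run(submit_example())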

View File

@@ -1,3 +1,5 @@
from .job_options import JobOptions
# LOCAL
from .job import Element, CapturedElement
from .job_options import Proxy, JobOptions
__all__ = ["JobOptions"]
__all__ = ["JobOptions", "CapturedElement", "Element", "Proxy"]

View File

@@ -0,0 +1,15 @@
from typing import Optional
import pydantic
class Element(pydantic.BaseModel):
name: str
xpath: str
url: Optional[str] = None
class CapturedElement(pydantic.BaseModel):
xpath: str
text: str
name: str

View File

@@ -1,8 +1,19 @@
from pydantic import BaseModel
# STL
from typing import Any, Optional
# PDM
from pydantic import BaseModel
# LOCAL
from api.backend.job.models.site_map import SiteMap
class Proxy(BaseModel):
server: str
username: Optional[str] = None
password: Optional[str] = None
class FetchOptions(BaseModel):
chat: Optional[bool] = None
@@ -10,6 +21,8 @@ class FetchOptions(BaseModel):
class JobOptions(BaseModel):
multi_page_scrape: bool = False
custom_headers: dict[str, Any] = {}
proxies: list[str] = []
proxies: list[Proxy] = []
site_map: Optional[SiteMap] = None
collect_media: bool = False
custom_cookies: list[dict[str, Any]] = []
return_html: bool = False

View File

@@ -0,0 +1,49 @@
# STL
import logging
from typing import Any, Optional
from urllib.parse import urlparse
# PDM
from playwright.async_api import Page, BrowserContext
LOG = logging.getLogger("Job")
async def add_custom_cookies(
custom_cookies: list[dict[str, Any]],
url: str,
context: BrowserContext,
) -> None:
parsed_url = urlparse(url)
domain = parsed_url.netloc
for cookie in custom_cookies:
cookie_dict = {
"name": cookie.get("name", ""),
"value": cookie.get("value", ""),
"domain": domain,
"path": "/",
}
LOG.info(f"Adding cookie: {cookie_dict}")
await context.add_cookies([cookie_dict]) # type: ignore
async def add_custom_headers(
custom_headers: dict[str, Any],
page: Page,
) -> None:
await page.set_extra_http_headers(custom_headers)
async def add_custom_items(
url: str,
page: Page,
cookies: Optional[list[dict[str, Any]]] = None,
headers: Optional[dict[str, Any]] = None,
) -> None:
if cookies:
await add_custom_cookies(cookies, url, page.context)
if headers:
await add_custom_headers(headers, page)
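
A hedged sketch of wiring add_custom_items into a plain Playwright session outside the scraper; the browser choice, URL, cookie, and header values are placeholders:

# Illustrative only: cookies are scoped to the target domain, headers apply to subsequent requests.
import asyncio

from playwright.async_api import async_playwright

from api.backend.job.scraping.add_custom import add_custom_items

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        await add_custom_items(
            url="https://example.com",
            page=page,
            cookies=[{"name": "session", "value": "abc"}],
            headers={"User-Agent": "example-agent"},
        )
        await page.goto("https://example.com")
        await browser.close()

asyncio.run(main())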

View File

@@ -1,20 +1,24 @@
# STL
import os
from pathlib import Path
from urllib.parse import urlparse
import re
import logging
from typing import Dict, List
from pathlib import Path
from urllib.parse import urljoin, urlparse
# PDM
import aiohttp
from playwright.async_api import Page
from api.backend.utils import LOG
LOG = logging.getLogger("Job")
async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
async def collect_media(id: str, page: Page) -> dict[str, list[dict[str, str]]]:
media_types = {
"images": "img",
"videos": "video",
"audio": "audio",
"pdfs": 'a[href$=".pdf"]',
"pdfs": 'a[href$=".pdf"], a[href*=".pdf#page="]',
"documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]',
"presentations": 'a[href$=".ppt"], a[href$=".pptx"]',
"spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]',
@@ -48,6 +52,11 @@ async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
root_domain = f"{root_url.scheme}://{root_url.netloc}"
url = f"{root_domain}{url}"
if url and re.match(r"^[\w\-]+/", url):
root_url = urlparse(page.url)
root_domain = f"{root_url.scheme}://{root_url.netloc}"
url = urljoin(root_domain + "/", url)
if url and url.startswith(("http://", "https://")):
try:
parsed = urlparse(url)
@@ -67,15 +76,20 @@ async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
}.get(media_type, "")
filename += ext
file_path = media_dir / filename
if not os.path.exists(media_dir / id):
os.makedirs(media_dir / id, exist_ok=True)
file_path = media_dir / id / f"{filename}"
async with session.get(url) as response:
response.raise_for_status()
with open(file_path, "wb") as f:
while True:
chunk = await response.content.read(8192)
if not chunk:
break
f.write(chunk)
urls.append({"url": url, "local_path": str(file_path)})

View File

@@ -1,83 +1,80 @@
import logging
# STL
import random
from typing import Any, Optional, cast
import logging
from typing import Any, cast
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup, Tag
# PDM
from bs4 import Tag, BeautifulSoup
from lxml import etree
from camoufox import AsyncCamoufox
from playwright.async_api import Page
from urllib.parse import urlparse, urljoin
from api.backend.models import Element, CapturedElement
from api.backend.job.scraping.scraping_utils import scrape_content
# LOCAL
from api.backend.constants import RECORDINGS_ENABLED
from api.backend.job.models import Element, CapturedElement
from api.backend.job.utils.text_utils import clean_text
from api.backend.job.scraping.add_custom import add_custom_items
from api.backend.job.scraping.scraping_utils import (
sxpath,
is_same_domain,
scrape_content,
)
from api.backend.job.site_mapping.site_mapping import handle_site_mapping
LOG = logging.getLogger(__name__)
def is_same_domain(url: str, original_url: str) -> bool:
parsed_url = urlparse(url)
parsed_original_url = urlparse(original_url)
return parsed_url.netloc == parsed_original_url.netloc or parsed_url.netloc == ""
def clean_xpath(xpath: str) -> str:
parts = xpath.split("/")
clean_parts = ["/" if part == "" else part for part in parts]
clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'")
LOG.info(f"Cleaned xpath: {clean_xpath}")
return clean_xpath
def sxpath(context: etree._Element, xpath: str):
return context.xpath(xpath)
LOG = logging.getLogger("Job")
async def make_site_request(
id: str,
url: str,
headers: Optional[dict[str, Any]],
multi_page_scrape: bool = False,
job_options: dict[str, Any],
visited_urls: set[str] = set(),
pages: set[tuple[str, str]] = set(),
original_url: str = "",
proxies: Optional[list[str]] = None,
site_map: Optional[dict[str, Any]] = None,
collect_media: bool = False,
):
headers = job_options["custom_headers"]
multi_page_scrape = job_options["multi_page_scrape"]
proxies = job_options["proxies"]
site_map = job_options["site_map"]
collect_media = job_options["collect_media"]
custom_cookies = job_options["custom_cookies"]
if url in visited_urls:
return
proxy = None
if proxies:
proxy = random.choice(proxies)
LOG.info(f"Using proxy: {proxy}")
async with AsyncCamoufox(headless=True, proxy=proxy) as browser:
async with AsyncCamoufox(headless=not RECORDINGS_ENABLED, proxy=proxy) as browser:
page: Page = await browser.new_page()
await page.set_viewport_size({"width": 1920, "height": 1080})
if headers:
await page.set_extra_http_headers(headers)
# Add cookies and headers
await add_custom_items(url, page, custom_cookies, headers)
LOG.info(f"Visiting URL: {url}")
try:
await page.goto(url, timeout=60000)
await page.wait_for_load_state("networkidle", timeout=10000)
await page.wait_for_load_state("networkidle")
final_url = page.url
visited_urls.add(url)
visited_urls.add(final_url)
html_content = await scrape_content(page, pages, collect_media)
html_content = await scrape_content(id, page, pages, collect_media)
html_content = await page.content()
pages.add((html_content, final_url))
if site_map:
await handle_site_mapping(
site_map, page, pages, collect_media=collect_media
id, site_map, page, pages, collect_media=collect_media
)
finally:
@@ -104,19 +101,18 @@ async def make_site_request(
if link not in visited_urls and is_same_domain(link, original_url):
await make_site_request(
id,
link,
headers=headers,
multi_page_scrape=multi_page_scrape,
job_options=job_options,
visited_urls=visited_urls,
pages=pages,
original_url=original_url,
proxies=proxies,
site_map=site_map,
collect_media=collect_media,
)
async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element]):
async def collect_scraped_elements(
page: tuple[str, str], xpaths: list[Element], return_html: bool
):
soup = BeautifulSoup(page[0], "lxml")
root = etree.HTML(str(soup))
@@ -126,12 +122,24 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
el = sxpath(root, elem.xpath)
for e in el: # type: ignore
if return_html:
elements[elem.name] = [
CapturedElement(
xpath=elem.xpath,
text=page[0],
name=elem.name,
)
]
continue
text = (
"\t".join(str(t) for t in e.itertext())
" ".join(str(t) for t in e.itertext())
if isinstance(e, etree._Element)
else str(e) # type: ignore
)
text = clean_text(text)
captured_element = CapturedElement(
xpath=elem.xpath, text=text, name=elem.name
)
@@ -145,32 +153,30 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
async def scrape(
id: str,
url: str,
xpaths: list[Element],
headers: Optional[dict[str, Any]] = None,
multi_page_scrape: bool = False,
proxies: Optional[list[str]] = None,
site_map: Optional[dict[str, Any]] = None,
collect_media: bool = False,
job_options: dict[str, Any],
):
visited_urls: set[str] = set()
pages: set[tuple[str, str]] = set()
await make_site_request(
id,
url,
headers=headers,
multi_page_scrape=multi_page_scrape,
job_options=job_options,
visited_urls=visited_urls,
pages=pages,
original_url=url,
proxies=proxies,
site_map=site_map,
collect_media=collect_media,
)
elements: list[dict[str, dict[str, list[CapturedElement]]]] = []
for page in pages:
elements.append(await collect_scraped_elements(page, xpaths))
elements.append(
await collect_scraped_elements(
page, xpaths, job_options.get("return_html", False)
)
)
return elements

View File

@@ -1,14 +1,21 @@
# STL
import asyncio
import logging
from typing import Set, Tuple
from urllib.parse import urlparse
# PDM
from lxml import etree
from playwright.async_api import Page
from api.backend.utils import LOG
# LOCAL
from api.backend.job.scraping.collect_media import collect_media as collect_media_utils
LOG = logging.getLogger("Job")
async def scrape_content(
page: Page, pages: Set[Tuple[str, str]], collect_media: bool
id: str, page: Page, pages: Set[Tuple[str, str]], collect_media: bool
) -> str:
last_height = await page.evaluate("document.body.scrollHeight")
@@ -27,6 +34,25 @@ async def scrape_content(
if collect_media:
LOG.info("Collecting media")
await collect_media_utils(page)
await collect_media_utils(id, page)
return html
def is_same_domain(url: str, original_url: str) -> bool:
parsed_url = urlparse(url)
parsed_original_url = urlparse(original_url)
return parsed_url.netloc == parsed_original_url.netloc or parsed_url.netloc == ""
def clean_xpath(xpath: str) -> str:
parts = xpath.split("/")
clean_parts = ["/" if part == "" else part for part in parts]
clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'")
LOG.info(f"Cleaned xpath: {clean_xpath}")
return clean_xpath
def sxpath(context: etree._Element, xpath: str):
return context.xpath(xpath)
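
A brief sketch of the helpers above applied to a standalone HTML snippet; the markup is illustrative:

# Illustrative only: sxpath is a thin wrapper around lxml's xpath(), is_same_domain compares netlocs.
from lxml import etree

from api.backend.job.scraping.scraping_utils import sxpath, is_same_domain

root = etree.HTML("<html><body><div class='title'>Hello</div></body></html>")
print([node.text for node in sxpath(root, "//div[@class='title']")])  # ['Hello']
print(is_same_domain("https://example.com/about", "https://example.com/"))  # True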

View File

@@ -1,14 +1,17 @@
import logging
# STL
import asyncio
import logging
from copy import deepcopy
from typing import Any
# PDM
from playwright.async_api import Page
# LOCAL
from api.backend.job.models.site_map import Action, SiteMap
from api.backend.job.scraping.scraping_utils import scrape_content
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Job")
def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]:
@@ -24,7 +27,6 @@ def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]:
async def handle_input(action: Action, page: Page) -> bool:
try:
element = page.locator(f"xpath={action.xpath}")
await element.wait_for(state="visible", timeout=10000)
LOG.info(f"Sending keys: {action.input} to element: {action.xpath}")
await element.fill(action.input)
return True
@@ -36,7 +38,6 @@ async def handle_input(action: Action, page: Page) -> bool:
async def handle_click(action: Action, page: Page) -> bool:
try:
element = page.locator(f"xpath={action.xpath}")
await element.wait_for(state="visible", timeout=10000)
LOG.info(f"Clicking element: {action.xpath}")
await element.click()
return True
@@ -52,6 +53,7 @@ ACTION_MAP = {
async def handle_site_mapping(
id: str,
site_map_dict: dict[str, Any],
page: Page,
pages: set[tuple[str, str]],
@@ -68,11 +70,11 @@ async def handle_site_mapping(
await asyncio.sleep(2)
await scrape_content(page, pages, collect_media=collect_media)
await scrape_content(id, page, pages, collect_media=collect_media)
cleared_site_map_dict = clear_done_actions(site_map_dict)
if cleared_site_map_dict["actions"]:
await handle_site_mapping(
cleared_site_map_dict, page, pages, collect_media=collect_media
id, cleared_site_map_dict, page, pages, collect_media=collect_media
)

View File

@@ -1,6 +1,8 @@
# STL
from typing import Any
from api.backend.utils import clean_text
# LOCAL
from api.backend.job.utils.text_utils import clean_text
def clean_job_format(jobs: list[dict[str, Any]]) -> dict[str, Any]:
@@ -26,7 +28,9 @@ def clean_job_format(jobs: list[dict[str, Any]]) -> dict[str, Any]:
"xpath": value.get("xpath", ""),
"text": text,
"user": job.get("user", ""),
"time_created": job.get("time_created", ""),
"time_created": job.get(
"time_created", ""
).isoformat(),
}
)

View File

@@ -1,6 +1,8 @@
# STL
from typing import Any
from api.backend.utils import clean_text
# LOCAL
from api.backend.job.utils.text_utils import clean_text
def stream_md_from_job_results(jobs: list[dict[str, Any]]):

View File

@@ -0,0 +1,10 @@
def clean_text(text: str):
text = text.strip()
text = text.replace("\n", " ")
text = text.replace("\t", " ")
text = text.replace("\r", " ")
text = text.replace("\f", " ")
text = text.replace("\v", " ")
text = text.replace("\b", " ")
text = text.replace("\a", " ")
return text
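
A quick sketch of the intended behaviour, assuming the module path used by the importers elsewhere in this change:

# Illustrative only: newlines and control characters collapse to single spaces; outer whitespace is stripped first.
from api.backend.job.utils.text_utils import clean_text

assert clean_text(" line one\nline two\t") == "line one line two"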

View File

@@ -0,0 +1,31 @@
# STL
import logging
import traceback
from typing import Any, Union, Callable, Awaitable
from functools import wraps
# PDM
from fastapi.responses import JSONResponse
def handle_exceptions(
logger: logging.Logger,
) -> Callable[
[Callable[..., Awaitable[Any]]], Callable[..., Awaitable[Union[Any, JSONResponse]]]
]:
def decorator(
func: Callable[..., Awaitable[Any]],
) -> Callable[..., Awaitable[Union[Any, JSONResponse]]]:
@wraps(func)
async def wrapper(*args: Any, **kwargs: Any) -> Union[Any, JSONResponse]:
try:
return await func(*args, **kwargs)
except Exception as e:
logger.error(f"Exception occurred: {e}")
traceback.print_exc()
return JSONResponse(content={"error": str(e)}, status_code=500)
return wrapper
return decorator
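
A usage sketch mirroring how the routers apply this decorator; the router and route names are placeholders:

# Illustrative only: the decorator sits between the FastAPI route registration and the handler.
import logging

from fastapi import APIRouter

from api.backend.routers.handle_exceptions import handle_exceptions

LOG = logging.getLogger("Example")
example_router = APIRouter()

@example_router.get("/example/boom")
@handle_exceptions(logger=LOG)
async def boom():
    # Any uncaught exception is logged with a traceback and returned as a 500 JSONResponse.
    raise ValueError("demonstration error")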

View File

@@ -1,233 +0,0 @@
# STL
import datetime
import uuid
import traceback
from io import StringIO
import csv
import logging
import random
# PDM
from fastapi import Depends, APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse, StreamingResponse
from api.backend.scheduler import scheduler
from apscheduler.triggers.cron import CronTrigger # type: ignore
# LOCAL
from api.backend.job import insert, update_job, delete_jobs
from api.backend.models import (
DeleteCronJob,
UpdateJobs,
DownloadJob,
DeleteScrapeJobs,
Job,
CronJob,
)
from api.backend.schemas import User
from api.backend.auth.auth_utils import get_current_user
from api.backend.utils import clean_text, format_list_for_query
from api.backend.job.models.job_options import FetchOptions
from api.backend.database.common import query
from api.backend.job.cron_scheduling.cron_scheduling import (
delete_cron_job,
get_cron_job_trigger,
insert_cron_job,
get_cron_jobs,
insert_job_from_cron_job,
)
from api.backend.job.utils.clean_job_format import clean_job_format
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results
LOG = logging.getLogger(__name__)
job_router = APIRouter()
@job_router.post("/update")
async def update(update_jobs: UpdateJobs, _: User = Depends(get_current_user)):
"""Used to update jobs"""
await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)
@job_router.post("/submit-scrape-job")
async def submit_scrape_job(job: Job):
LOG.info(f"Recieved job: {job}")
try:
job.id = uuid.uuid4().hex
job_dict = job.model_dump()
insert(job_dict)
return JSONResponse(content={"id": job.id})
except Exception as e:
LOG.error(f"Exception occurred: {traceback.format_exc()}")
return JSONResponse(content={"error": str(e)}, status_code=500)
@job_router.post("/retrieve-scrape-jobs")
async def retrieve_scrape_jobs(
fetch_options: FetchOptions, user: User = Depends(get_current_user)
):
LOG.info(f"Retrieving jobs for account: {user.email}")
ATTRIBUTES = "chat" if fetch_options.chat else "*"
try:
job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?"
results = query(job_query, (user.email,))
return JSONResponse(content=jsonable_encoder(results[::-1]))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
return JSONResponse(content=[], status_code=500)
@job_router.get("/job/{id}")
async def job(id: str, user: User = Depends(get_current_user)):
LOG.info(f"Retrieving jobs for account: {user.email}")
try:
job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?"
results = query(job_query, (user.email, id))
return JSONResponse(content=jsonable_encoder(results))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
return JSONResponse(content={"error": str(e)}, status_code=500)
@job_router.post("/download")
async def download(download_job: DownloadJob):
LOG.info(f"Downloading job with ids: {download_job.ids}")
try:
job_query = (
f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}"
)
results = query(job_query, tuple(download_job.ids))
if download_job.job_format == "csv":
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
headers = [
"id",
"url",
"element_name",
"xpath",
"text",
"user",
"time_created",
]
csv_writer.writerow(headers)
for result in results:
for res in result["result"]:
for url, elements in res.items():
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", "")).strip()
if text:
csv_writer.writerow(
[
result.get("id", "")
+ "-"
+ str(random.randint(0, 1000000)),
url,
element_name,
value.get("xpath", ""),
text,
result.get("user", ""),
result.get("time_created", ""),
]
)
_ = csv_buffer.seek(0)
response = StreamingResponse(
csv_buffer,
media_type="text/csv",
)
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response
elif download_job.job_format == "md":
response = StreamingResponse(
stream_md_from_job_results(results),
media_type="text/markdown",
)
response.headers["Content-Disposition"] = "attachment; filename=export.md"
return response
except Exception as e:
LOG.error(f"Exception occurred: {e}")
traceback.print_exc()
return {"error": str(e)}
@job_router.get("/job/{id}/convert-to-csv")
async def convert_to_csv(id: str):
try:
job_query = f"SELECT * FROM jobs WHERE id = ?"
results = query(job_query, (id,))
return JSONResponse(content=clean_job_format(results))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
traceback.print_exc()
return {"error": str(e)}
@job_router.post("/delete-scrape-jobs")
async def delete(delete_scrape_jobs: DeleteScrapeJobs):
result = await delete_jobs(delete_scrape_jobs.ids)
return (
JSONResponse(content={"message": "Jobs successfully deleted."})
if result
else JSONResponse({"error": "Jobs not deleted."})
)
@job_router.post("/schedule-cron-job")
async def schedule_cron_job(cron_job: CronJob):
if not cron_job.id:
cron_job.id = uuid.uuid4().hex
if not cron_job.time_created:
cron_job.time_created = datetime.datetime.now()
if not cron_job.time_updated:
cron_job.time_updated = datetime.datetime.now()
insert_cron_job(cron_job)
queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,))
scheduler.add_job(
insert_job_from_cron_job,
get_cron_job_trigger(cron_job.cron_expression),
id=cron_job.id,
args=[queried_job[0]],
)
return JSONResponse(content={"message": "Cron job scheduled successfully."})
@job_router.post("/delete-cron-job")
async def delete_cron_job_request(request: DeleteCronJob):
if not request.id:
return JSONResponse(
content={"error": "Cron job id is required."}, status_code=400
)
delete_cron_job(request.id, request.user_email)
scheduler.remove_job(request.id)
return JSONResponse(content={"message": "Cron job deleted successfully."})
@job_router.get("/cron-jobs")
async def get_cron_jobs_request(user: User = Depends(get_current_user)):
cron_jobs = get_cron_jobs(user.email)
return JSONResponse(content=jsonable_encoder(cron_jobs))

View File

@@ -1,29 +0,0 @@
# STL
import logging
# PDM
from fastapi import APIRouter, Depends
# LOCAL
from api.backend.job import (
get_jobs_per_day,
average_elements_per_link,
)
from api.backend.auth.auth_utils import get_current_user
from api.backend.schemas import User
LOG = logging.getLogger(__name__)
stats_router = APIRouter()
@stats_router.get("/statistics/get-average-element-per-link")
async def get_average_element_per_link(user: User = Depends(get_current_user)):
return await average_elements_per_link(user.email)
@stats_router.get("/statistics/get-average-jobs-per-day")
async def average_jobs_per_day(user: User = Depends(get_current_user)):
data = await get_jobs_per_day(user.email)
return data

View File

@@ -1,3 +1,4 @@
from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
# PDM
from apscheduler.schedulers.asyncio import AsyncIOScheduler
scheduler = BackgroundScheduler()
scheduler = AsyncIOScheduler()

View File

@@ -0,0 +1,17 @@
from typing import Optional, Union
from datetime import datetime
import pydantic
class CronJob(pydantic.BaseModel):
id: Optional[str] = None
user_email: str
job_id: str
cron_expression: str
time_created: Optional[Union[datetime, str]] = None
time_updated: Optional[Union[datetime, str]] = None
class DeleteCronJob(pydantic.BaseModel):
id: str
user_email: str

View File

@@ -1,24 +1,24 @@
# STL
from typing import Any, Literal, Optional, Union
from datetime import datetime
# LOCAL
from api.backend.job.models.job_options import JobOptions
# PDM
import pydantic
class Element(pydantic.BaseModel):
name: str
xpath: str
url: Optional[str] = None
from api.backend.job.models import Element, CapturedElement
class CapturedElement(pydantic.BaseModel):
xpath: str
text: str
name: str
class Job(pydantic.BaseModel):
id: Optional[str] = None
url: str
elements: list[Element]
user: str = ""
time_created: Optional[Union[datetime, str]] = None
result: list[dict[str, dict[str, list[CapturedElement]]]] = []
job_options: JobOptions
status: str = "Queued"
chat: Optional[str] = None
agent_mode: bool = False
prompt: Optional[str] = None
favorite: bool = False
class RetrieveScrapeJobs(pydantic.BaseModel):
@@ -34,41 +34,7 @@ class DeleteScrapeJobs(pydantic.BaseModel):
ids: list[str]
class GetStatistics(pydantic.BaseModel):
user: str
class UpdateJobs(pydantic.BaseModel):
ids: list[str]
field: str
value: Any
class AI(pydantic.BaseModel):
messages: list[Any]
class Job(pydantic.BaseModel):
id: Optional[str] = None
url: str
elements: list[Element]
user: str = ""
time_created: Optional[Union[datetime, str]] = None
result: list[dict[str, dict[str, list[CapturedElement]]]] = []
job_options: JobOptions
status: str = "Queued"
chat: Optional[str] = None
class CronJob(pydantic.BaseModel):
id: Optional[str] = None
user_email: str
job_id: str
cron_expression: str
time_created: Optional[Union[datetime, str]] = None
time_updated: Optional[Union[datetime, str]] = None
class DeleteCronJob(pydantic.BaseModel):
id: str
user_email: str

View File

@@ -0,0 +1,37 @@
# STL
import logging
# PDM
from fastapi import Depends, APIRouter
from sqlalchemy.ext.asyncio import AsyncSession
# LOCAL
from api.backend.auth.schemas import User
from api.backend.database.base import get_db
from api.backend.auth.auth_utils import get_current_user
from api.backend.routers.handle_exceptions import handle_exceptions
from api.backend.database.queries.statistics.statistic_queries import (
get_jobs_per_day,
average_elements_per_link,
)
LOG = logging.getLogger("Statistics")
stats_router = APIRouter()
@stats_router.get("/statistics/get-average-element-per-link")
@handle_exceptions(logger=LOG)
async def get_average_element_per_link(
user: User = Depends(get_current_user), db: AsyncSession = Depends(get_db)
):
return await average_elements_per_link(db, user.email)
@stats_router.get("/statistics/get-average-jobs-per-day")
@handle_exceptions(logger=LOG)
async def average_jobs_per_day(
user: User = Depends(get_current_user), db: AsyncSession = Depends(get_db)
):
data = await get_jobs_per_day(db, user.email)
return data

View File

@@ -0,0 +1,108 @@
# STL
import os
import asyncio
from typing import Any, Generator, AsyncGenerator
# PDM
import pytest
import pytest_asyncio
from httpx import AsyncClient, ASGITransport
from proxy import Proxy
from sqlalchemy import text
from sqlalchemy.pool import NullPool
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
# LOCAL
from api.backend.app import app
from api.backend.database.base import get_db
from api.backend.database.models import Base
from api.backend.tests.constants import TEST_DB_PATH
@pytest.fixture(scope="session", autouse=True)
def running_proxy():
proxy = Proxy(["--hostname", "127.0.0.1", "--port", "8080"])
proxy.setup()
yield proxy
proxy.shutdown()
@pytest.fixture(scope="session")
def test_db_path() -> str:
return TEST_DB_PATH
@pytest.fixture(scope="session", autouse=True)
def test_db(test_db_path: str) -> Generator[str, None, None]:
"""Create a fresh test database for each test function."""
os.makedirs(os.path.dirname(test_db_path), exist_ok=True)
if os.path.exists(test_db_path):
os.remove(test_db_path)
# Create async engine for test database
test_db_url = f"sqlite+aiosqlite:///{test_db_path}"
engine = create_async_engine(test_db_url, echo=False)
async def setup_db():
async with engine.begin() as conn:
# Create tables
# LOCAL
from api.backend.database.models import Base
await conn.run_sync(Base.metadata.create_all)
# Run setup
asyncio.run(setup_db())
yield test_db_path
if os.path.exists(test_db_path):
os.remove(test_db_path)
@pytest_asyncio.fixture(scope="session")
async def test_engine():
test_db_url = f"sqlite+aiosqlite:///{TEST_DB_PATH}"
engine = create_async_engine(test_db_url, poolclass=NullPool)
async with engine.begin() as conn:
await conn.run_sync(Base.metadata.create_all)
yield engine
await engine.dispose()
@pytest_asyncio.fixture(scope="function")
async def db_session(test_engine: Any) -> AsyncGenerator[AsyncSession, None]:
async_session = async_sessionmaker(
bind=test_engine,
class_=AsyncSession,
expire_on_commit=False,
)
async with async_session() as session:
try:
yield session
finally:
# Truncate all tables after each test
for table in reversed(Base.metadata.sorted_tables):
await session.execute(text(f"DELETE FROM {table.name}"))
await session.commit()
@pytest.fixture()
def override_get_db(db_session: AsyncSession):
async def _override() -> AsyncGenerator[AsyncSession, None]:
yield db_session
return _override
@pytest_asyncio.fixture()
async def client(override_get_db: Any) -> AsyncGenerator[AsyncClient, None]:
app.dependency_overrides[get_db] = override_get_db
transport = ASGITransport(app=app)
async with AsyncClient(transport=transport, base_url="http://test") as c:
yield c
app.dependency_overrides.clear()

View File

@@ -0,0 +1 @@
TEST_DB_PATH = "tests/test_db.sqlite"

View File

@@ -1,7 +1,13 @@
from api.backend.models import Element, Job, JobOptions, CapturedElement
# STL
import uuid
# PDM
from faker import Faker
# LOCAL
from api.backend.job.models import Element, JobOptions, CapturedElement
from api.backend.schemas.job import Job
fake = Faker()

View File

@@ -1,40 +1,65 @@
# STL
import random
from datetime import datetime, timezone
# PDM
import pytest
from fastapi.testclient import TestClient
from unittest.mock import AsyncMock, patch
from api.backend.app import app
from api.backend.models import DownloadJob
from api.backend.tests.factories.job_factory import create_completed_job
from httpx import AsyncClient
from sqlalchemy.ext.asyncio import AsyncSession
client = TestClient(app)
# LOCAL
from api.backend.schemas.job import DownloadJob
from api.backend.database.models import Job
mocked_job = create_completed_job().model_dump()
mock_results = [mocked_job]
mocked_random_int = 123456
@pytest.mark.asyncio
@patch("api.backend.routers.job_router.query")
@patch("api.backend.routers.job_router.random.randint")
async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
# Ensure the mock returns immediately
mock_query.return_value = mock_results
mock_randint.return_value = mocked_random_int
async def test_download(client: AsyncClient, db_session: AsyncSession):
# Insert a test job into the DB
job_id = "test-job-id"
test_job = Job(
id=job_id,
url="https://example.com",
elements=[],
user="test@example.com",
time_created=datetime.now(timezone.utc),
result=[
{
"https://example.com": {
"element_name": [{"xpath": "//div", "text": "example"}]
}
}
],
status="Completed",
chat=None,
job_options={},
agent_mode=False,
prompt="",
favorite=False,
)
db_session.add(test_job)
await db_session.commit()
# Create a DownloadJob instance
download_job = DownloadJob(ids=[mocked_job["id"]], job_format="csv")
# Force predictable randint
random.seed(0)
# Make a POST request to the /download endpoint
response = client.post("/download", json=download_job.model_dump())
# Build request
download_job = DownloadJob(ids=[job_id], job_format="csv")
response = await client.post("/download", json=download_job.model_dump())
# Assertions
assert response.status_code == 200
assert response.headers["Content-Disposition"] == "attachment; filename=export.csv"
# Check the content of the CSV
# Validate CSV contents
csv_content = response.content.decode("utf-8")
expected_csv = (
f'"id","url","element_name","xpath","text","user","time_created"\r\n'
f'"{mocked_job["id"]}-{mocked_random_int}","https://example.com","element_name","//div","example",'
f'"{mocked_job["user"]}","{mocked_job["time_created"]}"\r\n'
lines = csv_content.strip().split("\n")
assert (
lines[0].strip()
== '"id","url","element_name","xpath","text","user","time_created"'
)
assert csv_content == expected_csv
assert '"https://example.com"' in lines[1]
assert '"element_name"' in lines[1]
assert '"//div"' in lines[1]
assert '"example"' in lines[1]

View File

@@ -1,25 +1,117 @@
import pytest
# STL
import logging
from playwright.async_api import async_playwright, Error
from typing import Dict
from datetime import datetime
# PDM
import pytest
from httpx import AsyncClient
from sqlalchemy import select
from fastapi.testclient import TestClient
from playwright.async_api import Route, Cookie, async_playwright
from sqlalchemy.ext.asyncio import AsyncSession
# LOCAL
from api.backend.app import app
from api.backend.job.models import Proxy, Element, JobOptions
from api.backend.schemas.job import Job
from api.backend.database.models import Job as JobModel
from api.backend.job.scraping.add_custom import add_custom_items
logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger(__name__)
client = TestClient(app)
@pytest.mark.asyncio
async def test_proxy():
proxy = "127.0.0.1:8080"
async def test_add_custom_items():
test_cookies = [{"name": "big", "value": "cookie"}]
test_headers = {"User-Agent": "test-agent", "Accept": "application/json"}
async with async_playwright() as p:
browser = await p.firefox.launch(
headless=True, proxy={"server": f"http://{proxy}"}
)
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
with pytest.raises(Error) as excinfo:
await page.goto("http://example.com")
# Set up request interception
captured_headers: Dict[str, str] = {}
assert "NS_ERROR_PROXY_CONNECTION_REFUSED" in str(excinfo.value)
async def handle_route(route: Route) -> None:
nonlocal captured_headers
captured_headers = route.request.headers
await route.continue_()
await page.route("**/*", handle_route)
await add_custom_items(
url="http://example.com",
page=page,
cookies=test_cookies,
headers=test_headers,
)
# Navigate to example.com
await page.goto("http://example.com")
# Verify cookies were added
cookies: list[Cookie] = await page.context.cookies()
test_cookie = next((c for c in cookies if c.get("name") == "big"), None)
assert test_cookie is not None
assert test_cookie.get("value") == "cookie"
assert test_cookie.get("path") == "/" # Default path should be set
assert test_cookie.get("sameSite") == "Lax" # Default sameSite should be set
# Verify headers were added
assert captured_headers.get("user-agent") == "test-agent"
await browser.close()
@pytest.mark.asyncio
async def test_proxies(client: AsyncClient, db_session: AsyncSession):
job = Job(
url="https://example.com",
elements=[Element(xpath="//div", name="test")],
job_options=JobOptions(
proxies=[
Proxy(
server="127.0.0.1:8080",
username="user",
password="pass",
)
],
),
time_created=datetime.now().isoformat(),
)
response = await client.post("/submit-scrape-job", json=job.model_dump())
assert response.status_code == 200
stmt = select(JobModel)
result = await db_session.execute(stmt)
jobs = result.scalars().all()
assert len(jobs) > 0
job_from_db = jobs[0]
job_dict = job_from_db.__dict__
job_dict.pop("_sa_instance_state", None)
assert job_dict is not None
print(job_dict)
assert job_dict["job_options"]["proxies"] == [
{
"server": "127.0.0.1:8080",
"username": "user",
"password": "pass",
}
]
# Verify the job was stored correctly in the database
assert job_dict["url"] == "https://example.com"
assert job_dict["status"] == "Queued"
assert len(job_dict["elements"]) == 1
assert job_dict["elements"][0]["xpath"] == "//div"
assert job_dict["elements"][0]["name"] == "test"

View File

@@ -0,0 +1,17 @@
# STL
import sqlite3
# LOCAL
from api.backend.database.schema import INIT_QUERY
from api.backend.tests.constants import TEST_DB_PATH
def connect_to_db():
conn = sqlite3.connect(TEST_DB_PATH)
cur = conn.cursor()
for query in INIT_QUERY.split(";"):
cur.execute(query)
conn.commit()
return conn, cur
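A brief usage sketch for this helper (illustrative only): it hands back a live connection and cursor against the initialized test database, and the caller is responsible for closing the connection.
conn, cur = connect_to_db()
cur.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
print(cur.fetchall())  # tables created by INIT_QUERY
conn.close()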

View File

@@ -1,17 +1,10 @@
from typing import Any, Optional
# STL
import logging
import json
from typing import Optional
LOG = logging.getLogger(__name__)
def clean_text(text: str):
text = text.replace("\r\n", "\n") # Normalize newlines
text = text.replace("\n", "\\n") # Escape newlines
text = text.replace('"', '\\"') # Escape double quotes
return text
def get_log_level(level_name: Optional[str]) -> int:
level = logging.INFO
@@ -20,30 +13,3 @@ def get_log_level(level_name: Optional[str]) -> int:
level = getattr(logging, level_name, logging.INFO)
return level
def format_list_for_query(ids: list[str]):
return (
f"({','.join(['?' for _ in ids])})" # Returns placeholders, e.g., "(?, ?, ?)"
)
def format_sql_row_to_python(row: dict[str, Any]):
new_row: dict[str, Any] = {}
for key, value in row.items():
if isinstance(value, str):
try:
new_row[key] = json.loads(value)
except json.JSONDecodeError:
new_row[key] = value
else:
new_row[key] = value
return new_row
def format_json(items: list[Any]):
for idx, item in enumerate(items):
if isinstance(item, (dict, list)):
formatted_item = json.dumps(item)
items[idx] = formatted_item
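After this cleanup only clean_text and get_log_level remain; two illustrative calls follow, with results based on the logic visible in this hunk and assuming the elided lines add no further transformation.
clean_text('row1\r\nrow2 "quoted"')  # -> 'row1\\nrow2 \\"quoted\\"'
get_log_level("DEBUG")  # -> logging.DEBUG; unknown names fall back to logging.INFO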

View File

@@ -0,0 +1,17 @@
# STL
import os
from pathlib import Path
NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "")
NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "")
SCRAPERR_FRONTEND_URL = os.getenv("SCRAPERR_FRONTEND_URL", "")
EMAIL = os.getenv("EMAIL", "")
TO = os.getenv("TO", "")
SMTP_HOST = os.getenv("SMTP_HOST", "")
SMTP_PORT = int(os.getenv("SMTP_PORT", 587))
SMTP_USER = os.getenv("SMTP_USER", "")
SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
USE_TLS = os.getenv("USE_TLS", "false").lower() == "true"
RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
RECORDINGS_DIR = Path("/project/app/media/recordings")

View File

@@ -1,48 +1,88 @@
import os
from api.backend.job import get_queued_job, update_job
from api.backend.scraping import scrape
from api.backend.models import Element
from fastapi.encoders import jsonable_encoder
# STL
import json
import asyncio
import traceback
import subprocess
from api.backend.database.startup import init_database
# PDM
from fastapi.encoders import jsonable_encoder
from api.backend.worker.post_job_complete.post_job_complete import post_job_complete
# LOCAL
from api.backend.job import update_job, get_queued_job
from api.backend.job.models import Element
from api.backend.worker.logger import LOG
NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "")
NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "")
SCRAPERR_FRONTEND_URL = os.getenv("SCRAPERR_FRONTEND_URL", "")
EMAIL = os.getenv("EMAIL", "")
TO = os.getenv("TO", "")
SMTP_HOST = os.getenv("SMTP_HOST", "")
SMTP_PORT = int(os.getenv("SMTP_PORT", 587))
SMTP_USER = os.getenv("SMTP_USER", "")
SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
USE_TLS = os.getenv("USE_TLS", "false").lower() == "true"
from api.backend.ai.agent.agent import scrape_with_agent
from api.backend.worker.constants import (
TO,
EMAIL,
USE_TLS,
SMTP_HOST,
SMTP_PORT,
SMTP_USER,
SMTP_PASSWORD,
RECORDINGS_DIR,
RECORDINGS_ENABLED,
NOTIFICATION_CHANNEL,
SCRAPERR_FRONTEND_URL,
NOTIFICATION_WEBHOOK_URL,
)
from api.backend.job.scraping.scraping import scrape
from api.backend.worker.post_job_complete.post_job_complete import post_job_complete
async def process_job():
job = await get_queued_job()
ffmpeg_proc = None
status = "Queued"
if job:
LOG.info(f"Beginning processing job: {job}.")
try:
output_path = RECORDINGS_DIR / f"{job['id']}.mp4"
if RECORDINGS_ENABLED:
ffmpeg_proc = subprocess.Popen(
[
"ffmpeg",
"-y",
"-video_size",
"1280x1024",
"-framerate",
"15",
"-f",
"x11grab",
"-i",
":99",
"-codec:v",
"libx264",
"-preset",
"ultrafast",
output_path,
]
)
_ = await update_job([job["id"]], field="status", value="Scraping")
scraped = await scrape(
job["url"],
[Element(**j) for j in job["elements"]],
job["job_options"]["custom_headers"],
job["job_options"]["multi_page_scrape"],
job["job_options"]["proxies"],
job["job_options"]["site_map"],
job["job_options"]["collect_media"],
)
proxies = job["job_options"]["proxies"]
if proxies and isinstance(proxies[0], str) and proxies[0].startswith("{"):
try:
proxies = [json.loads(p) for p in proxies]
except json.JSONDecodeError:
LOG.error(f"Failed to parse proxy JSON: {proxies}")
proxies = []
if job["agent_mode"]:
scraped = await scrape_with_agent(job)
else:
scraped = await scrape(
job["id"],
job["url"],
[Element(**j) for j in job["elements"]],
{**job["job_options"], "proxies": proxies},
)
LOG.info(
f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
)
@@ -75,11 +115,15 @@ async def process_job():
},
)
if ffmpeg_proc:
ffmpeg_proc.terminate()
ffmpeg_proc.wait()
async def main():
LOG.info("Starting job worker...")
init_database()
RECORDINGS_DIR.mkdir(parents=True, exist_ok=True)
while True:
await process_job()
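The module's actual entrypoint falls outside this hunk; a reasonable guess at how the polling loop is launched (an assumption, not shown in the diff) is:
if __name__ == "__main__":
    asyncio.run(main())  # assumed entrypoint; runs process_job() in a loop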

View File

@@ -1,12 +1,13 @@
# STL
import logging
import os
from api.backend.utils import get_log_level
# LOCAL
from api.backend.app import LOG_LEVEL
logging.basicConfig(
level=get_log_level(os.getenv("LOG_LEVEL")),
format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
level=LOG_LEVEL,
format="%(levelname)s: %(asctime)s - [%(name)s] - %(message)s",
handlers=[logging.StreamHandler()],
)
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Job Worker")

View File

@@ -1,5 +1,7 @@
# STL
from typing import Any
# LOCAL
from api.backend.worker.post_job_complete.models import PostJobCompleteOptions
from api.backend.worker.post_job_complete.email_notifcation import (
send_job_complete_email,
@@ -16,9 +18,10 @@ async def post_job_complete(job: dict[str, Any], options: PostJobCompleteOptions
if not options.values():
return
if options["channel"] == "discord":
discord_notification(job, options)
elif options["channel"] == "email":
send_job_complete_email(job, options)
else:
raise ValueError(f"Invalid channel: {options['channel']}")
match options["channel"]:
case "discord":
discord_notification(job, options)
case "email":
send_job_complete_email(job, options)
case _:
raise ValueError(f"Invalid channel: {options['channel']}")

View File

@@ -0,0 +1,23 @@
describe("Global setup", () => {
it("signs up user once", () => {
cy.request({
method: "POST",
url: "/api/signup",
body: JSON.stringify({
data: {
email: "test@test.com",
password: "password",
full_name: "John Doe",
},
}),
headers: {
"Content-Type": "application/json",
},
failOnStatusCode: false,
}).then((response) => {
if (response.status !== 200 && response.status !== 201) {
console.warn("Signup failed:", response.status, response.body);
}
});
});
});

View File

@@ -0,0 +1,101 @@
import { login } from "../utilities/authentication.utils";
import {
addCustomHeaders,
addElement,
addMedia,
addSiteMapAction,
checkForMedia,
cleanUpJobs,
enterJobUrl,
openAdvancedJobOptions,
submitBasicJob,
submitJob,
waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockSubmitJob } from "../utilities/mocks";
describe.only("Advanced Job Options", () => {
beforeEach(() => {
mockSubmitJob();
login();
cy.visit("/");
});
afterEach(() => {
cleanUpJobs();
});
it.only("should handle custom headers", () => {
const customHeaders = {
"User-Agent": "Test Agent",
"Accept-Language": "en-US",
};
addCustomHeaders(customHeaders);
submitBasicJob("https://httpbin.org/headers", "headers", "//pre");
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
expect(
interception.request?.body.data.job_options.custom_headers
).to.deep.equal(customHeaders);
});
waitForJobCompletion("https://httpbin.org/headers");
});
it("should handle site map actions", () => {
addSiteMapAction("click", "//button[contains(text(), 'Load More')]");
addSiteMapAction("input", "//input[@type='search']", "test search");
submitBasicJob("https://example.com", "content", "//div[@class='content']");
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
const siteMap = interception.request?.body.data.job_options.site_map;
expect(siteMap.actions).to.have.length(2);
expect(siteMap.actions[0].type).to.equal("click");
expect(siteMap.actions[1].type).to.equal("input");
});
waitForJobCompletion("https://example.com");
});
it("should handle multiple elements", () => {
enterJobUrl("https://books.toscrape.com");
addElement("titles", "//h3");
addElement("prices", "//p[@class='price_color']");
submitJob();
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
expect(interception.request?.body.data.elements).to.have.length(2);
});
waitForJobCompletion("https://books.toscrape.com");
});
it.only("should handle collecting media", () => {
enterJobUrl("https://books.toscrape.com");
openAdvancedJobOptions();
addMedia();
cy.get("body").type("{esc}");
addElement("images", "//img");
submitJob();
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
expect(interception.request?.body.data.job_options.collect_media).to.be
.true;
});
waitForJobCompletion("https://books.toscrape.com");
checkForMedia();
});
});

cypress/e2e/agent.cy.ts (new file, 38 lines)
View File

@@ -0,0 +1,38 @@
import { login } from "../utilities/authentication.utils";
import {
buildAgentJob,
cleanUpJobs,
submitJob,
waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockSubmitJob } from "../utilities/mocks";
describe("Agent", () => {
beforeEach(() => {
mockSubmitJob();
login();
});
afterEach(() => {
cleanUpJobs();
});
it("should be able to scrape some data", () => {
cy.visit("/agent");
cy.wait(1000);
const url = "https://books.toscrape.com";
const prompt = "Collect all the links on the page";
buildAgentJob(url, prompt);
submitJob();
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
expect(interception.request?.body.data.url).to.eq(url);
expect(interception.request?.body.data.prompt).to.eq(prompt);
});
waitForJobCompletion("https://books.toscrape.com");
});
});

View File

@@ -1,60 +1,61 @@
describe("Authentication", () => {
it("should register", () => {
cy.intercept("POST", "/api/signup").as("signup");
import { faker } from "@faker-js/faker";
import { mockLogin, mockSignup } from "../utilities/mocks";
cy.visit("/").then(() => {
cy.get("button").contains("Login").click();
cy.url().should("include", "/login");
const mockEmail = faker.internet.email();
const mockPassword = faker.internet.password();
cy.get("form").should("be.visible");
cy.get("button")
.contains("No Account? Sign up")
.should("be.visible")
.click();
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("input[name='fullName']").type("John Doe");
cy.get("button[type='submit']").contains("Signup").click();
cy.wait("@signup").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("signup request did not return a response");
}
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
expect(interception.response.statusCode).to.eq(200);
});
});
describe.only("Authentication", () => {
beforeEach(() => {
cy.visit("/");
mockSignup();
mockLogin();
});
it("should login", () => {
cy.intercept("POST", "/api/token").as("token");
it("should register", () => {
cy.get("button").contains("Login").click();
cy.url().should("include", "/login");
cy.visit("/").then(() => {
cy.get("button")
.contains("Login")
.click()
.then(() => {
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("button[type='submit']").contains("Login").click();
cy.get("form").should("be.visible");
cy.wait("@token").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("token request did not return a response");
}
cy.get("button")
.contains("No Account? Sign up")
.should("be.visible")
.click();
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
cy.get("input[name='email']").type(mockEmail);
cy.get("input[name='password']").type(mockPassword);
cy.get("input[name='fullName']").type(faker.person.fullName());
cy.get("button[type='submit']").contains("Signup").click();
expect(interception.response.statusCode).to.eq(200);
});
});
cy.wait("@signup").then((interception) => {
if (!interception.response) {
throw new Error("signup request did not return a response");
}
expect(interception.response.statusCode).to.eq(200);
});
});
});
it("should login", () => {
cy.intercept("POST", "/api/token").as("token");
cy.visit("/").then(() => {
cy.get("button")
.contains("Login")
.click()
.then(() => {
cy.get("input[name='email']").type(mockEmail);
cy.get("input[name='password']").type(mockPassword);
cy.get("button[type='submit']").contains("Login").click();
cy.wait("@token").then((interception) => {
if (!interception.response) {
throw new Error("token request did not return a response");
}
expect(interception.response.statusCode).to.eq(200);
});
});
});
});

cypress/e2e/chat.cy.ts (new file, 34 lines)
View File

@@ -0,0 +1,34 @@
import { login } from "../utilities/authentication.utils";
import {
cleanUpJobs,
selectJobFromSelector,
submitBasicJob,
waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockLogin } from "../utilities/mocks";
describe.only("Chat", () => {
beforeEach(() => {
mockLogin();
login();
cy.visit("/");
});
afterEach(() => {
cleanUpJobs();
});
it.only("should be able to chat", () => {
const url = "https://books.toscrape.com";
submitBasicJob(url, "test", "//body");
waitForJobCompletion(url);
cy.visit("/chat");
selectJobFromSelector();
cy.get("[data-cy='message-input']").type("Hello");
cy.get("[data-cy='send-message']").click();
cy.get("[data-cy='ai-message']").should("exist");
});
});

View File

@@ -1,34 +1,37 @@
import { login } from "../utilities/authentication.utils";
import {
addElement,
cleanUpJobs,
enterJobUrl,
submitJob,
waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockSubmitJob } from "../utilities/mocks";
describe.only("Job", () => {
it("should create a job", () => {
cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
beforeEach(() => {
mockSubmitJob();
login();
cy.visit("/");
});
cy.get('[data-cy="url-input"]').type("https://example.com");
cy.get('[data-cy="name-field"]').type("example");
cy.get('[data-cy="xpath-field"]').type("//body");
cy.get('[data-cy="add-button"]').click();
afterEach(() => {
cleanUpJobs();
});
cy.contains("Submit").click();
it("should create a job", () => {
enterJobUrl("https://books.toscrape.com");
addElement("body", "//body");
submitJob();
cy.wait("@submitScrapeJob").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
cy.log("Request body: " + JSON.stringify(interception.request?.body));
throw new Error("submitScrapeJob request did not return a response");
}
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
expect(interception.response.statusCode).to.eq(200);
});
cy.get("li").contains("Jobs").click();
cy.contains("div", "https://example.com", { timeout: 10000 }).should(
"exist"
);
cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
waitForJobCompletion("https://books.toscrape.com");
});
});

View File

@@ -14,7 +14,7 @@
// ***********************************************************
// Import commands.js using ES2015 syntax:
import './commands'
import "./commands";
// Alternatively you can use CommonJS syntax:
// require('./commands')
// require('./commands')

View File

@@ -0,0 +1,68 @@
export const signup = () => {
cy.intercept("POST", "/api/token").as("token");
cy.visit("/").then(() => {
cy.get("button").contains("Login").click();
cy.url().should("include", "/login");
cy.get("form").should("be.visible");
cy.get("button")
.contains("No Account? Sign up")
.should("be.visible")
.click();
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("input[name='fullName']").type("John Doe");
cy.get("button[type='submit']").contains("Signup").click();
cy.wait("@token").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("token request did not return a response");
}
});
});
};
export const login = () => {
cy.intercept("POST", "/api/token").as("token");
cy.intercept("GET", "/api/me").as("me");
cy.intercept("GET", "/api/check").as("check");
cy.visit("/").then(() => {
cy.get("body").then(() => {
cy.get("button")
.contains("Login")
.click()
.then(() => {
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("button[type='submit']").contains("Login").click();
cy.wait("@token").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("token request did not return a response");
}
});
cy.wait("@me").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("me request did not return a response");
}
});
cy.wait("@check").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("check request did not return a response");
}
});
cy.url().should("not.include", "/login");
});
});
});
};

View File

@@ -0,0 +1,187 @@
export const cleanUpJobs = () => {
cy.intercept("POST", "/api/retrieve").as("retrieve");
cy.visit("/jobs");
cy.wait("@retrieve", { timeout: 15000 });
cy.get("tbody tr", { timeout: 20000 }).should("have.length.at.least", 1);
const tryClickSelectAll = (attempt = 1, maxAttempts = 5) => {
cy.log(`Attempt ${attempt} to click Select All`);
cy.get('[data-testid="select-all"]')
.closest("button")
.then(($btn) => {
// Retry if button is disabled
if ($btn.is(":disabled") || $btn.css("pointer-events") === "none") {
if (attempt < maxAttempts) {
cy.wait(1000).then(() =>
tryClickSelectAll(attempt + 1, maxAttempts)
);
} else {
throw new Error(
"Select All button is still disabled after max retries"
);
}
} else {
// Click and then verify if checkbox is checked
cy.wrap($btn)
.click({ force: true })
.then(() => {
cy.get("tbody tr")
.first()
.find("td")
.first()
.find("input[type='checkbox']")
.should("be.checked")
.then(() => {
cy.log("Select All successful");
});
});
// Handle failure case
cy.on("fail", () => {
cy.log("Error clicking Select All");
if (attempt < maxAttempts) {
cy.wait(1000).then(() =>
tryClickSelectAll(attempt + 1, maxAttempts)
);
} else {
throw new Error(
"Checkbox was never checked after clicking Select All"
);
}
return false; // Prevent Cypress from failing the test
});
}
});
};
tryClickSelectAll();
cy.get('[data-testid="DeleteIcon"]', { timeout: 10000 })
.closest("button")
.should("not.be.disabled")
.click();
};
export const submitBasicJob = (url: string, name: string, xpath: string) => {
cy.get('[data-cy="url-input"]').type(url);
cy.get('[data-cy="name-field"]').type(name);
cy.get('[data-cy="xpath-field"]').type(xpath);
cy.get('[data-cy="add-button"]').click();
cy.contains("Submit").click();
};
export const waitForJobCompletion = (url: string) => {
cy.intercept("POST", "/api/retrieve").as("retrieve");
cy.visit("/jobs");
cy.wait("@retrieve", { timeout: 30000 });
cy.contains("div", url, { timeout: 30000 }).should("exist");
const checkJobStatus = () => {
cy.get("[data-testid='job-status']", { timeout: 120000 }).then(($el) => {
const status = $el.text().toLowerCase().trim();
if (status.includes("completed")) {
return true;
} else if (status.includes("scraping") || status.includes("queued")) {
cy.wait(5000);
checkJobStatus();
} else {
throw new Error(`Unexpected job status: ${status}`);
}
});
};
checkJobStatus();
};
export const enableMultiPageScraping = () => {
cy.get("button").contains("Advanced Options").click();
cy.get('[data-cy="multi-page-toggle"]').click();
cy.get("body").type("{esc}");
};
export const addCustomHeaders = (headers: Record<string, string>) => {
cy.get("button").contains("Advanced Options").click();
cy.get('[name="custom_headers"]').type(JSON.stringify(headers), {
parseSpecialCharSequences: false,
});
cy.get("body").type("{esc}");
};
export const addCustomCookies = (cookies: Record<string, string>) => {
cy.get("button").contains("Advanced Options").click();
cy.get('[name="custom_cookies"]').type(JSON.stringify(cookies));
cy.get("body").type("{esc}");
};
export const openAdvancedJobOptions = () => {
cy.get("button").contains("Advanced Options").click();
};
export const selectJobFromSelector = () => {
checkAiDisabled();
cy.get("div[id='select-job']", { timeout: 10000 }).first().click();
cy.get("li[role='option']", { timeout: 10000 }).first().click();
};
export const addMedia = () => {
cy.get('[data-cy="collect-media-checkbox"]').click();
};
export const checkForMedia = () => {
cy.intercept("GET", "/api/media/get-media?id=**").as("getMedia");
cy.visit("/media");
selectJobFromSelector();
cy.wait("@getMedia", { timeout: 30000 });
};
export const addSiteMapAction = (
type: "click" | "input",
xpath: string,
input?: string
) => {
cy.get("button").contains("Create Site Map").click();
cy.get('[data-cy="site-map-select"]').select(type);
cy.get('[data-cy="site-map-xpath"]').type(xpath);
if (type === "input" && input) {
cy.get('[data-cy="site-map-input"]').type(input);
}
cy.get('[data-cy="add-site-map-action"]').click();
};
export const addElement = (name: string, xpath: string) => {
cy.get('[data-cy="name-field"]').type(name);
cy.get('[data-cy="xpath-field"]').type(xpath);
cy.get('[data-cy="add-button"]').click();
};
export const checkAiDisabled = () => {
cy.getAllLocalStorage().then((result) => {
const storage = JSON.parse(
result["http://localhost"]["persist:root"] as string
);
const settings = JSON.parse(storage.settings);
expect(settings.aiEnabled).to.equal(true);
});
};
export const buildAgentJob = (url: string, prompt: string) => {
checkAiDisabled();
enterJobUrl(url);
cy.get("[data-cy='prompt-input']").type(prompt);
};
export const submitJob = () => {
cy.get("button").contains("Submit").click();
};
export const enterJobUrl = (url: string) => {
cy.get('[data-cy="url-input"]').type(url);
};

View File

@@ -0,0 +1,15 @@
export const mockSubmitJob = () => {
cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
};
export const mockToken = () => {
cy.intercept("POST", "/api/token").as("token");
};
export const mockSignup = () => {
cy.intercept("POST", "/api/signup").as("signup");
};
export const mockLogin = () => {
cy.intercept("POST", "/api/token").as("token");
};

View File

@@ -0,0 +1 @@
export * from "./authentication.utils";

View File

@@ -1,6 +1,9 @@
version: "3"
services:
scraperr:
build:
context: .
dockerfile: docker/frontend/Dockerfile
command: ["npm", "run", "dev"]
volumes:
- "$PWD/src:/app/src"
@@ -10,7 +13,12 @@ services:
- "$PWD/package-lock.json:/app/package-lock.json"
- "$PWD/tsconfig.json:/app/tsconfig.json"
scraperr_api:
build:
context: .
dockerfile: docker/api/Dockerfile
environment:
- LOG_LEVEL=INFO
volumes:
- "$PWD/api:/project/app/api"
ports:
- "5900:5900"

View File

@@ -1,11 +1,6 @@
services:
scraperr:
depends_on:
- scraperr_api
image: jpyles0524/scraperr:latest
build:
context: .
dockerfile: docker/frontend/Dockerfile
container_name: scraperr
command: ["npm", "run", "start"]
environment:
@@ -18,11 +13,9 @@ services:
scraperr_api:
init: True
image: jpyles0524/scraperr_api:latest
build:
context: .
dockerfile: docker/api/Dockerfile
environment:
- LOG_LEVEL=INFO
- OPENAI_KEY=${OPENAI_KEY}
container_name: scraperr_api
ports:
- 8000:8000

View File

@@ -3,7 +3,7 @@ FROM python:3.10.12-slim as pybuilder
RUN apt-get update && \
apt-get install -y curl && \
apt-get install -y uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 && \
apt-get install -y x11vnc xvfb uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 ffmpeg pkg-config default-libmysqlclient-dev gcc && \
curl -LsSf https://astral.sh/uv/install.sh | sh && \
apt-get remove -y curl && \
apt-get autoremove -y && \
@@ -14,7 +14,8 @@ RUN pdm config python.use_venv false
WORKDIR /project/app
COPY pyproject.toml pdm.lock /project/app/
RUN pdm install
RUN pdm install -v --frozen-lockfile
RUN pdm run playwright install --with-deps
@@ -30,7 +31,15 @@ EXPOSE 8000
WORKDIR /project/app
RUN mkdir -p /project/app/media
RUN mkdir -p /project/app/data
RUN touch /project/app/data/database.db
EXPOSE 5900
COPY alembic /project/app/alembic
COPY alembic.ini /project/app/alembic.ini
COPY start.sh /project/app/start.sh
CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ]

View File

@@ -1,10 +1,14 @@
# Build next dependencies
FROM node:23.1
FROM node:23.1-slim
WORKDIR /app
COPY package*.json ./
RUN npm install
# Copy package files first to leverage Docker cache
COPY package.json yarn.lock ./
# Install dependencies in a separate layer
RUN yarn install --frozen-lockfile --network-timeout 600000
# Copy the rest of the application
COPY tsconfig.json /app/tsconfig.json
COPY tailwind.config.js /app/tailwind.config.js
COPY next.config.mjs /app/next.config.mjs
@@ -13,6 +17,7 @@ COPY postcss.config.js /app/postcss.config.js
COPY public /app/public
COPY src /app/src
RUN npm run build
# Build the application
RUN yarn build
EXPOSE 3000

Binary image file changed (not shown): 47 KiB before, 67 KiB after.

View File

@@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.0.13
version: 1.1.6
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to

next-env.d.ts (vendored, 2 lines)
View File

@@ -2,4 +2,4 @@
/// <reference types="next/image-types/global" />
// NOTE: This file should not be edited
// see https://nextjs.org/docs/basic-features/typescript for more information.
// see https://nextjs.org/docs/pages/building-your-application/configuring/typescript for more information.

package-lock.json (generated, 11371 lines): diff suppressed because it is too large.

View File

@@ -12,9 +12,11 @@
"@minchat/react-chat-ui": "^0.16.2",
"@mui/icons-material": "^5.15.3",
"@mui/material": "^5.16.0",
"@reduxjs/toolkit": "^2.8.2",
"@testing-library/jest-dom": "^5.16.5",
"@testing-library/react": "^13.4.0",
"@testing-library/user-event": "^13.5.0",
"@types/react": "^18.3.21",
"axios": "^1.7.2",
"bootstrap": "^5.3.0",
"chart.js": "^4.4.3",
@@ -30,16 +32,19 @@
"react-dom": "^18.3.1",
"react-markdown": "^9.0.0",
"react-modal-image": "^2.6.0",
"react-redux": "^9.2.0",
"react-router": "^6.14.1",
"react-router-dom": "^6.14.1",
"react-spinners": "^0.14.1",
"react-toastify": "^11.0.5",
"redux-persist": "^6.0.0",
"typescript": "^4.9.5",
"web-vitals": "^2.1.4"
},
"scripts": {
"dev": "next dev",
"build": "next build",
"start": "next start",
"dev": "yarn next dev",
"build": "yarn next build",
"start": "yarn next start",
"serve": "serve -s ./dist",
"cy:open": "cypress open",
"cy:run": "cypress run"
@@ -63,6 +68,7 @@
]
},
"devDependencies": {
"@faker-js/faker": "^9.8.0",
"@types/cypress": "^1.1.6",
"@types/js-cookie": "^3.0.6",
"autoprefixer": "^10.4.21",

pdm.lock (generated, 1494 lines): diff suppressed because it is too large.

View File

@@ -12,7 +12,7 @@ dependencies = [
"asyncio>=3.4.3",
"aiohttp>=3.9.5",
"bs4>=0.0.2",
"lxml[html_clean]>=5.2.2",
"lxml>=5.2.2",
"lxml-stubs>=0.5.1",
"fake-useragent>=1.5.1",
"requests-html>=0.10.0",
@@ -24,7 +24,6 @@ dependencies = [
"python-keycloak>=4.2.0",
"fastapi-keycloak>=1.0.11",
"pymongo>=4.8.0",
"motor[asyncio]>=3.5.0",
"python-jose[cryptography]>=3.3.0",
"passlib[bcrypt]>=1.7.4",
"selenium-wire>=5.1.0",
@@ -41,6 +40,16 @@ dependencies = [
"apscheduler>=3.11.0",
"playwright>=1.52.0",
"camoufox>=0.4.11",
"html2text>=2025.4.15",
"proxy-py>=2.4.10",
"browserforge==1.2.1",
"sqlalchemy>=2.0.41",
"aiosqlite>=0.21.0",
"alembic>=1.16.4",
"asyncpg>=0.30.0",
"aiomysql>=0.2.0",
"psycopg2-binary>=2.9.10",
"mysqlclient>=2.2.7",
]
requires-python = ">=3.10"
readme = "README.md"
@@ -97,9 +106,9 @@ strictSetInference = true
[tool.isort]
length_sort = "1"
length_sort = true
profile = "black"
sections = "STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER"
sections = ["STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"]
import_heading_stdlib = "STL"
import_heading_thirdparty = "PDM"
import_heading_firstparty = "LOCAL"
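These isort settings are what produce the grouped import headers visible throughout the backend files in this diff; a representative layout under this configuration looks like the following (module names taken from elsewhere in the diff).
# STL
import logging
# PDM
from fastapi import FastAPI
# LOCAL
from api.backend.job.models import Element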

Some files were not shown because too many files have changed in this diff.