Mirror of https://github.com/jaypyles/Scraperr.git (synced 2025-11-16 14:16:12 +00:00)

Compare commits (66 commits)
Commit SHAs (author and date columns were not captured in this view):
44ccad1935, 308759d70c, 6bf130dd4b, 875a3684c9, b096fb1b3c, 5f65125882, 327db34683, 8d0f362a70,
24f4b57fea, 1c0dec6db6, e9c60f6338, 5719a85491, 052d80de07, 7047a3c0e3, 71f603fc62, 86a77a27df,
b11e263b93, 91dc13348d, 93b0c83381, 9381ba9232, 20dccc5527, 02619eb184, 58c6c09fc9, bf896b4c6b,
e3b9c11ab7, 32da3375b3, b5131cbe4c, 47c4c9a7d1, 4352988666, 00759151e6, bfae00ca72, e810700569,
9857fa96e0, b52fbc538d, 42c0f3ae79, 9aab2f9b4f, e182d3e4b8, 53f35989f5, a67ab34cfa, 3bf6657191,
c38d19a0ca, a53e7e1aa1, 84368b1f6d, ce4c1ceaa7, 7e1ce58bb8, 175e7d63bf, d2c06de247, e0159bf9d4,
6d574ddfd2, b089d72786, 9ee4d577fd, cddce5164d, bf3163bfba, 54b513e92c, 6c56f2f161, d4edb9d93e,
5ebd96b62b, d602d3330a, 6639e8b48f, 263e46ba4d, f815a58efc, 50ec5df657, 28de0f362c, 6b33723cac,
5c89e4d7d2, ed0828a585
.dockerignore (new file, 4 lines)
@@ -0,0 +1,4 @@
node_modules
npm-debug.log
Dockerfile
.dockerignore

.github/actions/push-to-helm/action.yaml (8 changed lines)
@@ -5,6 +5,9 @@ inputs:
  app-repo-token:
    required: true
    description: "The token for the target repository"
  version:
    required: true
    description: "The version of the Helm chart"

runs:
  using: 'composite'
@@ -15,6 +18,11 @@ runs:
    - name: Set up Helm
      uses: azure/setup-helm@v3

    - name: Update Helm chart version
      run: |
        sed -i "s/^version: .*/version: ${{ inputs.version }}/" helm/Chart.yaml
      shell: bash

    - name: Package Helm chart
      run: |
        mkdir -p packaged

.github/actions/run-cypress-tests/action.yaml (28 changed lines)
@@ -2,6 +2,13 @@ name: Run Cypress Tests

description: Run Cypress tests

inputs:
  openai_key:
    description: "OpenAI API key"
    required: true
    default: ""

runs:
  using: "composite"
  steps:
@@ -13,13 +20,25 @@ runs:
      with:
        node-version: 22

    - name: Setup yarn
      shell: bash
      run: npm install -g yarn

    - name: Install xvfb for headless testing
      shell: bash
      run: |
        sudo apt-get update
        sudo apt-get install -y xvfb libnss3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libasound2t64 libpango-1.0-0 libcairo2 libgtk-3-0 libgdk-pixbuf2.0-0 libx11-6 libx11-xcb1 libxcb1 libxss1 libxtst6 libnspr4

    - name: Setup Docker project
      shell: bash
      run: make build up-dev
      run: |
        export OPENAI_KEY="${{ inputs.openai_key }}"
        make build-ci up-ci

    - name: Install dependencies
      shell: bash
      run: npm install
      run: yarn install

    - name: Wait for frontend to be ready
      shell: bash
@@ -54,5 +73,8 @@ runs:
    - name: Run Cypress tests
      shell: bash
      run: npm run cy:run
      run: |
        set -e
        npm run cy:run

.github/workflows/cypress-tests.yml (new file, 31 lines)
@@ -0,0 +1,31 @@
name: Cypress Tests

on:
  workflow_call:
    secrets:
      openai_key:
        required: true

jobs:
  cypress-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Run Cypress Tests
        id: run-tests
        uses: ./.github/actions/run-cypress-tests
        with:
          openai_key: ${{ secrets.openai_key }}

      - name: Check container logs on failure
        if: steps.run-tests.conclusion == 'failure'
        run: |
          echo "Cypress tests failed. Dumping container logs..."
          docker logs scraperr_api || true

      - name: Fail job if Cypress failed
        if: steps.run-tests.conclusion == 'failure'
        run: exit 1

.github/workflows/docker-image.yml (54 changed lines)
@@ -1,24 +1,36 @@
name: Docker Image
on:
  workflow_run:
    workflows: ["Unit Tests"]
    types:
      - completed
  workflow_dispatch:
  workflow_call:
    inputs:
      version:
        required: true
        type: string
    secrets:
      dockerhub_username:
        required: true
      dockerhub_token:
        required: true
      repo_token:
        required: true
      discord_webhook_url:
        required: true

jobs:
  build:
    if: ${{ github.event.workflow_run.conclusion == 'success' && github.ref == 'refs/heads/master' }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Get version from helm chart
      - name: Echo version
        run: |
          VERSION=$(grep "version:" ./helm/Chart.yaml | cut -d: -f2 | tr -d ' ')
          echo "VERSION=$VERSION" >> $GITHUB_ENV
          echo "Version is $VERSION"
          echo "Version is ${{ inputs.version }}"

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Docker Hub
        uses: docker/login-action@v3
@@ -26,28 +38,27 @@ jobs:
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build and push frontend
      - name: Build and push frontend (multi-arch)
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./docker/frontend/Dockerfile
          push: true
          platforms: linux/amd64,linux/arm64
          tags: |
            ${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:latest
            ${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:${{ env.VERSION }}
            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:latest
            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:${{ inputs.version }}

      - name: Build and push api
      - name: Build and push api (multi-arch)
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./docker/api/Dockerfile
          push: true
          platforms: linux/amd64,linux/arm64
          tags: |
            ${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:latest
            ${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:${{ env.VERSION }}
            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest
            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:${{ inputs.version }}

  push-helm-chart:
    runs-on: ubuntu-latest
@@ -59,7 +70,8 @@ jobs:
      - name: Push Helm Chart
        uses: ./.github/actions/push-to-helm
        with:
          app-repo-token: ${{ secrets.GPAT_TOKEN }}
          app-repo-token: ${{ secrets.repo_token }}
          version: ${{ inputs.version }}

  success-message:
    runs-on: ubuntu-latest
@@ -71,7 +83,7 @@ jobs:
        uses: jaypyles/discord-webhook-action@v1.0.0
        with:
          webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }}
          content: "Scraperr Successfully Built Docker Images"
          content: "Scraperr Successfully Built Docker Images (v${{ inputs.version }})"
          username: "Scraperr CI"
          embed-title: "✅ Deployment Status"
          embed-description: "Scraperr successfully built docker images."

.github/workflows/merge.yml (new file, 35 lines)
@@ -0,0 +1,35 @@
name: Merge

on:
  push:
    branches:
      - master
  pull_request:
    types: [closed]
    branches:
      - master

jobs:
  # TODO: Re-enable once browser forge is fixed for camoufox, or else tests will never pass
  # tests:
  #   uses: ./.github/workflows/tests.yml
  #   secrets:
  #     openai_key: ${{ secrets.OPENAI_KEY }}
  #     discord_webhook_url: ${{ secrets.DISCORD_WEBHOOK_URL }}

  version:
    uses: ./.github/workflows/version.yml
    secrets:
      git_token: ${{ secrets.GPAT_TOKEN }}

  build-and-deploy:
    if: needs.version.outputs.version_bump == 'true'
    needs: version
    uses: ./.github/workflows/docker-image.yml
    secrets:
      dockerhub_username: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerhub_token: ${{ secrets.DOCKERHUB_TOKEN }}
      repo_token: ${{ secrets.GPAT_TOKEN }}
      discord_webhook_url: ${{ secrets.DISCORD_WEBHOOK_URL }}
    with:
      version: ${{ needs.version.outputs.version }}

.github/workflows/pr.yml (new file, 15 lines)
@@ -0,0 +1,15 @@
name: PR

on:
  pull_request:
    branches:
      - master
    types: [opened, synchronize, reopened]
  workflow_dispatch:

jobs:
  tests:
    uses: ./.github/workflows/tests.yml
    secrets:
      openai_key: ${{ secrets.OPENAI_KEY }}
      discord_webhook_url: ${{ secrets.DISCORD_WEBHOOK_URL }}

.github/workflows/pytest.yml (new file, 29 lines)
@@ -0,0 +1,29 @@
name: Pytest

on:
  workflow_call:

jobs:
  unit-tests:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - uses: actions/setup-node@v3

      - name: Set env
        run: echo "ENV=test" >> $GITHUB_ENV

      - name: Install pdm
        run: pip install pdm

      - name: Install project dependencies
        run: pdm install

      - name: Install playwright
        run: pdm run playwright install --with-deps

      - name: Run tests
        run: PYTHONPATH=. pdm run pytest -v -ra api/backend/tests

.github/workflows/tests.yml (new file, 42 lines)
@@ -0,0 +1,42 @@
name: Reusable PR Tests

on:
  workflow_call:
    secrets:
      openai_key:
        required: true
      discord_webhook_url:
        required: true

jobs:
  pytest:
    uses: ./.github/workflows/pytest.yml

  cypress-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Run Cypress Tests
        uses: ./.github/actions/run-cypress-tests
        with:
          openai_key: ${{ secrets.openai_key }}

  success-message:
    runs-on: ubuntu-latest
    needs:
      - pytest
      - cypress-tests
    steps:
      - name: Send Discord Message
        uses: jaypyles/discord-webhook-action@v1.0.0
        with:
          webhook-url: ${{ secrets.discord_webhook_url }}
          content: "Scraperr Successfully Passed Tests"
          username: "Scraperr CI"
          embed-title: "✅ Deployment Status"
          embed-description: "Scraperr successfully passed all tests."
          embed-color: 3066993
          embed-footer-text: "Scraperr CI"
          embed-timestamp: ${{ github.event.head_commit.timestamp }}

.github/workflows/unit-tests.yml (deleted, 57 lines)
@@ -1,57 +0,0 @@
name: Unit Tests

on:
  push:
    branches:
      - master

  pull_request:
    types: [opened, synchronize, reopened]

  workflow_dispatch:

jobs:
  unit-tests:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set env
        run: echo "ENV=test" >> $GITHUB_ENV

      - name: Install pdm
        run: pip install pdm

      - name: Install project dependencies
        run: pdm install

      - name: Install playwright
        run: pdm run playwright install

      - name: Run tests
        run: PYTHONPATH=. pdm run pytest api/backend/tests

  cypress-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/run-cypress-tests

  success-message:
    runs-on: ubuntu-latest
    needs:
      - unit-tests
      - cypress-tests
    steps:
      - name: Send Discord Message
        uses: jaypyles/discord-webhook-action@v1.0.0
        with:
          webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }}
          content: "Scraperr Successfully Passed Tests"
          username: "Scraperr CI"
          embed-title: "✅ Deployment Status"
          embed-description: "Scraperr successfully passed all tests."
          embed-color: 3066993 # Green
          embed-footer-text: "Scraperr CI"
          embed-timestamp: ${{ github.event.head_commit.timestamp }}

.github/workflows/version.yml (new file, 89 lines)
@@ -0,0 +1,89 @@
name: Version

on:
  workflow_call:
    secrets:
      git_token:
        required: true
    outputs:
      version:
        description: "The new version number"
        value: ${{ jobs.version.outputs.version }}
      version_bump:
        description: "Whether the version was bumped"
        value: ${{ jobs.version.outputs.version_bump }}

jobs:
  version:
    runs-on: ubuntu-latest

    outputs:
      version: ${{ steps.set_version.outputs.version }}
      version_bump: ${{ steps.check_version_bump.outputs.version_bump }}

    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Get version bump
        id: get_version_type
        run: |
          COMMIT_MSG=$(git log -1 --pretty=%B)

          if [[ $COMMIT_MSG =~ ^feat\(breaking\) ]]; then
            VERSION_TYPE="major"
          elif [[ $COMMIT_MSG =~ ^feat! ]]; then
            VERSION_TYPE="minor"
          elif [[ $COMMIT_MSG =~ ^(feat|fix|chore): ]]; then
            VERSION_TYPE="patch"
          else
            VERSION_TYPE="patch"
          fi

          echo "VERSION_TYPE=$VERSION_TYPE" >> $GITHUB_ENV

      - name: Check for version bump
        id: check_version_bump
        run: |
          COMMIT_MSG=$(git log -1 --pretty=%B)

          if [[ $COMMIT_MSG =~ .*\[no\ bump\].* ]]; then
            echo "version_bump=false" >> $GITHUB_OUTPUT
          else
            echo "version_bump=true" >> $GITHUB_OUTPUT
          fi

      - name: Skip version bump
        if: steps.check_version_bump.outputs.version_bump == 'false'
        run: |
          echo "Skipping version bump as requested"
          gh run cancel ${{ github.run_id }}
          exit 0
        env:
          GITHUB_TOKEN: ${{ secrets.git_token }}

      - name: Set version
        if: steps.check_version_bump.outputs.version_bump != 'false'
        id: set_version
        run: |
          VERSION=$(./scripts/version.sh "$VERSION_TYPE")
          echo "VERSION=$VERSION" >> $GITHUB_ENV
          echo "Version is $VERSION"
          echo "version=$VERSION" >> $GITHUB_OUTPUT
        env:
          VERSION_TYPE: ${{ env.VERSION_TYPE }}

      - name: Update chart file
        if: steps.check_version_bump.outputs.version_bump != 'false'
        run: |
          sed -i "s/^version: .*/version: $VERSION/" helm/Chart.yaml

          git config --local user.email "github-actions[bot]@users.noreply.github.com"
          git config --local user.name "github-actions[bot]"
          git add helm/Chart.yaml
          git commit -m "chore: bump version to $VERSION"
          git push
        env:
          VERSION: ${{ env.VERSION }}

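The bump type above is derived from the latest commit message with bash regexes (scripts/version.sh itself is not shown in this diff). Purely as an illustrative sketch of the same mapping, not the project's actual implementation:

import re

def bump_type(commit_msg: str) -> str | None:
    """Mirror of the workflow's commit-message rules (illustrative sketch)."""
    if re.search(r"\[no bump\]", commit_msg):
        return None          # workflow sets version_bump=false and cancels the run
    if commit_msg.startswith("feat(breaking)"):
        return "major"
    if commit_msg.startswith("feat!"):
        return "minor"
    if re.match(r"^(feat|fix|chore):", commit_msg):
        return "patch"
    return "patch"           # fallback, same as the workflow's else branch

print(bump_type("feat: add agent mode"))    # patch
print(bump_type("feat(breaking) new API"))  # major
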
.gitignore (16 changed lines)
@@ -188,4 +188,18 @@ postgres_data
.vscode
ollama
data
media

media/images
media/videos
media/audio
media/pdfs
media/spreadsheets
media/presentations
media/documents
media/recordings
media/download_summary.txt

cypress/screenshots
cypress/videos

docker-compose.dev.local.yml

Makefile (15 changed lines)
@@ -1,6 +1,6 @@
.DEFAULT_GOAL := help

COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.yml
COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.local.yml
COMPOSE_PROD = docker compose -f docker-compose.yml

.PHONY: help deps build pull up up-dev down setup deploy
@@ -17,6 +17,7 @@ help:
	@echo " make down - Stop and remove containers, networks, images, and volumes"
	@echo " make setup - Setup server with dependencies and clone repo"
	@echo " make deploy - Deploy site onto server"
	@echo " make cypress-start - Start Cypress"
	@echo ""

logs:
@@ -51,3 +52,15 @@ setup:

deploy:
	ansible-playbook -i ./ansible/inventory.yaml ./ansible/deploy_site.yaml -v

build-ci:
	docker compose -f docker-compose.yml -f docker-compose.dev.yml build

up-ci:
	docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d --force-recreate

cypress-start:
	DISPLAY=:0 npx cypress open

cypress-run:
	npx cypress run

README.md (changed lines)
@@ -13,7 +13,7 @@

## 📋 Overview

Scraperr enables you to extract data from websites with precision using XPath selectors. This self-hosted application provides a clean interface to manage scraping jobs, view results, and export data.
Scrape websites without writing a single line of code.

> 📚 **[Check out the docs](https://scraperr-docs.pages.dev)** for a comprehensive quickstart guide and detailed information.

@@ -29,7 +29,7 @@ Scraperr enables you to extract data from websites with precision using XPath se
- **Custom Headers**: Add JSON headers to your scraping requests
- **Media Downloads**: Automatically download images, videos, and other media
- **Results Visualization**: View scraped data in a structured table format
- **Data Export**: Export your results in various formats
- **Data Export**: Export your results in markdown and CSV formats
- **Notification Channels**: Send completion notifications through various channels

## 🚀 Getting Started

alembic.ini (new file, 147 lines)
@@ -0,0 +1,147 @@
# A generic, single database configuration.

[alembic]
# path to migration scripts.
# this is typically a path given in POSIX (e.g. forward slashes)
# format, relative to the token %(here)s which refers to the location of this
# ini file
script_location = %(here)s/alembic

# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# Uncomment the line below if you want the files to be prepended with date and time
# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
# for all available tokens
# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s

# sys.path path, will be prepended to sys.path if present.
# defaults to the current working directory. for multiple paths, the path separator
# is defined by "path_separator" below.
prepend_sys_path = .

# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires the python>=3.9 or backports.zoneinfo library and tzdata library.
# Any required deps can installed by adding `alembic[tz]` to the pip requirements
# string value is passed to ZoneInfo()
# leave blank for localtime
# timezone =

# max length of characters to apply to the "slug" field
# truncate_slug_length = 40

# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false

# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false

# version location specification; This defaults
# to <script_location>/versions. When using multiple version
# directories, initial revisions must be specified with --version-path.
# The path separator used here should be the separator specified by "path_separator"
# below.
# version_locations = %(here)s/bar:%(here)s/bat:%(here)s/alembic/versions

# path_separator; This indicates what character is used to split lists of file
# paths, including version_locations and prepend_sys_path within configparser
# files such as alembic.ini.
# The default rendered in new alembic.ini files is "os", which uses os.pathsep
# to provide os-dependent path splitting.
#
# Note that in order to support legacy alembic.ini files, this default does NOT
# take place if path_separator is not present in alembic.ini. If this
# option is omitted entirely, fallback logic is as follows:
#
# 1. Parsing of the version_locations option falls back to using the legacy
#    "version_path_separator" key, which if absent then falls back to the legacy
#    behavior of splitting on spaces and/or commas.
# 2. Parsing of the prepend_sys_path option falls back to the legacy
#    behavior of splitting on spaces, commas, or colons.
#
# Valid values for path_separator are:
#
# path_separator = :
# path_separator = ;
# path_separator = space
# path_separator = newline
#
# Use os.pathsep. Default configuration used for new projects.
path_separator = os

# set to 'true' to search source files recursively
# in each "version_locations" directory
# new in Alembic version 1.10
# recursive_version_locations = false

# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8

# database URL. This is consumed by the user-maintained env.py script only.
# other means of configuring database URLs may be customized within the env.py
# file.
sqlalchemy.url = driver://user:pass@localhost/dbname


[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples

# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks = black
# black.type = console_scripts
# black.entrypoint = black
# black.options = -l 79 REVISION_SCRIPT_FILENAME

# lint with attempts to fix using "ruff" - use the module runner, against the "ruff" module
# hooks = ruff
# ruff.type = module
# ruff.module = ruff
# ruff.options = check --fix REVISION_SCRIPT_FILENAME

# Alternatively, use the exec runner to execute a binary found on your PATH
# hooks = ruff
# ruff.type = exec
# ruff.executable = ruff
# ruff.options = check --fix REVISION_SCRIPT_FILENAME

# Logging configuration. This is also consumed by the user-maintained
# env.py script only.
[loggers]
keys = root,sqlalchemy,alembic

[handlers]
keys = console

[formatters]
keys = generic

[logger_root]
level = WARNING
handlers = console
qualname =

[logger_sqlalchemy]
level = WARNING
handlers =
qualname = sqlalchemy.engine

[logger_alembic]
level = INFO
handlers =
qualname = alembic

[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic

[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S

alembic/README (new file, 1 line)
@@ -0,0 +1 @@
Generic single-database configuration.

alembic/env.py (new file, 103 lines)
@@ -0,0 +1,103 @@
# STL
import os
import sys
from logging.config import fileConfig

# PDM
from dotenv import load_dotenv
from sqlalchemy import pool, engine_from_config

# LOCAL
from alembic import context
from api.backend.database.base import Base
from api.backend.database.models import Job, User, CronJob  # type: ignore

load_dotenv()

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "api")))

# Load the raw async database URL
raw_database_url = os.getenv("DATABASE_URL", "sqlite+aiosqlite:///data/database.db")

# Map async dialects to sync ones
driver_downgrade_map = {
    "sqlite+aiosqlite": "sqlite",
    "postgresql+asyncpg": "postgresql",
    "mysql+aiomysql": "mysql",
}

# Extract scheme and convert if async
for async_driver, sync_driver in driver_downgrade_map.items():
    if raw_database_url.startswith(async_driver + "://"):
        sync_database_url = raw_database_url.replace(async_driver, sync_driver, 1)
        break
else:
    # No async driver detected — assume it's already sync
    sync_database_url = raw_database_url


# Apply it to Alembic config
config = context.config
config.set_main_option("sqlalchemy.url", sync_database_url)

# Interpret the config file for Python logging.
# This line sets up loggers basically.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
target_metadata = Base.metadata


def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    This configures the context with just a URL
    and not an Engine, though an Engine is acceptable
    here as well. By skipping the Engine creation
    we don't even need a DBAPI to be available.

    Calls to context.execute() here emit the given string to the
    script output.

    """
    url = config.get_main_option("sqlalchemy.url")
    context.configure(
        url=url,
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    In this scenario we need to create an Engine
    and associate a connection with the context.

    """
    connectable = engine_from_config(
        config.get_section(config.config_ini_section, {}),
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with connectable.connect() as connection:
        context.configure(connection=connection, target_metadata=target_metadata)

        with context.begin_transaction():
            context.run_migrations()


if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()

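The driver_downgrade_map above rewrites the async SQLAlchemy URL the app uses into a sync one that Alembic can run against. A small standalone sketch of that mapping; the sqlite URL is the default shown in env.py, while the Postgres URL is only a hypothetical example:

def downgrade_url(url: str) -> str:
    # Same prefix substitution as alembic/env.py: swap the async driver for its sync twin.
    mapping = {
        "sqlite+aiosqlite": "sqlite",
        "postgresql+asyncpg": "postgresql",
        "mysql+aiomysql": "mysql",
    }
    for async_driver, sync_driver in mapping.items():
        if url.startswith(async_driver + "://"):
            return url.replace(async_driver, sync_driver, 1)
    return url  # already sync

print(downgrade_url("sqlite+aiosqlite:///data/database.db"))        # sqlite:///data/database.db
print(downgrade_url("postgresql+asyncpg://user:pass@db/scraperr"))  # postgresql://... (hypothetical URL)
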
alembic/script.py.mako (new file, 28 lines)
@@ -0,0 +1,28 @@
"""${message}

Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
${imports if imports else ""}

# revision identifiers, used by Alembic.
revision: str = ${repr(up_revision)}
down_revision: Union[str, Sequence[str], None] = ${repr(down_revision)}
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}


def upgrade() -> None:
    """Upgrade schema."""
    ${upgrades if upgrades else "pass"}


def downgrade() -> None:
    """Downgrade schema."""
    ${downgrades if downgrades else "pass"}

alembic/versions/6aa921d2e637_initial_revision.py (new file, 67 lines)
@@ -0,0 +1,67 @@
"""initial revision

Revision ID: 6aa921d2e637
Revises:
Create Date: 2025-07-12 20:17:44.448034

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = '6aa921d2e637'
down_revision: Union[str, Sequence[str], None] = None
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Upgrade schema."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table('users',
        sa.Column('email', sa.String(length=255), nullable=False),
        sa.Column('hashed_password', sa.String(length=255), nullable=False),
        sa.Column('full_name', sa.String(length=255), nullable=True),
        sa.Column('disabled', sa.Boolean(), nullable=True),
        sa.PrimaryKeyConstraint('email')
    )
    op.create_table('jobs',
        sa.Column('id', sa.String(length=64), nullable=False),
        sa.Column('url', sa.String(length=2048), nullable=False),
        sa.Column('elements', sa.JSON(), nullable=False),
        sa.Column('user', sa.String(length=255), nullable=True),
        sa.Column('time_created', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
        sa.Column('result', sa.JSON(), nullable=False),
        sa.Column('status', sa.String(length=50), nullable=False),
        sa.Column('chat', sa.JSON(), nullable=True),
        sa.Column('job_options', sa.JSON(), nullable=True),
        sa.Column('agent_mode', sa.Boolean(), nullable=False),
        sa.Column('prompt', sa.String(length=1024), nullable=True),
        sa.Column('favorite', sa.Boolean(), nullable=False),
        sa.ForeignKeyConstraint(['user'], ['users.email'], ),
        sa.PrimaryKeyConstraint('id')
    )
    op.create_table('cron_jobs',
        sa.Column('id', sa.String(length=64), nullable=False),
        sa.Column('user_email', sa.String(length=255), nullable=False),
        sa.Column('job_id', sa.String(length=64), nullable=False),
        sa.Column('cron_expression', sa.String(length=255), nullable=False),
        sa.Column('time_created', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
        sa.Column('time_updated', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
        sa.ForeignKeyConstraint(['job_id'], ['jobs.id'], ),
        sa.ForeignKeyConstraint(['user_email'], ['users.email'], ),
        sa.PrimaryKeyConstraint('id')
    )
    # ### end Alembic commands ###


def downgrade() -> None:
    """Downgrade schema."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table('cron_jobs')
    op.drop_table('jobs')
    op.drop_table('users')
    # ### end Alembic commands ###

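Assuming the alembic.ini added above sits at the repository root and the data/ directory exists, this initial schema would normally be applied with `alembic upgrade head`; a sketch of the equivalent programmatic call:

from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")   # path assumed relative to the repo root
command.upgrade(cfg, "head")  # runs 6aa921d2e637 and any later revisions
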
api/backend/ai/agent/actions.py (new file, 6 lines)
@@ -0,0 +1,6 @@
from typing_extensions import TypedDict


class Action(TypedDict):
    type: str
    url: str

api/backend/ai/agent/agent.py (new file, 96 lines)
@@ -0,0 +1,96 @@
# STL
import random
from typing import Any

# PDM
from camoufox import AsyncCamoufox
from playwright.async_api import Page

# LOCAL
from api.backend.constants import RECORDINGS_ENABLED
from api.backend.ai.clients import ask_ollama, ask_open_ai, open_ai_key
from api.backend.job.models import CapturedElement
from api.backend.worker.logger import LOG
from api.backend.ai.agent.utils import (
    parse_response,
    capture_elements,
    convert_to_markdown,
)
from api.backend.ai.agent.prompts import (
    EXTRACT_ELEMENTS_PROMPT,
    ELEMENT_EXTRACTION_PROMPT,
)
from api.backend.job.scraping.add_custom import add_custom_items
from api.backend.job.scraping.collect_media import collect_media

ask_ai = ask_open_ai if open_ai_key else ask_ollama


async def scrape_with_agent(agent_job: dict[str, Any]):
    LOG.info(f"Starting work for agent job: {agent_job}")
    pages = set()

    proxy = None

    if agent_job["job_options"]["proxies"]:
        proxy = random.choice(agent_job["job_options"]["proxies"])
        LOG.info(f"Using proxy: {proxy}")

    async with AsyncCamoufox(headless=not RECORDINGS_ENABLED, proxy=proxy) as browser:
        page: Page = await browser.new_page()

        await add_custom_items(
            agent_job["url"],
            page,
            agent_job["job_options"]["custom_cookies"],
            agent_job["job_options"]["custom_headers"],
        )

        try:
            await page.set_viewport_size({"width": 1920, "height": 1080})
            await page.goto(agent_job["url"], timeout=60000)

            if agent_job["job_options"]["collect_media"]:
                await collect_media(agent_job["id"], page)

            html_content = await page.content()
            markdown_content = convert_to_markdown(html_content)

            response = await ask_ai(
                ELEMENT_EXTRACTION_PROMPT.format(
                    extraction_prompt=EXTRACT_ELEMENTS_PROMPT,
                    webpage=markdown_content,
                    prompt=agent_job["prompt"],
                )
            )

            xpaths = parse_response(response)

            captured_elements = await capture_elements(
                page, xpaths, agent_job["job_options"].get("return_html", False)
            )

            final_url = page.url

            pages.add((html_content, final_url))
        finally:
            await page.close()
            await browser.close()

    name_to_elements = {}

    for page in pages:
        for element in captured_elements:
            if element.name not in name_to_elements:
                name_to_elements[element.name] = []

            name_to_elements[element.name].append(element)

    scraped_elements: list[dict[str, dict[str, list[CapturedElement]]]] = [
        {
            page[1]: name_to_elements,
        }
        for page in pages
    ]

    return scraped_elements

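For orientation, a hedged sketch of how scrape_with_agent might be called. The dict keys mirror the fields the function reads above, but the concrete values (id, URL, prompt) are made up for illustration; the real job payload is built elsewhere in the app:

import asyncio

from api.backend.ai.agent.agent import scrape_with_agent

example_job = {
    "id": "job-123",                      # hypothetical job id
    "url": "https://example.com",         # hypothetical target page
    "prompt": "Grab the article titles",  # natural-language extraction prompt
    "job_options": {
        "proxies": [],            # optional proxy list; one is chosen at random if present
        "custom_cookies": [],     # forwarded to add_custom_items
        "custom_headers": {},
        "collect_media": False,   # when True, media is downloaded via collect_media
        "return_html": False,     # when True, capture_elements returns inner HTML
    },
}

results = asyncio.run(scrape_with_agent(example_job))
print(results)  # [{final_url: {element_name: [CapturedElement, ...]}}]
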
api/backend/ai/agent/prompts.py (new file, 58 lines)
@@ -0,0 +1,58 @@
EXTRACT_ELEMENTS_PROMPT = """
You are an assistant that extracts XPath expressions from webpages.

You will receive HTML content in markdown format.

Each element in the markdown has their xpath shown above them in a path like:
<!-- //div -->

Respond only with a list of general XPath expressions inside `<xpaths>...</xpaths>` tags.

You will also decide the decision of what to do next. If there is no decision available, return nothing for that section.
"""

ELEMENT_EXTRACTION_PROMPT = """
{extraction_prompt}

**Guidelines:**
- Prefer shorter, more general XPaths like `//div[...]` or `//span[...]`.
- Avoid overly specific or deep paths like `//div[3]/ul/li[2]/a`.
- Do **not** chain multiple elements deeply (e.g., `//div/span/a`).
- Use XPaths further down the tree when possible.
- Do not include any extra explanation or text.
- One XPath is acceptable if that's all that's needed.
- Try and limit it down to 1 - 3 xpaths.
- Include a name for each xpath.

<important>
- USE THE MOST SIMPLE XPATHS POSSIBLE.
- USE THE MOST GENERAL XPATHS POSSIBLE.
- USE THE MOST SPECIFIC XPATHS POSSIBLE.
- USE THE MOST GENERAL XPATHS POSSIBLE.
</important>

**Example Format:**
```xml
<xpaths>
- <name: insert_name_here>: <xpath: //div>
- <name: insert_name_here>: <xpath: //span>
- <name: insert_name_here>: <xpath: //span[contains(@text, 'example')]>
- <name: insert_name_here>: <xpath: //div[contains(@text, 'example')]>
- <name: insert_name_here>: <xpath: //a[@href]>
- etc
</xpaths>

<decision>
<next_page>
- //a[@href='next_page_url']
</next_page>
</decision>
```

**Input webpage:**
{webpage}

**Target content:**
{prompt}

"""

api/backend/ai/agent/utils.py (new file, 272 lines)
@@ -0,0 +1,272 @@
# STL
import re

# PDM
from lxml import html, etree
from playwright.async_api import Page

# LOCAL
from api.backend.job.models import CapturedElement
from api.backend.job.utils.text_utils import clean_text


def convert_to_markdown(html_str: str):
    parser = html.HTMLParser()
    tree = html.fromstring(html_str, parser=parser)
    root = tree.getroottree()

    def format_attributes(el: etree._Element) -> str:
        """Convert element attributes into a string."""
        return " ".join(f'{k}="{v}"' for k, v in el.attrib.items())

    def is_visible(el: etree._Element) -> bool:
        style = el.attrib.get("style", "").lower()
        class_ = el.attrib.get("class", "").lower()

        # Check for visibility styles
        if "display: none" in style or "visibility: hidden" in style:
            return False
        if "opacity: 0" in style or "opacity:0" in style:
            return False
        if "height: 0" in style or "width: 0" in style:
            return False

        # Check for common hidden classes
        if any(
            hidden in class_
            for hidden in ["hidden", "invisible", "truncate", "collapse"]
        ):
            return False

        # Check for hidden attributes
        if el.attrib.get("hidden") is not None:
            return False
        if el.attrib.get("aria-hidden") == "true":
            return False

        # Check for empty or whitespace-only content
        if not el.text and len(el) == 0:
            return False

        return True

    def is_layout_or_decorative(el: etree._Element) -> bool:
        tag = el.tag.lower()

        # Layout elements
        if tag in {"nav", "footer", "header", "aside", "main", "section"}:
            return True

        # Decorative elements
        if tag in {"svg", "path", "circle", "rect", "line", "polygon", "polyline"}:
            return True

        # Check id and class for layout/decorative keywords
        id_class = " ".join(
            [el.attrib.get("id", ""), el.attrib.get("class", "")]
        ).lower()

        layout_keywords = {
            "sidebar",
            "nav",
            "header",
            "footer",
            "menu",
            "advert",
            "ads",
            "breadcrumb",
            "container",
            "wrapper",
            "layout",
            "grid",
            "flex",
            "row",
            "column",
            "section",
            "banner",
            "hero",
            "card",
            "modal",
            "popup",
            "tooltip",
            "dropdown",
            "overlay",
        }

        return any(keyword in id_class for keyword in layout_keywords)

    # Tags that are kept in the final markdown output
    included_tags = {
        "div",
        "span",
        "a",
        "p",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "img",
        "button",
        "input",
        "textarea",
        "ul",
        "ol",
        "li",
        "table",
        "tr",
        "td",
        "th",
        "input",
        "textarea",
        "select",
        "option",
        "optgroup",
        "fieldset",
        "legend",
    }

    special_elements = []
    normal_elements = []

    for el in tree.iter():
        if el.tag is etree.Comment:
            continue

        tag = el.tag.lower()

        if tag not in included_tags:
            continue

        if not is_visible(el):
            continue

        if is_layout_or_decorative(el):
            continue

        path = root.getpath(el)
        attrs = format_attributes(el)
        attrs_str = f" {attrs}" if attrs else ""
        text = el.text.strip() if el.text else ""

        if not text and not attrs:
            continue

        # input elements
        if tag == "button":
            prefix = "🔘 **<button>**"
            special_elements.append(f"<!-- {path} -->\n{prefix} {text}")
        elif tag == "a":
            href = el.attrib.get("href", "")
            prefix = f"🔗 **<a href='{href}'>**"
            special_elements.append(f"<!-- {path} -->\n{prefix} {text}")
        elif tag == "input":
            input_type = el.attrib.get("type", "text")
            prefix = f"📝 **<input type='{input_type}'>**"
            special_elements.append(f"<!-- {path} -->\n{prefix}")
        else:
            prefix = f"**<{tag}{attrs_str}>**"

            if text:
                normal_elements.append(f"<!-- {path} -->\n{prefix} {text}")

    return "\n\n".join(normal_elements + special_elements)  # type: ignore


def parse_response(text: str) -> list[dict[str, str]]:
    xpaths = re.findall(r"<xpaths>(.*?)</xpaths>", text, re.DOTALL)
    results = []

    if xpaths:
        lines = xpaths[0].strip().splitlines()
        for line in lines:
            if line.strip().startswith("-"):
                name = re.findall(r"<name: (.*?)>", line)[0]
                xpath = re.findall(r"<xpath: (.*?)>", line)[0]
                results.append({"name": name, "xpath": xpath})
            else:
                results.append({"name": "", "xpath": line.strip()})

    return results


def parse_next_page(text: str) -> str | None:
    next_page = re.findall(r"<next_page>(.*?)</next_page>", text, re.DOTALL)

    if next_page:
        lines = next_page[0].strip().splitlines()
        next_page = [
            line.strip().lstrip("-").strip()
            for line in lines
            if line.strip().startswith("-")
        ]

    return next_page[0] if next_page else None


async def capture_elements(
    page: Page, xpaths: list[dict[str, str]], return_html: bool
) -> list[CapturedElement]:
    captured_elements = []
    seen_texts = set()

    for xpath in xpaths:
        try:
            locator = page.locator(f"xpath={xpath['xpath']}")
            count = await locator.count()

            for i in range(count):
                if return_html:
                    element_text = (
                        await page.locator(f"xpath={xpath['xpath']}")
                        .nth(i)
                        .inner_html()
                    )

                    seen_texts.add(element_text)
                    captured_elements.append(
                        CapturedElement(
                            name=xpath["name"],
                            text=element_text,
                            xpath=xpath["xpath"],
                        )
                    )
                    continue

                element_text = ""

                element_handle = await locator.nth(i).element_handle()

                if not element_handle:
                    continue

                link = await element_handle.get_attribute("href") or ""

                text = await element_handle.text_content()

                if text:
                    element_text += text

                if link:
                    element_text += f" ({link})"

                cleaned = clean_text(element_text)

                if cleaned in seen_texts:
                    continue

                seen_texts.add(cleaned)

                captured_elements.append(
                    CapturedElement(
                        name=xpath["name"],
                        text=cleaned,
                        xpath=xpath["xpath"],
                    )
                )

        except Exception as e:
            print(f"Error processing xpath {xpath}: {e}")

    return captured_elements

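A quick check of the response-parsing helpers above, assuming the repo's package is importable (PYTHONPATH=.); the model reply shown here is a made-up example in the format the prompt asks for:

from api.backend.ai.agent.utils import parse_response, parse_next_page

sample_reply = """
<xpaths>
- <name: titles>: <xpath: //h2>
- <name: links>: <xpath: //a[@href]>
</xpaths>

<decision>
<next_page>
- //a[@rel='next']
</next_page>
</decision>
"""

print(parse_response(sample_reply))
# [{'name': 'titles', 'xpath': '//h2'}, {'name': 'links', 'xpath': '//a[@href]'}]
print(parse_next_page(sample_reply))
# //a[@rel='next']
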
api/backend/ai/ai_router.py (changed lines)
@@ -1,32 +1,28 @@
# STL
import os
import logging
from collections.abc import Iterable, AsyncGenerator

# PDM
from openai import OpenAI
from ollama import Message
from fastapi import APIRouter
from fastapi.responses import JSONResponse, StreamingResponse
from openai.types.chat import ChatCompletionMessageParam

# LOCAL
from ollama import Message, AsyncClient
from api.backend.models import AI
from api.backend.ai.clients import (
    llama_model,
    open_ai_key,
    llama_client,
    open_ai_model,
    openai_client,
)
from api.backend.ai.schemas import AI
from api.backend.routers.handle_exceptions import handle_exceptions

LOG = logging.getLogger(__name__)
LOG = logging.getLogger("AI")

ai_router = APIRouter()

# Load environment variables
open_ai_key = os.getenv("OPENAI_KEY")
open_ai_model = os.getenv("OPENAI_MODEL")
llama_url = os.getenv("OLLAMA_URL")
llama_model = os.getenv("OLLAMA_MODEL")

# Initialize clients
openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None
llama_client = AsyncClient(host=llama_url) if llama_url else None


async def llama_chat(chat_messages: list[Message]) -> AsyncGenerator[str, None]:
    if llama_client and llama_model:
@@ -67,6 +63,7 @@ chat_function = llama_chat if llama_client else openai_chat


@ai_router.post("/ai")
@handle_exceptions(logger=LOG)
async def ai(c: AI):
    return StreamingResponse(
        chat_function(chat_messages=c.messages), media_type="text/plain"
@@ -74,5 +71,6 @@ async def ai(c: AI):


@ai_router.get("/ai/check")
@handle_exceptions(logger=LOG)
async def check():
    return JSONResponse(content={"ai_enabled": bool(open_ai_key or llama_model)})

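The /ai route above streams plain text back from the configured chat function. A hedged client sketch; the base URL is a placeholder (adjust to wherever the Scraperr API is exposed), and the message dicts follow the usual role/content shape both the OpenAI and Ollama clients accept:

import httpx

BASE_URL = "http://localhost:8000/api"  # hypothetical address for the API

payload = {"messages": [{"role": "user", "content": "Summarize the last scrape"}]}

with httpx.stream("POST", f"{BASE_URL}/ai", json=payload, timeout=None) as response:
    for chunk in response.iter_text():
        print(chunk, end="")  # the endpoint streams text/plain as it is generated
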
api/backend/ai/clients.py (new file, 39 lines)
@@ -0,0 +1,39 @@
# STL
import os

# PDM
from ollama import AsyncClient
from openai import OpenAI

# Load environment variables
open_ai_key = os.getenv("OPENAI_KEY")
open_ai_model = os.getenv("OPENAI_MODEL")
llama_url = os.getenv("OLLAMA_URL")
llama_model = os.getenv("OLLAMA_MODEL")

# Initialize clients
openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None
llama_client = AsyncClient(host=llama_url) if llama_url else None


async def ask_open_ai(prompt: str) -> str:
    if not openai_client:
        raise ValueError("OpenAI client not initialized")

    response = openai_client.chat.completions.create(
        model=open_ai_model or "gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content or ""


async def ask_ollama(prompt: str) -> str:
    if not llama_client:
        raise ValueError("Ollama client not initialized")

    response = await llama_client.chat(
        model=llama_model or "", messages=[{"role": "user", "content": prompt}]
    )

    return response.message.content or ""

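The agent module picks between these two helpers based on whether OPENAI_KEY is set. A minimal usage sketch, assuming the relevant environment variables are exported before the module is imported:

import asyncio

from api.backend.ai.clients import ask_open_ai, ask_ollama, open_ai_key

# Same selection rule as api/backend/ai/agent/agent.py.
ask_ai = ask_open_ai if open_ai_key else ask_ollama

answer = asyncio.run(ask_ai("Reply with the single word: pong"))
print(answer)
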
api/backend/ai/schemas/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
# LOCAL
from .ai import AI

__all__ = ["AI"]

api/backend/ai/schemas/ai.py (new file, 9 lines)
@@ -0,0 +1,9 @@
# STL
from typing import Any

# PDM
import pydantic


class AI(pydantic.BaseModel):
    messages: list[Any]

(FastAPI application module; file header not shown in this view)
@@ -1,39 +1,57 @@
# STL
import os
import logging
import apscheduler  # type: ignore
from contextlib import asynccontextmanager

# PDM
import apscheduler.schedulers
import apscheduler.schedulers.background
from fastapi import FastAPI, Request, status
from fastapi.responses import JSONResponse
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware

# LOCAL
from api.backend.ai.ai_router import ai_router
from api.backend.auth.auth_router import auth_router
from api.backend.utils import get_log_level
from api.backend.routers.job_router import job_router
from api.backend.routers.stats_router import stats_router
from api.backend.database.startup import init_database
from fastapi.responses import JSONResponse

from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler
from api.backend.scheduler import scheduler
from api.backend.ai.ai_router import ai_router
from api.backend.job.job_router import job_router
from api.backend.auth.auth_router import auth_router
from api.backend.stats.stats_router import stats_router
from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler

log_level = os.getenv("LOG_LEVEL")
LOG_LEVEL = get_log_level(log_level)

logging.basicConfig(
    level=LOG_LEVEL,
    format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
    format="%(levelname)s: %(asctime)s - [%(name)s] - %(message)s",
    handlers=[logging.StreamHandler()],
)

LOG = logging.getLogger(__name__)

app = FastAPI(title="api", root_path="/api")

@asynccontextmanager
async def lifespan(_: FastAPI):
    # Startup
    LOG.info("Starting application...")

    LOG.info("Starting cron scheduler...")
    await start_cron_scheduler(scheduler)
    scheduler.start()

    LOG.info("Cron scheduler started successfully")

    yield

    # Shutdown
    LOG.info("Shutting down application...")
    LOG.info("Stopping cron scheduler...")
    scheduler.shutdown(wait=False)  # Set wait=False to not block shutdown
    LOG.info("Cron scheduler stopped")
    LOG.info("Application shutdown complete")


app = FastAPI(title="api", root_path="/api", lifespan=lifespan)

app.add_middleware(
    CORSMiddleware,
@@ -43,28 +61,12 @@ app.add_middleware(
    allow_headers=["*"],
)


app.include_router(auth_router)
app.include_router(ai_router)
app.include_router(job_router)
app.include_router(stats_router)


@app.on_event("startup")
async def startup_event():
    start_cron_scheduler(scheduler)
    scheduler.start()

    if os.getenv("ENV") != "test":
        init_database()
    LOG.info("Starting up...")


@app.on_event("shutdown")
def shutdown_scheduler():
    scheduler.shutdown(wait=False)  # Set wait=False to not block shutdown


@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
    exc_str = f"{exc}".replace("\n", " ").replace(" ", " ")

api/backend/auth/auth_router.py (changed lines)
@@ -1,13 +1,16 @@
# STL
from datetime import timedelta
import os
import logging
from datetime import timedelta

# PDM
from fastapi import Depends, APIRouter, HTTPException, status
from fastapi.security import OAuth2PasswordRequestForm
from sqlalchemy.ext.asyncio import AsyncSession

# LOCAL
from api.backend.schemas import User, Token, UserCreate
from api.backend.auth.schemas import User, Token, UserCreate
from api.backend.database.base import AsyncSessionLocal, get_db
from api.backend.auth.auth_utils import (
    ACCESS_TOKEN_EXPIRE_MINUTES,
    get_current_user,
@@ -15,18 +18,19 @@ from api.backend.auth.auth_utils import (
    get_password_hash,
    create_access_token,
)
import logging

from api.backend.database.common import update
from api.backend.database.models import User as DatabaseUser
from api.backend.routers.handle_exceptions import handle_exceptions

auth_router = APIRouter()

LOG = logging.getLogger("auth_router")
LOG = logging.getLogger("Auth")


@auth_router.post("/auth/token", response_model=Token)
async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
    user = await authenticate_user(form_data.username, form_data.password)
@handle_exceptions(logger=LOG)
async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(), db: AsyncSession = Depends(get_db)):
    user = await authenticate_user(form_data.username, form_data.password, db)

    if not user:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
@@ -47,23 +51,37 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(


@auth_router.post("/auth/signup", response_model=User)
@handle_exceptions(logger=LOG)
async def create_user(user: UserCreate):
    hashed_password = get_password_hash(user.password)
    user_dict = user.model_dump()
    user_dict["hashed_password"] = hashed_password
    del user_dict["password"]

    query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)"
    _ = update(query, (user_dict["email"], hashed_password, user_dict["full_name"]))
    async with AsyncSessionLocal() as session:
        new_user = DatabaseUser(
            email=user.email,
            hashed_password=user_dict["hashed_password"],
            full_name=user.full_name,
        )

        session.add(new_user)
        await session.commit()

    return user_dict


@auth_router.get("/auth/users/me", response_model=User)
@handle_exceptions(logger=LOG)
async def read_users_me(current_user: User = Depends(get_current_user)):
    return current_user


@auth_router.get("/auth/check")
@handle_exceptions(logger=LOG)
async def check_auth():
    return {"registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True"}
    return {
        "registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True",
        "recordings_enabled": os.environ.get("RECORDINGS_ENABLED", "true").lower()
        == "true",
    }

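Putting the signup and token endpoints above together, a hedged client-side sketch: the base URL and credentials are placeholders, the token endpoint takes standard OAuth2 form fields because it uses OAuth2PasswordRequestForm, and the Token schema is assumed to expose the usual access_token field:

import httpx

BASE_URL = "http://localhost:8000/api"  # hypothetical; use the deployed API address

with httpx.Client(base_url=BASE_URL) as client:
    # Register a user (JSON body matching the UserCreate schema).
    client.post(
        "/auth/signup",
        json={"email": "user@example.com", "password": "s3cret", "full_name": "Example User"},
    )

    # Exchange credentials for a bearer token (form-encoded, per OAuth2PasswordRequestForm).
    token = client.post(
        "/auth/token",
        data={"username": "user@example.com", "password": "s3cret"},
    ).json()["access_token"]

    # Authenticated request against the current-user endpoint.
    me = client.get("/auth/users/me", headers={"Authorization": f"Bearer {token}"})
    print(me.json())
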
api/backend/auth/auth_utils.py (changed lines)
@@ -1,22 +1,24 @@
# STL
import os
import logging
from typing import Any, Optional
from datetime import datetime, timedelta
import logging

# PDM
from jose import JWTError, jwt
from dotenv import load_dotenv
from fastapi import Depends, HTTPException, status
from sqlalchemy import select
from passlib.context import CryptContext
from fastapi.security import OAuth2PasswordBearer
from sqlalchemy.ext.asyncio import AsyncSession

# LOCAL
from api.backend.schemas import User, UserInDB, TokenData
from api.backend.auth.schemas import User, UserInDB, TokenData
from api.backend.database.base import get_db
from api.backend.database.models import User as UserModel

from api.backend.database.common import query

LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Auth")

_ = load_dotenv()

@@ -38,18 +40,24 @@ def get_password_hash(password: str):
    return pwd_context.hash(password)


async def get_user(email: str):
    user_query = "SELECT * FROM users WHERE email = ?"
    user = query(user_query, (email,))[0]
async def get_user(session: AsyncSession, email: str) -> UserInDB | None:
    stmt = select(UserModel).where(UserModel.email == email)
    result = await session.execute(stmt)
    user = result.scalars().first()

    if not user:
        return
        return None

    return UserInDB(**user)
    return UserInDB(
        email=str(user.email),
        hashed_password=str(user.hashed_password),
        full_name=str(user.full_name),
        disabled=bool(user.disabled),
    )


async def authenticate_user(email: str, password: str):
    user = await get_user(email)
async def authenticate_user(email: str, password: str, db: AsyncSession):
    user = await get_user(db, email)

    if not user:
        return False
@@ -75,7 +83,9 @@ def create_access_token(
    return encoded_jwt


async def get_current_user(token: str = Depends(oauth2_scheme)):
async def get_current_user(
    db: AsyncSession = Depends(get_db), token: str = Depends(oauth2_scheme)
):
    LOG.debug(f"Getting current user with token: {token}")

    if not token:
@@ -83,7 +93,7 @@ async def get_current_user(token: str = Depends(oauth2_scheme)):
        return EMPTY_USER

    if len(token.split(".")) != 3:
        LOG.error(f"Malformed token: {token}")
        LOG.debug(f"Malformed token: {token}")
        return EMPTY_USER

    try:
@@ -118,14 +128,15 @@ async def get_current_user(token: str = Depends(oauth2_scheme)):
        LOG.error(f"Exception occurred: {e}")
        return EMPTY_USER

    user = await get_user(email=token_data.email)
    user = await get_user(db, email=token_data.email or "")

    if user is None:
        return EMPTY_USER

    return user


async def require_user(token: str = Depends(oauth2_scheme)):
async def require_user(db: AsyncSession, token: str = Depends(oauth2_scheme)):
    credentials_exception = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Could not validate credentials",
@@ -136,6 +147,7 @@ async def require_user(token: str = Depends(oauth2_scheme)):
        payload: Optional[dict[str, Any]] = jwt.decode(
            token, SECRET_KEY, algorithms=[ALGORITHM]
        )

        if not payload:
            raise credentials_exception

@@ -149,7 +161,7 @@ async def require_user(token: str = Depends(oauth2_scheme)):
    except JWTError:
        raise credentials_exception

    user = await get_user(email=token_data.email)
    user = await get_user(db, email=token_data.email or "")

    if user is None:
        raise credentials_exception

api/backend/auth/schemas/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
# LOCAL
from .auth import User, Token, UserInDB, TokenData, UserCreate

__all__ = ["User", "Token", "UserInDB", "TokenData", "UserCreate"]
@@ -1 +1,24 @@
DATABASE_PATH = "data/database.db"
# STL
import os
from pathlib import Path

DATABASE_URL = os.getenv("DATABASE_URL", "sqlite+aiosqlite:///data/database.db")
RECORDINGS_DIR = Path("media/recordings")
RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
MEDIA_DIR = Path("media")
MEDIA_TYPES = [
"audio",
"documents",
"images",
"pdfs",
"presentations",
"spreadsheets",
"videos",
]

REGISTRATION_ENABLED = os.getenv("REGISTRATION_ENABLED", "true").lower() == "true"
DEFAULT_USER_EMAIL = os.getenv("DEFAULT_USER_EMAIL")
DEFAULT_USER_PASSWORD = os.getenv("DEFAULT_USER_PASSWORD")
DEFAULT_USER_FULL_NAME = os.getenv("DEFAULT_USER_FULL_NAME")

LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

@@ -1,3 +0,0 @@
from .common import insert, QUERIES, update

__all__ = ["insert", "QUERIES", "update"]
api/backend/database/base.py (new file, 26 lines)
@@ -0,0 +1,26 @@
# STL
from typing import AsyncGenerator

# PDM
from sqlalchemy.orm import declarative_base
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine

# LOCAL
from api.backend.constants import DATABASE_URL

engine = create_async_engine(DATABASE_URL, echo=False, future=True)

AsyncSessionLocal = async_sessionmaker(
bind=engine,
autoflush=False,
autocommit=False,
expire_on_commit=False,
class_=AsyncSession,
)

Base = declarative_base()

async def get_db() -> AsyncGenerator[AsyncSession, None]:
async with AsyncSessionLocal() as session:
yield session
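Note: the new get_db generator above is meant to be consumed as a FastAPI dependency so each request gets its own async session. A minimal usage sketch, not part of this diff (the route path and query are illustrative assumptions):

# Hypothetical usage sketch -- not part of the diff above.
from fastapi import APIRouter, Depends
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from api.backend.database.base import get_db
from api.backend.database.models import Job

example_router = APIRouter()

@example_router.get("/example/jobs")
async def list_jobs(db: AsyncSession = Depends(get_db)):
    # The session is opened per request and closed when the generator exits.
    result = await db.execute(select(Job).limit(10))
    return [job.id for job in result.scalars().all()]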
@@ -1,92 +0,0 @@
import sqlite3
from typing import Any, Optional
from api.backend.constants import DATABASE_PATH
from api.backend.utils import format_json, format_sql_row_to_python
from api.backend.database.schema import INIT_QUERY
from api.backend.database.queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
import logging

LOG = logging.getLogger(__name__)

def connect():
connection = sqlite3.connect(DATABASE_PATH)
connection.set_trace_callback(print)
cursor = connection.cursor()
return cursor

def insert(query: str, values: tuple[Any, ...]):
connection = sqlite3.connect(DATABASE_PATH)
cursor = connection.cursor()
copy = list(values)
format_json(copy)

try:
_ = cursor.execute(query, copy)
connection.commit()
except sqlite3.Error as e:
LOG.error(f"An error occurred: {e}")
finally:
cursor.close()
connection.close()

def query(query: str, values: Optional[tuple[Any, ...]] = None):
connection = sqlite3.connect(DATABASE_PATH)
connection.row_factory = sqlite3.Row
cursor = connection.cursor()
rows = []
try:
if values:
_ = cursor.execute(query, values)
else:
_ = cursor.execute(query)

rows = cursor.fetchall()

finally:
cursor.close()
connection.close()

formatted_rows: list[dict[str, Any]] = []

for row in rows:
row = dict(row)
formatted_row = format_sql_row_to_python(row)
formatted_rows.append(formatted_row)

return formatted_rows

def update(query: str, values: Optional[tuple[Any, ...]] = None):
connection = sqlite3.connect(DATABASE_PATH)
cursor = connection.cursor()

copy = None

if values:
copy = list(values)
format_json(copy)

try:
if copy:
res = cursor.execute(query, copy)
else:
res = cursor.execute(query)
connection.commit()
return res.rowcount
except sqlite3.Error as e:
LOG.error(f"An error occurred: {e}")
finally:
cursor.close()
connection.close()

return 0

QUERIES = {
"init": INIT_QUERY,
"insert_job": JOB_INSERT_QUERY,
"delete_job": DELETE_JOB_QUERY,
}
api/backend/database/models.py (new file, 65 lines)
@@ -0,0 +1,65 @@
# PDM
from sqlalchemy import JSON, Column, String, Boolean, DateTime, ForeignKey, func
from sqlalchemy.orm import relationship

# LOCAL
from api.backend.database.base import Base

class User(Base):
__tablename__ = "users"

email = Column(String(255), primary_key=True, nullable=False)
hashed_password = Column(String(255), nullable=False)
full_name = Column(String(255), nullable=True)
disabled = Column(Boolean, default=False)

jobs = relationship("Job", back_populates="user_obj", cascade="all, delete-orphan")
cron_jobs = relationship(
"CronJob", back_populates="user_obj", cascade="all, delete-orphan"
)

class Job(Base):
__tablename__ = "jobs"

id = Column(String(64), primary_key=True, nullable=False)
url = Column(String(2048), nullable=False)
elements = Column(JSON, nullable=False)
user = Column(String(255), ForeignKey("users.email"), nullable=True)
time_created = Column(
DateTime(timezone=True), server_default=func.now(), nullable=False
)
result = Column(JSON, nullable=False)
status = Column(String(50), nullable=False)
chat = Column(JSON, nullable=True)
job_options = Column(JSON, nullable=True)
agent_mode = Column(Boolean, default=False, nullable=False)
prompt = Column(String(1024), nullable=True)
favorite = Column(Boolean, default=False, nullable=False)

user_obj = relationship("User", back_populates="jobs")
cron_jobs = relationship(
"CronJob", back_populates="job_obj", cascade="all, delete-orphan"
)

class CronJob(Base):
__tablename__ = "cron_jobs"

id = Column(String(64), primary_key=True, nullable=False)
user_email = Column(String(255), ForeignKey("users.email"), nullable=False)
job_id = Column(String(64), ForeignKey("jobs.id"), nullable=False)
cron_expression = Column(String(255), nullable=False)
time_created = Column(
DateTime(timezone=True), server_default=func.now(), nullable=False
)
time_updated = Column(
DateTime(timezone=True),
server_default=func.now(),
onupdate=func.now(),
nullable=False,
)

user_obj = relationship("User", back_populates="cron_jobs")
job_obj = relationship("Job", back_populates="cron_jobs")
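Note: with these declarative models the old hand-written SQL is replaced by ORM access through the async session. A minimal sketch of fetching a user by its primary key (illustrative only; the email value is an assumption):

# Hypothetical usage sketch -- not part of the diff above.
import asyncio

from api.backend.database.base import AsyncSessionLocal
from api.backend.database.models import User

async def show_user(email: str) -> None:
    async with AsyncSessionLocal() as session:
        user = await session.get(User, email)  # email is the primary key
        print(user.full_name if user else "not found")

# asyncio.run(show_user("admin@example.com"))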
@@ -1,3 +0,0 @@
from .queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY

__all__ = ["JOB_INSERT_QUERY", "DELETE_JOB_QUERY"]
api/backend/database/queries/job/job_queries.py (new file, 72 lines)
@@ -0,0 +1,72 @@
# STL
import logging
from typing import Any

# PDM
from sqlalchemy import delete as sql_delete
from sqlalchemy import select
from sqlalchemy import update as sql_update

# LOCAL
from api.backend.database.base import AsyncSessionLocal
from api.backend.database.models import Job

LOG = logging.getLogger("Database")

async def insert_job(item: dict[str, Any]) -> None:
async with AsyncSessionLocal() as session:
job = Job(
id=item["id"],
url=item["url"],
elements=item["elements"],
user=item["user"],
time_created=item["time_created"],
result=item["result"],
status=item["status"],
chat=item["chat"],
job_options=item["job_options"],
agent_mode=item["agent_mode"],
prompt=item["prompt"],
)
session.add(job)
await session.commit()
LOG.info(f"Inserted item: {item}")

async def get_queued_job():
async with AsyncSessionLocal() as session:
stmt = (
select(Job)
.where(Job.status == "Queued")
.order_by(Job.time_created.desc())
.limit(1)
)
result = await session.execute(stmt)
job = result.scalars().first()
LOG.info(f"Got queued job: {job}")
return job

async def update_job(ids: list[str], updates: dict[str, Any]):
if not updates:
return

async with AsyncSessionLocal() as session:
stmt = sql_update(Job).where(Job.id.in_(ids)).values(**updates)
result = await session.execute(stmt)
await session.commit()
LOG.debug(f"Updated job count: {result.rowcount}")

async def delete_jobs(jobs: list[str]):
if not jobs:
LOG.info("No jobs to delete.")
return False

async with AsyncSessionLocal() as session:
stmt = sql_delete(Job).where(Job.id.in_(jobs))
result = await session.execute(stmt)
await session.commit()
LOG.info(f"Deleted jobs count: {result.rowcount}")
return result.rowcount
@@ -1,9 +0,0 @@
JOB_INSERT_QUERY = """
INSERT INTO jobs
(id, url, elements, user, time_created, result, status, chat, job_options)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
"""

DELETE_JOB_QUERY = """
DELETE FROM jobs WHERE id IN ()
"""
api/backend/database/queries/statistics/statistic_queries.py (new file, 43 lines)
@@ -0,0 +1,43 @@
# PDM
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession

# LOCAL
from api.backend.database.models import Job

async def average_elements_per_link(session: AsyncSession, user_email: str):
date_func = func.date(Job.time_created)

stmt = (
select(
date_func.label("date"),
func.avg(func.json_array_length(Job.elements)).label("average_elements"),
func.count().label("count"),
)
.where(Job.status == "Completed", Job.user == user_email)
.group_by(date_func)
.order_by("date")
)

result = await session.execute(stmt)
rows = result.all()
return [dict(row._mapping) for row in rows]

async def get_jobs_per_day(session: AsyncSession, user_email: str):
date_func = func.date(Job.time_created)

stmt = (
select(
date_func.label("date"),
func.count().label("job_count"),
)
.where(Job.status == "Completed", Job.user == user_email)
.group_by(date_func)
.order_by("date")
)

result = await session.execute(stmt)
rows = result.all()
return [dict(row._mapping) for row in rows]
@@ -1,3 +0,0 @@
from .schema import INIT_QUERY

__all__ = ["INIT_QUERY"]
@@ -1,30 +0,0 @@
INIT_QUERY = """
CREATE TABLE IF NOT EXISTS jobs (
id STRING PRIMARY KEY NOT NULL,
url STRING NOT NULL,
elements JSON NOT NULL,
user STRING,
time_created DATETIME NOT NULL,
result JSON NOT NULL,
status STRING NOT NULL,
chat JSON,
job_options JSON
);

CREATE TABLE IF NOT EXISTS users (
email STRING PRIMARY KEY NOT NULL,
hashed_password STRING NOT NULL,
full_name STRING,
disabled BOOLEAN
);

CREATE TABLE IF NOT EXISTS cron_jobs (
id STRING PRIMARY KEY NOT NULL,
user_email STRING NOT NULL,
job_id STRING NOT NULL,
cron_expression STRING NOT NULL,
time_created DATETIME NOT NULL,
time_updated DATETIME NOT NULL,
FOREIGN KEY (job_id) REFERENCES jobs(id)
);
"""
@@ -1,43 +1,56 @@
import os
from api.backend.database.common import connect, QUERIES, insert
# STL
import logging

# PDM
from sqlalchemy.exc import IntegrityError

# LOCAL
from api.backend.constants import (
DEFAULT_USER_EMAIL,
REGISTRATION_ENABLED,
DEFAULT_USER_PASSWORD,
DEFAULT_USER_FULL_NAME,
)
from api.backend.database.base import Base, AsyncSessionLocal, engine
from api.backend.auth.auth_utils import get_password_hash
from api.backend.database.models import User

LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Database")

async def init_database():
LOG.info("Creating database schema...")

def init_database():
cursor = connect()
async with engine.begin() as conn:
await conn.run_sync(Base.metadata.create_all)

for query in QUERIES["init"].strip().split(";"):
if query.strip():
LOG.info(f"Executing query: {query}")
_ = cursor.execute(query)
if not REGISTRATION_ENABLED:
default_user_email = DEFAULT_USER_EMAIL
default_user_password = DEFAULT_USER_PASSWORD
default_user_full_name = DEFAULT_USER_FULL_NAME

if os.environ.get("REGISTRATION_ENABLED", "True") == "False":
default_user_email = os.environ.get("DEFAULT_USER_EMAIL")
default_user_password = os.environ.get("DEFAULT_USER_PASSWORD")
default_user_full_name = os.environ.get("DEFAULT_USER_FULL_NAME")

if (
not default_user_email
or not default_user_password
or not default_user_full_name
):
LOG.error(
"DEFAULT_USER_EMAIL, DEFAULT_USER_PASSWORD, or DEFAULT_USER_FULL_NAME is not set!"
)
if not (default_user_email and default_user_password and default_user_full_name):
LOG.error("DEFAULT_USER_* env vars are not set!")
exit(1)

query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)"
_ = insert(
query,
(
default_user_email,
get_password_hash(default_user_password),
default_user_full_name,
),
)
async with AsyncSessionLocal() as session:
user = await session.get(User, default_user_email)
if user:
LOG.info("Default user already exists. Skipping creation.")
return

LOG.info("Creating default user...")
new_user = User(
email=default_user_email,
hashed_password=get_password_hash(default_user_password),
full_name=default_user_full_name,
disabled=False,
)

try:
session.add(new_user)
await session.commit()
LOG.info(f"Created default user: {default_user_email}")
except IntegrityError as e:
await session.rollback()
LOG.warning(f"Could not create default user (already exists?): {e}")

cursor.close()
api/backend/database/utils.py (new file, 37 lines)
@@ -0,0 +1,37 @@
# STL
import json
from typing import Any
from datetime import datetime

def format_list_for_query(ids: list[str]):
return (
f"({','.join(['?' for _ in ids])})"  # Returns placeholders, e.g., "(?, ?, ?)"
)

def format_sql_row_to_python(row: dict[str, Any]):
new_row: dict[str, Any] = {}
for key, value in row.items():
if isinstance(value, str):
try:
new_row[key] = json.loads(value)
except json.JSONDecodeError:
new_row[key] = value
else:
new_row[key] = value

return new_row

def format_json(items: list[Any]):
for idx, item in enumerate(items):
if isinstance(item, (dict, list)):
formatted_item = json.dumps(item)
items[idx] = formatted_item

def parse_datetime(dt_str: str) -> datetime:
if dt_str.endswith("Z"):
dt_str = dt_str.replace("Z", "+00:00")  # valid ISO format for UTC
return datetime.fromisoformat(dt_str)
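Note: a quick illustration of the helpers above (the input values are made up):

# Hypothetical usage sketch -- not part of the diff above.
from api.backend.database.utils import parse_datetime, format_sql_row_to_python

parse_datetime("2025-01-01T12:00:00Z")          # -> datetime with a UTC offset
format_sql_row_to_python({"result": "[1, 2]"})  # -> {"result": [1, 2]}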
@@ -1,17 +1,9 @@
from .job import (
insert,
update_job,
delete_jobs,
get_jobs_per_day,
get_queued_job,
average_elements_per_link,
)
# LOCAL
from .job import insert, update_job, delete_jobs, get_queued_job

__all__ = [
"insert",
"update_job",
"delete_jobs",
"get_jobs_per_day",
"get_queued_job",
"average_elements_per_link",
]
@@ -1,78 +1,75 @@
import datetime
from typing import Any
# STL
import uuid
from api.backend.database.common import insert, query
from api.backend.models import CronJob
from apscheduler.schedulers.background import BackgroundScheduler  # type: ignore
from apscheduler.triggers.cron import CronTrigger  # type: ignore

from api.backend.job import insert as insert_job
import logging
import datetime
from typing import Any, List

LOG = logging.getLogger("Cron Scheduler")
# PDM
from sqlalchemy import select
from apscheduler.triggers.cron import CronTrigger
from apscheduler.schedulers.asyncio import AsyncIOScheduler

# LOCAL
from api.backend.job import insert as insert_job
from api.backend.database.base import AsyncSessionLocal
from api.backend.database.models import Job, CronJob

LOG = logging.getLogger("Cron")

def insert_cron_job(cron_job: CronJob):
query = """
INSERT INTO cron_jobs (id, user_email, job_id, cron_expression, time_created, time_updated)
VALUES (?, ?, ?, ?, ?, ?)
"""
values = (
cron_job.id,
cron_job.user_email,
cron_job.job_id,
cron_job.cron_expression,
cron_job.time_created,
cron_job.time_updated,
)

insert(query, values)

async def insert_cron_job(cron_job: CronJob) -> bool:
async with AsyncSessionLocal() as session:
session.add(cron_job)
await session.commit()
return True

def delete_cron_job(id: str, user_email: str):
query = """
DELETE FROM cron_jobs
WHERE id = ? AND user_email = ?
"""
values = (id, user_email)
insert(query, values)

async def delete_cron_job(id: str, user_email: str) -> bool:
async with AsyncSessionLocal() as session:
stmt = select(CronJob).where(CronJob.id == id, CronJob.user_email == user_email)
result = await session.execute(stmt)
cron_job = result.scalars().first()
if cron_job:
await session.delete(cron_job)
await session.commit()
return True

def get_cron_jobs(user_email: str):
cron_jobs = query("SELECT * FROM cron_jobs WHERE user_email = ?", (user_email,))

return cron_jobs
async def get_cron_jobs(user_email: str) -> List[CronJob]:
async with AsyncSessionLocal() as session:
stmt = select(CronJob).where(CronJob.user_email == user_email)
result = await session.execute(stmt)
return list(result.scalars().all())

def get_all_cron_jobs():
cron_jobs = query("SELECT * FROM cron_jobs")

return cron_jobs
async def get_all_cron_jobs() -> List[CronJob]:
async with AsyncSessionLocal() as session:
stmt = select(CronJob)
result = await session.execute(stmt)
return list(result.scalars().all())

def insert_job_from_cron_job(job: dict[str, Any]):
insert_job(
{
**job,
"id": uuid.uuid4().hex,
"status": "Queued",
"result": "",
"chat": None,
"time_created": datetime.datetime.now(),
"time_updated": datetime.datetime.now(),
}
)
async def insert_job_from_cron_job(job: dict[str, Any]):
async with AsyncSessionLocal() as session:
await insert_job(
{
**job,
"id": uuid.uuid4().hex,
"status": "Queued",
"result": "",
"chat": None,
"time_created": datetime.datetime.now(datetime.timezone.utc),
"time_updated": datetime.datetime.now(datetime.timezone.utc),
},
session,
)

def get_cron_job_trigger(cron_expression: str):
expression_parts = cron_expression.split()

if len(expression_parts) != 5:
print(f"Invalid cron expression: {cron_expression}")
LOG.warning(f"Invalid cron expression: {cron_expression}")
return None

minute, hour, day, month, day_of_week = expression_parts
@@ -82,19 +79,37 @@ def get_cron_job_trigger(cron_expression: str):
)

def start_cron_scheduler(scheduler: BackgroundScheduler):
cron_jobs = get_all_cron_jobs()
async def start_cron_scheduler(scheduler: AsyncIOScheduler):
async with AsyncSessionLocal() as session:
stmt = select(CronJob)
result = await session.execute(stmt)
cron_jobs = result.scalars().all()

LOG.info(f"Cron jobs: {cron_jobs}")
LOG.info(f"Cron jobs: {cron_jobs}")

for job in cron_jobs:
queried_job = query("SELECT * FROM jobs WHERE id = ?", (job["job_id"],))
for cron_job in cron_jobs:
stmt = select(Job).where(Job.id == cron_job.job_id)
result = await session.execute(stmt)
queried_job = result.scalars().first()

LOG.info(f"Adding job: {queried_job}")
LOG.info(f"Adding job: {queried_job}")

scheduler.add_job(
insert_job_from_cron_job,
get_cron_job_trigger(job["cron_expression"]),
id=job["id"],
args=[queried_job[0]],
)
trigger = get_cron_job_trigger(cron_job.cron_expression)  # type: ignore
if not trigger:
continue

job_dict = (
{
c.key: getattr(queried_job, c.key)
for c in queried_job.__table__.columns
}
if queried_job
else {}
)

scheduler.add_job(
insert_job_from_cron_job,
trigger,
id=cron_job.id,
args=[job_dict],
)
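Note: the body of get_cron_job_trigger is elided by the hunk header above; from the CronTrigger import and the unpacking of the five cron fields it presumably builds the trigger roughly as follows (a sketch under that assumption, not the literal source):

# Hypothetical sketch of the elided trigger construction.
from apscheduler.triggers.cron import CronTrigger

def build_trigger(cron_expression: str) -> CronTrigger | None:
    parts = cron_expression.split()
    if len(parts) != 5:
        return None
    minute, hour, day, month, day_of_week = parts
    return CronTrigger(
        minute=minute, hour=hour, day=day, month=month, day_of_week=day_of_week
    )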
@@ -1,97 +1,113 @@
# STL
import logging
import datetime
from typing import Any

# PDM
from sqlalchemy import delete as sql_delete
from sqlalchemy import select
from sqlalchemy import update as sql_update
from sqlalchemy.ext.asyncio import AsyncSession

# LOCAL
from api.backend.utils import format_list_for_query
from api.backend.database.common import (
insert as common_insert,
query as common_query,
QUERIES,
update as common_update,
)
from api.backend.database.base import AsyncSessionLocal
from api.backend.database.models import Job

LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Job")

def insert(item: dict[str, Any]) -> None:
common_insert(
QUERIES["insert_job"],
(
async def insert(item: dict[str, Any], db: AsyncSession) -> None:
existing = await db.get(Job, item["id"])
if existing:
await multi_field_update_job(
item["id"],
item["url"],
item["elements"],
item["user"],
item["time_created"],
item["result"],
item["status"],
item["chat"],
item["job_options"],
),
{
"agent_mode": item["agent_mode"],
"prompt": item["prompt"],
"job_options": item["job_options"],
"elements": item["elements"],
"status": "Queued",
"result": [],
"time_created": datetime.datetime.now(datetime.timezone.utc),
"chat": None,
},
db,
)
return

job = Job(
id=item["id"],
url=item["url"],
elements=item["elements"],
user=item["user"],
time_created=datetime.datetime.now(datetime.timezone.utc),
result=item["result"],
status=item["status"],
chat=item["chat"],
job_options=item["job_options"],
agent_mode=item["agent_mode"],
prompt=item["prompt"],
)
LOG.info(f"Inserted item: {item}")

db.add(job)
await db.commit()
LOG.debug(f"Inserted item: {item}")

async def check_for_job_completion(id: str) -> dict[str, Any]:
async with AsyncSessionLocal() as session:
job = await session.get(Job, id)
return job.__dict__ if job else {}

async def get_queued_job():
query = (
"SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
)
res = common_query(query)
LOG.info(f"Got queued job: {res}")
return res[0] if res else None
async with AsyncSessionLocal() as session:
stmt = (
select(Job)
.where(Job.status == "Queued")
.order_by(Job.time_created.desc())
.limit(1)
)
result = await session.execute(stmt)
job = result.scalars().first()
LOG.debug(f"Got queued job: {job}")
return job.__dict__ if job else None

async def update_job(ids: list[str], field: str, value: Any):
query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
res = common_update(query, tuple([value] + ids))
LOG.info(f"Updated job: {res}")
async with AsyncSessionLocal() as session:
stmt = sql_update(Job).where(Job.id.in_(ids)).values({field: value})
res = await session.execute(stmt)
await session.commit()
LOG.debug(f"Updated job count: {res.rowcount}")

async def multi_field_update_job(
id: str, fields: dict[str, Any], session: AsyncSession | None = None
):
close_session = False
if not session:
session = AsyncSessionLocal()
close_session = True

try:
stmt = sql_update(Job).where(Job.id == id).values(**fields)
await session.execute(stmt)
await session.commit()
LOG.debug(f"Updated job {id} fields: {fields}")
finally:
if close_session:
await session.close()

async def delete_jobs(jobs: list[str]):
if not jobs:
LOG.info("No jobs to delete.")
LOG.debug("No jobs to delete.")
return False

query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}"
res = common_update(query, tuple(jobs))

return res > 0

async def average_elements_per_link(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
AVG(json_array_length(elements)) AS average_elements,
COUNT(*) AS count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = common_query(job_query, (user,))

return results

async def get_jobs_per_day(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
COUNT(*) AS job_count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = common_query(job_query, (user,))

return results
async with AsyncSessionLocal() as session:
stmt = sql_delete(Job).where(Job.id.in_(jobs))
res = await session.execute(stmt)
await session.commit()
LOG.debug(f"Deleted jobs: {res.rowcount}")
return res.rowcount > 0
api/backend/job/job_router.py (new file, 280 lines)
@@ -0,0 +1,280 @@
# STL
import csv
import uuid
import random
import logging
import datetime
from io import StringIO

# PDM
from fastapi import Depends, APIRouter
from sqlalchemy import select
from fastapi.encoders import jsonable_encoder
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from sqlalchemy.ext.asyncio import AsyncSession
from apscheduler.triggers.cron import CronTrigger  # type: ignore

# LOCAL
from api.backend.job import insert, update_job, delete_jobs
from api.backend.constants import MEDIA_DIR, MEDIA_TYPES, RECORDINGS_DIR
from api.backend.scheduler import scheduler
from api.backend.schemas.job import Job, UpdateJobs, DownloadJob, DeleteScrapeJobs
from api.backend.auth.schemas import User
from api.backend.schemas.cron import CronJob as PydanticCronJob
from api.backend.schemas.cron import DeleteCronJob
from api.backend.database.base import get_db
from api.backend.auth.auth_utils import get_current_user
from api.backend.database.models import Job as DatabaseJob
from api.backend.database.models import CronJob
from api.backend.job.utils.text_utils import clean_text
from api.backend.job.models.job_options import FetchOptions
from api.backend.routers.handle_exceptions import handle_exceptions
from api.backend.job.utils.clean_job_format import clean_job_format
from api.backend.job.cron_scheduling.cron_scheduling import (
get_cron_jobs,
delete_cron_job,
insert_cron_job,
get_cron_job_trigger,
insert_job_from_cron_job,
)
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results

LOG = logging.getLogger("Job")

job_router = APIRouter()

@job_router.post("/update")
@handle_exceptions(logger=LOG)
async def update(update_jobs: UpdateJobs, _: User = Depends(get_current_user)):
await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)
return {"message": "Jobs updated successfully"}

@job_router.post("/submit-scrape-job")
@handle_exceptions(logger=LOG)
async def submit_scrape_job(job: Job, db: AsyncSession = Depends(get_db)):
LOG.info(f"Recieved job: {job}")

if not job.id:
job.id = uuid.uuid4().hex

job_dict = job.model_dump()
await insert(job_dict, db)

return JSONResponse(
content={"id": job.id, "message": "Job submitted successfully."}
)

@job_router.post("/retrieve-scrape-jobs")
@handle_exceptions(logger=LOG)
async def retrieve_scrape_jobs(
fetch_options: FetchOptions,
user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
):
LOG.info(
f"Retrieving jobs for account: {user.email if user.email else 'Guest User'}"
)
if fetch_options.chat:
stmt = select(DatabaseJob.chat).filter(DatabaseJob.user == user.email)
else:
stmt = select(DatabaseJob).filter(DatabaseJob.user == user.email)

results = await db.execute(stmt)
rows = results.all() if fetch_options.chat else results.scalars().all()

return JSONResponse(content=jsonable_encoder(rows[::-1]))

@job_router.get("/job/{id}")
@handle_exceptions(logger=LOG)
async def job(
id: str, user: User = Depends(get_current_user), db: AsyncSession = Depends(get_db)
):
LOG.info(f"Retrieving jobs for account: {user.email}")

stmt = select(DatabaseJob).filter(
DatabaseJob.user == user.email, DatabaseJob.id == id
)

results = await db.execute(stmt)

return JSONResponse(
content=jsonable_encoder([job.__dict__ for job in results.scalars().all()])
)

@job_router.post("/download")
@handle_exceptions(logger=LOG)
async def download(download_job: DownloadJob, db: AsyncSession = Depends(get_db)):
LOG.info(f"Downloading job with ids: {download_job.ids}")
stmt = select(DatabaseJob).where(DatabaseJob.id.in_(download_job.ids))
result = await db.execute(stmt)
results = [job.__dict__ for job in result.scalars().all()]

if download_job.job_format == "csv":
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)

headers = [
"id",
"url",
"element_name",
"xpath",
"text",
"user",
"time_created",
]
csv_writer.writerow(headers)

for result in results:
for res in result["result"]:
for url, elements in res.items():
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", "")).strip()
if text:
csv_writer.writerow(
[
result.get("id", "")
+ "-"
+ str(random.randint(0, 1000000)),
url,
element_name,
value.get("xpath", ""),
text,
result.get("user", ""),
result.get("time_created", ""),
]
)

_ = csv_buffer.seek(0)
response = StreamingResponse(
csv_buffer,
media_type="text/csv",
)
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response

elif download_job.job_format == "md":
response = StreamingResponse(
stream_md_from_job_results(results),
media_type="text/markdown",
)

response.headers["Content-Disposition"] = "attachment; filename=export.md"
return response

@job_router.get("/job/{id}/convert-to-csv")
@handle_exceptions(logger=LOG)
async def convert_to_csv(id: str, db: AsyncSession = Depends(get_db)):
stmt = select(DatabaseJob).filter(DatabaseJob.id == id)
results = await db.execute(stmt)
jobs = results.scalars().all()

return JSONResponse(content=clean_job_format([job.__dict__ for job in jobs]))

@job_router.post("/delete-scrape-jobs")
@handle_exceptions(logger=LOG)
async def delete(delete_scrape_jobs: DeleteScrapeJobs):
result = await delete_jobs(delete_scrape_jobs.ids)
return (
JSONResponse(content={"message": "Jobs successfully deleted."})
if result
else JSONResponse(content={"error": "Jobs not deleted."})
)

@job_router.post("/schedule-cron-job")
@handle_exceptions(logger=LOG)
async def schedule_cron_job(
cron_job: PydanticCronJob,
db: AsyncSession = Depends(get_db),
):
if not cron_job.id:
cron_job.id = uuid.uuid4().hex

now = datetime.datetime.now()
if not cron_job.time_created:
cron_job.time_created = now

if not cron_job.time_updated:
cron_job.time_updated = now

await insert_cron_job(CronJob(**cron_job.model_dump()))

stmt = select(DatabaseJob).where(DatabaseJob.id == cron_job.job_id)
result = await db.execute(stmt)
queried_job = result.scalars().first()

if not queried_job:
return JSONResponse(status_code=404, content={"error": "Related job not found"})

scheduler.add_job(
insert_job_from_cron_job,
get_cron_job_trigger(cron_job.cron_expression),
id=cron_job.id,
args=[queried_job],
)

return JSONResponse(content={"message": "Cron job scheduled successfully."})

@job_router.post("/delete-cron-job")
@handle_exceptions(logger=LOG)
async def delete_cron_job_request(request: DeleteCronJob):
if not request.id:
return JSONResponse(
content={"error": "Cron job id is required."}, status_code=400
)

await delete_cron_job(request.id, request.user_email)
scheduler.remove_job(request.id)

return JSONResponse(content={"message": "Cron job deleted successfully."})

@job_router.get("/cron-jobs")
@handle_exceptions(logger=LOG)
async def get_cron_jobs_request(user: User = Depends(get_current_user)):
cron_jobs = await get_cron_jobs(user.email)
return JSONResponse(content=jsonable_encoder(cron_jobs))

@job_router.get("/recordings/{id}")
@handle_exceptions(logger=LOG)
async def get_recording(id: str):
path = RECORDINGS_DIR / f"{id}.mp4"
if not path.exists():
return JSONResponse(content={"error": "Recording not found."}, status_code=404)

return FileResponse(
path, headers={"Content-Type": "video/mp4", "Accept-Ranges": "bytes"}
)

@job_router.get("/get-media")
@handle_exceptions(logger=LOG)
async def get_media(id: str):
files: dict[str, list[str]] = {}

for media_type in MEDIA_TYPES:
path = MEDIA_DIR / media_type / f"{id}"
files[media_type] = [file.name for file in path.glob("*")]

return JSONResponse(content={"files": files})

@job_router.get("/media")
@handle_exceptions(logger=LOG)
async def get_media_file(id: str, type: str, file: str):
path = MEDIA_DIR / type / f"{id}" / file

if not path.exists():
return JSONResponse(content={"error": "Media file not found."}, status_code=404)

return FileResponse(path)
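Note: a minimal sketch of calling the new submit endpoint. The host, port, and exact payload fields are assumptions rather than anything stated in this diff; the field names are guessed from the insert() call above.

# Hypothetical usage sketch -- payload shape and server address are assumptions.
import requests

payload = {
    "url": "https://example.com",
    "elements": [{"name": "title", "xpath": "//h1"}],
    "job_options": {"multi_page_scrape": False},
    "agent_mode": False,
    "prompt": "",
    "user": "admin@example.com",
    "result": [],
    "status": "Queued",
    "chat": None,
}
resp = requests.post("http://localhost:8000/submit-scrape-job", json=payload)
print(resp.json())  # e.g. {"id": "...", "message": "Job submitted successfully."}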
@@ -1,3 +1,5 @@
from .job_options import JobOptions
# LOCAL
from .job import Element, CapturedElement
from .job_options import Proxy, JobOptions

__all__ = ["JobOptions"]
__all__ = ["JobOptions", "CapturedElement", "Element", "Proxy"]
api/backend/job/models/job.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from typing import Optional

import pydantic

class Element(pydantic.BaseModel):
name: str
xpath: str
url: Optional[str] = None

class CapturedElement(pydantic.BaseModel):
xpath: str
text: str
name: str
@@ -1,8 +1,19 @@
from pydantic import BaseModel
# STL
from typing import Any, Optional

# PDM
from pydantic import BaseModel

# LOCAL
from api.backend.job.models.site_map import SiteMap

class Proxy(BaseModel):
server: str
username: Optional[str] = None
password: Optional[str] = None

class FetchOptions(BaseModel):
chat: Optional[bool] = None

@@ -10,6 +21,8 @@ class FetchOptions(BaseModel):
class JobOptions(BaseModel):
multi_page_scrape: bool = False
custom_headers: dict[str, Any] = {}
proxies: list[str] = []
proxies: list[Proxy] = []
site_map: Optional[SiteMap] = None
collect_media: bool = False
custom_cookies: list[dict[str, Any]] = []
return_html: bool = False
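Note: with this change, proxies are structured objects rather than bare strings. A small construction example (the values are illustrative, not from the diff):

# Hypothetical usage sketch -- not part of the diff above.
from api.backend.job.models.job_options import JobOptions, Proxy

options = JobOptions(
    multi_page_scrape=False,
    custom_headers={"User-Agent": "Scraperr"},
    proxies=[Proxy(server="http://127.0.0.1:8080", username="user", password="pass")],
    collect_media=True,
    return_html=False,
)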
api/backend/job/scraping/add_custom.py (new file, 49 lines)
@@ -0,0 +1,49 @@
# STL
import logging
from typing import Any, Optional
from urllib.parse import urlparse

# PDM
from playwright.async_api import Page, BrowserContext

LOG = logging.getLogger("Job")

async def add_custom_cookies(
custom_cookies: list[dict[str, Any]],
url: str,
context: BrowserContext,
) -> None:
parsed_url = urlparse(url)
domain = parsed_url.netloc

for cookie in custom_cookies:
cookie_dict = {
"name": cookie.get("name", ""),
"value": cookie.get("value", ""),
"domain": domain,
"path": "/",
}

LOG.info(f"Adding cookie: {cookie_dict}")
await context.add_cookies([cookie_dict])  # type: ignore

async def add_custom_headers(
custom_headers: dict[str, Any],
page: Page,
) -> None:
await page.set_extra_http_headers(custom_headers)

async def add_custom_items(
url: str,
page: Page,
cookies: Optional[list[dict[str, Any]]] = None,
headers: Optional[dict[str, Any]] = None,
) -> None:
if cookies:
await add_custom_cookies(cookies, url, page.context)

if headers:
await add_custom_headers(headers, page)
@@ -1,20 +1,24 @@
# STL
import os
from pathlib import Path
from urllib.parse import urlparse
import re
import logging
from typing import Dict, List
from pathlib import Path
from urllib.parse import urljoin, urlparse

# PDM
import aiohttp
from playwright.async_api import Page

from api.backend.utils import LOG
LOG = logging.getLogger("Job")

async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
async def collect_media(id: str, page: Page) -> dict[str, list[dict[str, str]]]:
media_types = {
"images": "img",
"videos": "video",
"audio": "audio",
"pdfs": 'a[href$=".pdf"]',
"pdfs": 'a[href$=".pdf"], a[href*=".pdf#page="]',
"documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]',
"presentations": 'a[href$=".ppt"], a[href$=".pptx"]',
"spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]',
@@ -48,6 +52,11 @@ async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
root_domain = f"{root_url.scheme}://{root_url.netloc}"
url = f"{root_domain}{url}"

if url and re.match(r"^[\w\-]+/", url):
root_url = urlparse(page.url)
root_domain = f"{root_url.scheme}://{root_url.netloc}"
url = urljoin(root_domain + "/", url)

if url and url.startswith(("http://", "https://")):
try:
parsed = urlparse(url)
@@ -67,15 +76,20 @@ async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
}.get(media_type, "")
filename += ext

file_path = media_dir / filename
if not os.path.exists(media_dir / id):
os.makedirs(media_dir / id, exist_ok=True)

file_path = media_dir / id / f"{filename}"

async with session.get(url) as response:
response.raise_for_status()

with open(file_path, "wb") as f:
while True:
chunk = await response.content.read(8192)
if not chunk:
break

f.write(chunk)

urls.append({"url": url, "local_path": str(file_path)})
@@ -1,83 +1,80 @@
import logging
# STL
import random
from typing import Any, Optional, cast
import logging
from typing import Any, cast
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup, Tag
# PDM
from bs4 import Tag, BeautifulSoup
from lxml import etree
from camoufox import AsyncCamoufox
from playwright.async_api import Page
from urllib.parse import urlparse, urljoin

from api.backend.models import Element, CapturedElement
from api.backend.job.scraping.scraping_utils import scrape_content
# LOCAL
from api.backend.constants import RECORDINGS_ENABLED
from api.backend.job.models import Element, CapturedElement
from api.backend.job.utils.text_utils import clean_text
from api.backend.job.scraping.add_custom import add_custom_items
from api.backend.job.scraping.scraping_utils import (
sxpath,
is_same_domain,
scrape_content,
)
from api.backend.job.site_mapping.site_mapping import handle_site_mapping

LOG = logging.getLogger(__name__)

def is_same_domain(url: str, original_url: str) -> bool:
parsed_url = urlparse(url)
parsed_original_url = urlparse(original_url)
return parsed_url.netloc == parsed_original_url.netloc or parsed_url.netloc == ""

def clean_xpath(xpath: str) -> str:
parts = xpath.split("/")
clean_parts = ["/" if part == "" else part for part in parts]
clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'")
LOG.info(f"Cleaned xpath: {clean_xpath}")

return clean_xpath

def sxpath(context: etree._Element, xpath: str):
return context.xpath(xpath)
LOG = logging.getLogger("Job")

async def make_site_request(
id: str,
url: str,
headers: Optional[dict[str, Any]],
multi_page_scrape: bool = False,
job_options: dict[str, Any],
visited_urls: set[str] = set(),
pages: set[tuple[str, str]] = set(),
original_url: str = "",
proxies: Optional[list[str]] = None,
site_map: Optional[dict[str, Any]] = None,
collect_media: bool = False,
):
headers = job_options["custom_headers"]
multi_page_scrape = job_options["multi_page_scrape"]
proxies = job_options["proxies"]
site_map = job_options["site_map"]
collect_media = job_options["collect_media"]
custom_cookies = job_options["custom_cookies"]

if url in visited_urls:
return

proxy = None

if proxies:
proxy = random.choice(proxies)
LOG.info(f"Using proxy: {proxy}")

async with AsyncCamoufox(headless=True, proxy=proxy) as browser:
async with AsyncCamoufox(headless=not RECORDINGS_ENABLED, proxy=proxy) as browser:
page: Page = await browser.new_page()
await page.set_viewport_size({"width": 1920, "height": 1080})

if headers:
await page.set_extra_http_headers(headers)
# Add cookies and headers
await add_custom_items(url, page, custom_cookies, headers)

LOG.info(f"Visiting URL: {url}")

try:
await page.goto(url, timeout=60000)
await page.wait_for_load_state("networkidle", timeout=10000)
await page.wait_for_load_state("networkidle")

final_url = page.url

visited_urls.add(url)
visited_urls.add(final_url)

html_content = await scrape_content(page, pages, collect_media)
html_content = await scrape_content(id, page, pages, collect_media)

html_content = await page.content()
pages.add((html_content, final_url))

if site_map:
await handle_site_mapping(
site_map, page, pages, collect_media=collect_media
id, site_map, page, pages, collect_media=collect_media
)

finally:
@@ -104,19 +101,18 @@ async def make_site_request(

if link not in visited_urls and is_same_domain(link, original_url):
await make_site_request(
id,
link,
headers=headers,
multi_page_scrape=multi_page_scrape,
job_options=job_options,
visited_urls=visited_urls,
pages=pages,
original_url=original_url,
proxies=proxies,
site_map=site_map,
collect_media=collect_media,
)

async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element]):
async def collect_scraped_elements(
page: tuple[str, str], xpaths: list[Element], return_html: bool
):
soup = BeautifulSoup(page[0], "lxml")
root = etree.HTML(str(soup))

@@ -126,12 +122,24 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
el = sxpath(root, elem.xpath)

for e in el:  # type: ignore
if return_html:
elements[elem.name] = [
CapturedElement(
xpath=elem.xpath,
text=page[0],
name=elem.name,
)
]
continue

text = (
"\t".join(str(t) for t in e.itertext())
" ".join(str(t) for t in e.itertext())
if isinstance(e, etree._Element)
else str(e)  # type: ignore
)

text = clean_text(text)

captured_element = CapturedElement(
xpath=elem.xpath, text=text, name=elem.name
)
@@ -145,32 +153,30 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])

async def scrape(
id: str,
url: str,
xpaths: list[Element],
headers: Optional[dict[str, Any]] = None,
multi_page_scrape: bool = False,
proxies: Optional[list[str]] = None,
site_map: Optional[dict[str, Any]] = None,
collect_media: bool = False,
job_options: dict[str, Any],
):
visited_urls: set[str] = set()
pages: set[tuple[str, str]] = set()

await make_site_request(
id,
url,
headers=headers,
multi_page_scrape=multi_page_scrape,
job_options=job_options,
visited_urls=visited_urls,
pages=pages,
original_url=url,
proxies=proxies,
site_map=site_map,
collect_media=collect_media,
)

elements: list[dict[str, dict[str, list[CapturedElement]]]] = []

for page in pages:
elements.append(await collect_scraped_elements(page, xpaths))
elements.append(
await collect_scraped_elements(
page, xpaths, job_options.get("return_html", False)
)
)

return elements
@@ -1,14 +1,21 @@
# STL
import asyncio
import logging
from typing import Set, Tuple
from urllib.parse import urlparse

# PDM
from lxml import etree
from playwright.async_api import Page

from api.backend.utils import LOG

# LOCAL
from api.backend.job.scraping.collect_media import collect_media as collect_media_utils

LOG = logging.getLogger("Job")

async def scrape_content(
page: Page, pages: Set[Tuple[str, str]], collect_media: bool
id: str, page: Page, pages: Set[Tuple[str, str]], collect_media: bool
) -> str:
last_height = await page.evaluate("document.body.scrollHeight")

@@ -27,6 +34,25 @@ async def scrape_content(

if collect_media:
LOG.info("Collecting media")
await collect_media_utils(page)
await collect_media_utils(id, page)

return html

def is_same_domain(url: str, original_url: str) -> bool:
parsed_url = urlparse(url)
parsed_original_url = urlparse(original_url)
return parsed_url.netloc == parsed_original_url.netloc or parsed_url.netloc == ""

def clean_xpath(xpath: str) -> str:
parts = xpath.split("/")
clean_parts = ["/" if part == "" else part for part in parts]
clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'")
LOG.info(f"Cleaned xpath: {clean_xpath}")

return clean_xpath

def sxpath(context: etree._Element, xpath: str):
return context.xpath(xpath)
@@ -1,14 +1,17 @@
import logging
# STL
import asyncio
import logging
from copy import deepcopy
from typing import Any

# PDM
from playwright.async_api import Page

# LOCAL
from api.backend.job.models.site_map import Action, SiteMap
from api.backend.job.scraping.scraping_utils import scrape_content

LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Job")

def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]:
@@ -24,7 +27,6 @@ def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]:
async def handle_input(action: Action, page: Page) -> bool:
try:
element = page.locator(f"xpath={action.xpath}")
await element.wait_for(state="visible", timeout=10000)
LOG.info(f"Sending keys: {action.input} to element: {action.xpath}")
await element.fill(action.input)
return True
@@ -36,7 +38,6 @@ async def handle_input(action: Action, page: Page) -> bool:
async def handle_click(action: Action, page: Page) -> bool:
try:
element = page.locator(f"xpath={action.xpath}")
await element.wait_for(state="visible", timeout=10000)
LOG.info(f"Clicking element: {action.xpath}")
await element.click()
return True
@@ -52,6 +53,7 @@ ACTION_MAP = {

async def handle_site_mapping(
id: str,
site_map_dict: dict[str, Any],
page: Page,
pages: set[tuple[str, str]],
@@ -68,11 +70,11 @@ async def handle_site_mapping(

await asyncio.sleep(2)

await scrape_content(page, pages, collect_media=collect_media)
await scrape_content(id, page, pages, collect_media=collect_media)

cleared_site_map_dict = clear_done_actions(site_map_dict)

if cleared_site_map_dict["actions"]:
await handle_site_mapping(
cleared_site_map_dict, page, pages, collect_media=collect_media
id, cleared_site_map_dict, page, pages, collect_media=collect_media
)
@@ -1,6 +1,8 @@
|
||||
# STL
|
||||
from typing import Any
|
||||
|
||||
from api.backend.utils import clean_text
|
||||
# LOCAL
|
||||
from api.backend.job.utils.text_utils import clean_text
|
||||
|
||||
|
||||
def clean_job_format(jobs: list[dict[str, Any]]) -> dict[str, Any]:
|
||||
@@ -26,7 +28,9 @@ def clean_job_format(jobs: list[dict[str, Any]]) -> dict[str, Any]:
|
||||
"xpath": value.get("xpath", ""),
|
||||
"text": text,
|
||||
"user": job.get("user", ""),
|
||||
"time_created": job.get("time_created", ""),
|
||||
"time_created": job.get(
|
||||
"time_created", ""
|
||||
).isoformat(),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
# STL
|
||||
from typing import Any
|
||||
|
||||
from api.backend.utils import clean_text
|
||||
# LOCAL
|
||||
from api.backend.job.utils.text_utils import clean_text
|
||||
|
||||
|
||||
def stream_md_from_job_results(jobs: list[dict[str, Any]]):
|
||||
|
||||
api/backend/job/utils/text_utils.py (new file, 10 lines)
@@ -0,0 +1,10 @@
def clean_text(text: str):
    text = text.strip()
    text = text.replace("\n", " ")
    text = text.replace("\t", " ")
    text = text.replace("\r", " ")
    text = text.replace("\f", " ")
    text = text.replace("\v", " ")
    text = text.replace("\b", " ")
    text = text.replace("\a", " ")
    return text
|
||||
api/backend/routers/handle_exceptions.py (new file, 31 lines)
@@ -0,0 +1,31 @@
# STL
import logging
import traceback
from typing import Any, Union, Callable, Awaitable
from functools import wraps

# PDM
from fastapi.responses import JSONResponse


def handle_exceptions(
    logger: logging.Logger,
) -> Callable[
    [Callable[..., Awaitable[Any]]], Callable[..., Awaitable[Union[Any, JSONResponse]]]
]:
    def decorator(
        func: Callable[..., Awaitable[Any]],
    ) -> Callable[..., Awaitable[Union[Any, JSONResponse]]]:
        @wraps(func)
        async def wrapper(*args: Any, **kwargs: Any) -> Union[Any, JSONResponse]:
            try:
                return await func(*args, **kwargs)

            except Exception as e:
                logger.error(f"Exception occurred: {e}")
                traceback.print_exc()
                return JSONResponse(content={"error": str(e)}, status_code=500)

        return wrapper

    return decorator
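A brief sketch of how this new decorator is meant to be applied to a route handler, mirroring its use in the stats router later in this change set (the router name, path, and logger name below are illustrative, not part of the diff):

import logging
from fastapi import APIRouter
from api.backend.routers.handle_exceptions import handle_exceptions

LOG = logging.getLogger("Example")
example_router = APIRouter()

@example_router.get("/example")
@handle_exceptions(logger=LOG)
async def example_route():
    # Any exception raised here is logged with a traceback and converted
    # into a 500 JSONResponse by the decorator instead of propagating.
    return {"ok": True}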
|
||||
@@ -1,233 +0,0 @@
|
||||
# STL
|
||||
import datetime
|
||||
import uuid
|
||||
import traceback
|
||||
from io import StringIO
|
||||
import csv
|
||||
import logging
|
||||
import random
|
||||
|
||||
# PDM
|
||||
from fastapi import Depends, APIRouter
|
||||
from fastapi.encoders import jsonable_encoder
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
from api.backend.scheduler import scheduler
|
||||
from apscheduler.triggers.cron import CronTrigger # type: ignore
|
||||
|
||||
# LOCAL
|
||||
from api.backend.job import insert, update_job, delete_jobs
|
||||
from api.backend.models import (
|
||||
DeleteCronJob,
|
||||
UpdateJobs,
|
||||
DownloadJob,
|
||||
DeleteScrapeJobs,
|
||||
Job,
|
||||
CronJob,
|
||||
)
|
||||
from api.backend.schemas import User
|
||||
from api.backend.auth.auth_utils import get_current_user
|
||||
from api.backend.utils import clean_text, format_list_for_query
|
||||
from api.backend.job.models.job_options import FetchOptions
|
||||
|
||||
from api.backend.database.common import query
|
||||
|
||||
from api.backend.job.cron_scheduling.cron_scheduling import (
|
||||
delete_cron_job,
|
||||
get_cron_job_trigger,
|
||||
insert_cron_job,
|
||||
get_cron_jobs,
|
||||
insert_job_from_cron_job,
|
||||
)
|
||||
|
||||
from api.backend.job.utils.clean_job_format import clean_job_format
|
||||
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
job_router = APIRouter()
|
||||
|
||||
|
||||
@job_router.post("/update")
|
||||
async def update(update_jobs: UpdateJobs, _: User = Depends(get_current_user)):
|
||||
"""Used to update jobs"""
|
||||
await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)
|
||||
|
||||
|
||||
@job_router.post("/submit-scrape-job")
|
||||
async def submit_scrape_job(job: Job):
|
||||
LOG.info(f"Recieved job: {job}")
|
||||
try:
|
||||
job.id = uuid.uuid4().hex
|
||||
|
||||
job_dict = job.model_dump()
|
||||
insert(job_dict)
|
||||
|
||||
return JSONResponse(content={"id": job.id})
|
||||
except Exception as e:
|
||||
LOG.error(f"Exception occurred: {traceback.format_exc()}")
|
||||
return JSONResponse(content={"error": str(e)}, status_code=500)
|
||||
|
||||
|
||||
@job_router.post("/retrieve-scrape-jobs")
|
||||
async def retrieve_scrape_jobs(
|
||||
fetch_options: FetchOptions, user: User = Depends(get_current_user)
|
||||
):
|
||||
LOG.info(f"Retrieving jobs for account: {user.email}")
|
||||
ATTRIBUTES = "chat" if fetch_options.chat else "*"
|
||||
|
||||
try:
|
||||
job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?"
|
||||
results = query(job_query, (user.email,))
|
||||
return JSONResponse(content=jsonable_encoder(results[::-1]))
|
||||
except Exception as e:
|
||||
LOG.error(f"Exception occurred: {e}")
|
||||
return JSONResponse(content=[], status_code=500)
|
||||
|
||||
|
||||
@job_router.get("/job/{id}")
|
||||
async def job(id: str, user: User = Depends(get_current_user)):
|
||||
LOG.info(f"Retrieving jobs for account: {user.email}")
|
||||
|
||||
try:
|
||||
job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?"
|
||||
results = query(job_query, (user.email, id))
|
||||
return JSONResponse(content=jsonable_encoder(results))
|
||||
except Exception as e:
|
||||
LOG.error(f"Exception occurred: {e}")
|
||||
return JSONResponse(content={"error": str(e)}, status_code=500)
|
||||
|
||||
|
||||
@job_router.post("/download")
|
||||
async def download(download_job: DownloadJob):
|
||||
LOG.info(f"Downloading job with ids: {download_job.ids}")
|
||||
|
||||
try:
|
||||
job_query = (
|
||||
f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}"
|
||||
)
|
||||
results = query(job_query, tuple(download_job.ids))
|
||||
|
||||
if download_job.job_format == "csv":
|
||||
csv_buffer = StringIO()
|
||||
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
|
||||
|
||||
headers = [
|
||||
"id",
|
||||
"url",
|
||||
"element_name",
|
||||
"xpath",
|
||||
"text",
|
||||
"user",
|
||||
"time_created",
|
||||
]
|
||||
csv_writer.writerow(headers)
|
||||
|
||||
for result in results:
|
||||
for res in result["result"]:
|
||||
for url, elements in res.items():
|
||||
for element_name, values in elements.items():
|
||||
for value in values:
|
||||
text = clean_text(value.get("text", "")).strip()
|
||||
if text:
|
||||
csv_writer.writerow(
|
||||
[
|
||||
result.get("id", "")
|
||||
+ "-"
|
||||
+ str(random.randint(0, 1000000)),
|
||||
url,
|
||||
element_name,
|
||||
value.get("xpath", ""),
|
||||
text,
|
||||
result.get("user", ""),
|
||||
result.get("time_created", ""),
|
||||
]
|
||||
)
|
||||
|
||||
_ = csv_buffer.seek(0)
|
||||
response = StreamingResponse(
|
||||
csv_buffer,
|
||||
media_type="text/csv",
|
||||
)
|
||||
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
|
||||
return response
|
||||
|
||||
elif download_job.job_format == "md":
|
||||
response = StreamingResponse(
|
||||
stream_md_from_job_results(results),
|
||||
media_type="text/markdown",
|
||||
)
|
||||
|
||||
response.headers["Content-Disposition"] = "attachment; filename=export.md"
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
LOG.error(f"Exception occurred: {e}")
|
||||
traceback.print_exc()
|
||||
return {"error": str(e)}
|
||||
|
||||
|
||||
@job_router.get("/job/{id}/convert-to-csv")
|
||||
async def convert_to_csv(id: str):
|
||||
try:
|
||||
job_query = f"SELECT * FROM jobs WHERE id = ?"
|
||||
results = query(job_query, (id,))
|
||||
|
||||
return JSONResponse(content=clean_job_format(results))
|
||||
except Exception as e:
|
||||
LOG.error(f"Exception occurred: {e}")
|
||||
traceback.print_exc()
|
||||
return {"error": str(e)}
|
||||
|
||||
|
||||
@job_router.post("/delete-scrape-jobs")
|
||||
async def delete(delete_scrape_jobs: DeleteScrapeJobs):
|
||||
result = await delete_jobs(delete_scrape_jobs.ids)
|
||||
return (
|
||||
JSONResponse(content={"message": "Jobs successfully deleted."})
|
||||
if result
|
||||
else JSONResponse({"error": "Jobs not deleted."})
|
||||
)
|
||||
|
||||
|
||||
@job_router.post("/schedule-cron-job")
|
||||
async def schedule_cron_job(cron_job: CronJob):
|
||||
if not cron_job.id:
|
||||
cron_job.id = uuid.uuid4().hex
|
||||
|
||||
if not cron_job.time_created:
|
||||
cron_job.time_created = datetime.datetime.now()
|
||||
|
||||
if not cron_job.time_updated:
|
||||
cron_job.time_updated = datetime.datetime.now()
|
||||
|
||||
insert_cron_job(cron_job)
|
||||
|
||||
queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,))
|
||||
|
||||
scheduler.add_job(
|
||||
insert_job_from_cron_job,
|
||||
get_cron_job_trigger(cron_job.cron_expression),
|
||||
id=cron_job.id,
|
||||
args=[queried_job[0]],
|
||||
)
|
||||
|
||||
return JSONResponse(content={"message": "Cron job scheduled successfully."})
|
||||
|
||||
|
||||
@job_router.post("/delete-cron-job")
|
||||
async def delete_cron_job_request(request: DeleteCronJob):
|
||||
if not request.id:
|
||||
return JSONResponse(
|
||||
content={"error": "Cron job id is required."}, status_code=400
|
||||
)
|
||||
|
||||
delete_cron_job(request.id, request.user_email)
|
||||
scheduler.remove_job(request.id)
|
||||
|
||||
return JSONResponse(content={"message": "Cron job deleted successfully."})
|
||||
|
||||
|
||||
@job_router.get("/cron-jobs")
|
||||
async def get_cron_jobs_request(user: User = Depends(get_current_user)):
|
||||
cron_jobs = get_cron_jobs(user.email)
|
||||
return JSONResponse(content=jsonable_encoder(cron_jobs))
|
||||
@@ -1,29 +0,0 @@
|
||||
# STL
|
||||
import logging
|
||||
|
||||
# PDM
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
# LOCAL
|
||||
from api.backend.job import (
|
||||
get_jobs_per_day,
|
||||
average_elements_per_link,
|
||||
)
|
||||
from api.backend.auth.auth_utils import get_current_user
|
||||
from api.backend.schemas import User
|
||||
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
stats_router = APIRouter()
|
||||
|
||||
|
||||
@stats_router.get("/statistics/get-average-element-per-link")
|
||||
async def get_average_element_per_link(user: User = Depends(get_current_user)):
|
||||
return await average_elements_per_link(user.email)
|
||||
|
||||
|
||||
@stats_router.get("/statistics/get-average-jobs-per-day")
|
||||
async def average_jobs_per_day(user: User = Depends(get_current_user)):
|
||||
data = await get_jobs_per_day(user.email)
|
||||
return data
|
||||
@@ -1,3 +1,4 @@
from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
# PDM
from apscheduler.schedulers.asyncio import AsyncIOScheduler

scheduler = BackgroundScheduler()
scheduler = AsyncIOScheduler()
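The switch from BackgroundScheduler to AsyncIOScheduler means scheduled jobs now run on the application's asyncio event loop instead of a background thread. A minimal sketch of how such a scheduler is typically started, assuming a FastAPI startup hook and an illustrative job function (neither is taken from this diff):

from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
from fastapi import FastAPI

app = FastAPI()
scheduler = AsyncIOScheduler()

async def heartbeat() -> None:
    # AsyncIOScheduler can schedule async callables directly on the running loop.
    print("tick")

@app.on_event("startup")
async def start_scheduler() -> None:
    # Start the scheduler only once an event loop is running; it attaches to that loop.
    scheduler.add_job(heartbeat, CronTrigger.from_crontab("*/5 * * * *"))
    scheduler.start()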
|
||||
api/backend/schemas/cron.py (new file, 17 lines)
@@ -0,0 +1,17 @@
from typing import Optional, Union
from datetime import datetime
import pydantic


class CronJob(pydantic.BaseModel):
    id: Optional[str] = None
    user_email: str
    job_id: str
    cron_expression: str
    time_created: Optional[Union[datetime, str]] = None
    time_updated: Optional[Union[datetime, str]] = None


class DeleteCronJob(pydantic.BaseModel):
    id: str
    user_email: str
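A quick illustration of how the CronJob schema above validates input; the field values are made up, and model_dump matches the pydantic v2 usage seen elsewhere in this diff:

from api.backend.schemas.cron import CronJob

job = CronJob(
    user_email="user@example.com",
    job_id="abc123",
    cron_expression="0 * * * *",
)
# Optional fields (id, time_created, time_updated) default to None.
print(job.model_dump())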
|
||||
@@ -1,24 +1,24 @@
|
||||
# STL
|
||||
from typing import Any, Literal, Optional, Union
|
||||
from datetime import datetime
|
||||
|
||||
# LOCAL
|
||||
from api.backend.job.models.job_options import JobOptions
|
||||
|
||||
# PDM
|
||||
import pydantic
|
||||
|
||||
|
||||
class Element(pydantic.BaseModel):
|
||||
name: str
|
||||
xpath: str
|
||||
url: Optional[str] = None
|
||||
from api.backend.job.models import Element, CapturedElement
|
||||
|
||||
|
||||
class CapturedElement(pydantic.BaseModel):
|
||||
xpath: str
|
||||
text: str
|
||||
name: str
|
||||
class Job(pydantic.BaseModel):
|
||||
id: Optional[str] = None
|
||||
url: str
|
||||
elements: list[Element]
|
||||
user: str = ""
|
||||
time_created: Optional[Union[datetime, str]] = None
|
||||
result: list[dict[str, dict[str, list[CapturedElement]]]] = []
|
||||
job_options: JobOptions
|
||||
status: str = "Queued"
|
||||
chat: Optional[str] = None
|
||||
agent_mode: bool = False
|
||||
prompt: Optional[str] = None
|
||||
favorite: bool = False
|
||||
|
||||
|
||||
class RetrieveScrapeJobs(pydantic.BaseModel):
|
||||
@@ -34,41 +34,7 @@ class DeleteScrapeJobs(pydantic.BaseModel):
|
||||
ids: list[str]
|
||||
|
||||
|
||||
class GetStatistics(pydantic.BaseModel):
|
||||
user: str
|
||||
|
||||
|
||||
class UpdateJobs(pydantic.BaseModel):
|
||||
ids: list[str]
|
||||
field: str
|
||||
value: Any
|
||||
|
||||
|
||||
class AI(pydantic.BaseModel):
|
||||
messages: list[Any]
|
||||
|
||||
|
||||
class Job(pydantic.BaseModel):
|
||||
id: Optional[str] = None
|
||||
url: str
|
||||
elements: list[Element]
|
||||
user: str = ""
|
||||
time_created: Optional[Union[datetime, str]] = None
|
||||
result: list[dict[str, dict[str, list[CapturedElement]]]] = []
|
||||
job_options: JobOptions
|
||||
status: str = "Queued"
|
||||
chat: Optional[str] = None
|
||||
|
||||
|
||||
class CronJob(pydantic.BaseModel):
|
||||
id: Optional[str] = None
|
||||
user_email: str
|
||||
job_id: str
|
||||
cron_expression: str
|
||||
time_created: Optional[Union[datetime, str]] = None
|
||||
time_updated: Optional[Union[datetime, str]] = None
|
||||
|
||||
|
||||
class DeleteCronJob(pydantic.BaseModel):
|
||||
id: str
|
||||
user_email: str
|
||||
api/backend/stats/stats_router.py (new file, 37 lines)
@@ -0,0 +1,37 @@
|
||||
# STL
|
||||
import logging
|
||||
|
||||
# PDM
|
||||
from fastapi import Depends, APIRouter
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
# LOCAL
|
||||
from api.backend.auth.schemas import User
|
||||
from api.backend.database.base import get_db
|
||||
from api.backend.auth.auth_utils import get_current_user
|
||||
from api.backend.routers.handle_exceptions import handle_exceptions
|
||||
from api.backend.database.queries.statistics.statistic_queries import (
|
||||
get_jobs_per_day,
|
||||
average_elements_per_link,
|
||||
)
|
||||
|
||||
LOG = logging.getLogger("Statistics")
|
||||
|
||||
stats_router = APIRouter()
|
||||
|
||||
|
||||
@stats_router.get("/statistics/get-average-element-per-link")
|
||||
@handle_exceptions(logger=LOG)
|
||||
async def get_average_element_per_link(
|
||||
user: User = Depends(get_current_user), db: AsyncSession = Depends(get_db)
|
||||
):
|
||||
return await average_elements_per_link(db, user.email)
|
||||
|
||||
|
||||
@stats_router.get("/statistics/get-average-jobs-per-day")
|
||||
@handle_exceptions(logger=LOG)
|
||||
async def average_jobs_per_day(
|
||||
user: User = Depends(get_current_user), db: AsyncSession = Depends(get_db)
|
||||
):
|
||||
data = await get_jobs_per_day(db, user.email)
|
||||
return data
|
||||
api/backend/tests/conftest.py (new file, 108 lines)
@@ -0,0 +1,108 @@
|
||||
# STL
|
||||
import os
|
||||
import asyncio
|
||||
from typing import Any, Generator, AsyncGenerator
|
||||
|
||||
# PDM
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from httpx import AsyncClient, ASGITransport
|
||||
from proxy import Proxy
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.pool import NullPool
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
||||
|
||||
# LOCAL
|
||||
from api.backend.app import app
|
||||
from api.backend.database.base import get_db
|
||||
from api.backend.database.models import Base
|
||||
from api.backend.tests.constants import TEST_DB_PATH
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def running_proxy():
|
||||
proxy = Proxy(["--hostname", "127.0.0.1", "--port", "8080"])
|
||||
proxy.setup()
|
||||
yield proxy
|
||||
proxy.shutdown()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def test_db_path() -> str:
|
||||
return TEST_DB_PATH
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def test_db(test_db_path: str) -> Generator[str, None, None]:
|
||||
"""Create a fresh test database for each test function."""
|
||||
os.makedirs(os.path.dirname(test_db_path), exist_ok=True)
|
||||
|
||||
if os.path.exists(test_db_path):
|
||||
os.remove(test_db_path)
|
||||
|
||||
# Create async engine for test database
|
||||
test_db_url = f"sqlite+aiosqlite:///{test_db_path}"
|
||||
engine = create_async_engine(test_db_url, echo=False)
|
||||
|
||||
async def setup_db():
|
||||
async with engine.begin() as conn:
|
||||
# Create tables
|
||||
# LOCAL
|
||||
from api.backend.database.models import Base
|
||||
|
||||
await conn.run_sync(Base.metadata.create_all)
|
||||
|
||||
# Run setup
|
||||
asyncio.run(setup_db())
|
||||
|
||||
yield test_db_path
|
||||
|
||||
if os.path.exists(test_db_path):
|
||||
os.remove(test_db_path)
|
||||
|
||||
|
||||
@pytest_asyncio.fixture(scope="session")
|
||||
async def test_engine():
|
||||
test_db_url = f"sqlite+aiosqlite:///{TEST_DB_PATH}"
|
||||
engine = create_async_engine(test_db_url, poolclass=NullPool)
|
||||
async with engine.begin() as conn:
|
||||
await conn.run_sync(Base.metadata.create_all)
|
||||
yield engine
|
||||
await engine.dispose()
|
||||
|
||||
|
||||
@pytest_asyncio.fixture(scope="function")
|
||||
async def db_session(test_engine: Any) -> AsyncGenerator[AsyncSession, None]:
|
||||
async_session = async_sessionmaker(
|
||||
bind=test_engine,
|
||||
class_=AsyncSession,
|
||||
expire_on_commit=False,
|
||||
)
|
||||
|
||||
async with async_session() as session:
|
||||
try:
|
||||
yield session
|
||||
finally:
|
||||
# Truncate all tables after each test
|
||||
for table in reversed(Base.metadata.sorted_tables):
|
||||
await session.execute(text(f"DELETE FROM {table.name}"))
|
||||
await session.commit()
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def override_get_db(db_session: AsyncSession):
|
||||
async def _override() -> AsyncGenerator[AsyncSession, None]:
|
||||
yield db_session
|
||||
|
||||
return _override
|
||||
|
||||
|
||||
@pytest_asyncio.fixture()
|
||||
async def client(override_get_db: Any) -> AsyncGenerator[AsyncClient, None]:
|
||||
app.dependency_overrides[get_db] = override_get_db
|
||||
|
||||
transport = ASGITransport(app=app)
|
||||
async with AsyncClient(transport=transport, base_url="http://test") as c:
|
||||
yield c
|
||||
|
||||
app.dependency_overrides.clear()
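For context, a minimal test consuming the client fixture defined above could look like the following sketch; the endpoint path comes from the stats router added in this change set, and the accepted status codes are an assumption since no auth token is supplied:

import pytest
from httpx import AsyncClient


@pytest.mark.asyncio
async def test_stats_endpoint_responds(client: AsyncClient):
    # The fixture already wires the app to the temporary test database via dependency overrides.
    response = await client.get("/statistics/get-average-jobs-per-day")
    assert response.status_code in (200, 401)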
|
||||
api/backend/tests/constants.py (new file, 1 line)
@@ -0,0 +1 @@
TEST_DB_PATH = "tests/test_db.sqlite"
|
||||
@@ -1,7 +1,13 @@
|
||||
from api.backend.models import Element, Job, JobOptions, CapturedElement
|
||||
# STL
|
||||
import uuid
|
||||
|
||||
# PDM
|
||||
from faker import Faker
|
||||
|
||||
# LOCAL
|
||||
from api.backend.job.models import Element, JobOptions, CapturedElement
|
||||
from api.backend.schemas.job import Job
|
||||
|
||||
fake = Faker()
|
||||
|
||||
|
||||
|
||||
@@ -1,40 +1,65 @@
|
||||
# STL
|
||||
import random
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# PDM
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
from unittest.mock import AsyncMock, patch
|
||||
from api.backend.app import app
|
||||
from api.backend.models import DownloadJob
|
||||
from api.backend.tests.factories.job_factory import create_completed_job
|
||||
from httpx import AsyncClient
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
client = TestClient(app)
|
||||
# LOCAL
|
||||
from api.backend.schemas.job import DownloadJob
|
||||
from api.backend.database.models import Job
|
||||
|
||||
mocked_job = create_completed_job().model_dump()
|
||||
mock_results = [mocked_job]
|
||||
mocked_random_int = 123456
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@patch("api.backend.routers.job_router.query")
|
||||
@patch("api.backend.routers.job_router.random.randint")
|
||||
async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
|
||||
# Ensure the mock returns immediately
|
||||
mock_query.return_value = mock_results
|
||||
mock_randint.return_value = mocked_random_int
|
||||
async def test_download(client: AsyncClient, db_session: AsyncSession):
|
||||
# Insert a test job into the DB
|
||||
job_id = "test-job-id"
|
||||
test_job = Job(
|
||||
id=job_id,
|
||||
url="https://example.com",
|
||||
elements=[],
|
||||
user="test@example.com",
|
||||
time_created=datetime.now(timezone.utc),
|
||||
result=[
|
||||
{
|
||||
"https://example.com": {
|
||||
"element_name": [{"xpath": "//div", "text": "example"}]
|
||||
}
|
||||
}
|
||||
],
|
||||
status="Completed",
|
||||
chat=None,
|
||||
job_options={},
|
||||
agent_mode=False,
|
||||
prompt="",
|
||||
favorite=False,
|
||||
)
|
||||
db_session.add(test_job)
|
||||
await db_session.commit()
|
||||
|
||||
# Create a DownloadJob instance
|
||||
download_job = DownloadJob(ids=[mocked_job["id"]], job_format="csv")
|
||||
# Force predictable randint
|
||||
random.seed(0)
|
||||
|
||||
# Make a POST request to the /download endpoint
|
||||
response = client.post("/download", json=download_job.model_dump())
|
||||
# Build request
|
||||
download_job = DownloadJob(ids=[job_id], job_format="csv")
|
||||
response = await client.post("/download", json=download_job.model_dump())
|
||||
|
||||
# Assertions
|
||||
assert response.status_code == 200
|
||||
assert response.headers["Content-Disposition"] == "attachment; filename=export.csv"
|
||||
|
||||
# Check the content of the CSV
|
||||
# Validate CSV contents
|
||||
csv_content = response.content.decode("utf-8")
|
||||
expected_csv = (
|
||||
f'"id","url","element_name","xpath","text","user","time_created"\r\n'
|
||||
f'"{mocked_job["id"]}-{mocked_random_int}","https://example.com","element_name","//div","example",'
|
||||
f'"{mocked_job["user"]}","{mocked_job["time_created"]}"\r\n'
|
||||
lines = csv_content.strip().split("\n")
|
||||
|
||||
assert (
|
||||
lines[0].strip()
|
||||
== '"id","url","element_name","xpath","text","user","time_created"'
|
||||
)
|
||||
assert csv_content == expected_csv
|
||||
assert '"https://example.com"' in lines[1]
|
||||
assert '"element_name"' in lines[1]
|
||||
assert '"//div"' in lines[1]
|
||||
assert '"example"' in lines[1]
|
||||
|
||||
@@ -1,25 +1,117 @@
|
||||
import pytest
|
||||
# STL
|
||||
import logging
|
||||
from playwright.async_api import async_playwright, Error
|
||||
from typing import Dict
|
||||
from datetime import datetime
|
||||
|
||||
# PDM
|
||||
import pytest
|
||||
from httpx import AsyncClient
|
||||
from sqlalchemy import select
|
||||
from fastapi.testclient import TestClient
|
||||
from playwright.async_api import Route, Cookie, async_playwright
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
# LOCAL
|
||||
from api.backend.app import app
|
||||
from api.backend.job.models import Proxy, Element, JobOptions
|
||||
from api.backend.schemas.job import Job
|
||||
from api.backend.database.models import Job as JobModel
|
||||
from api.backend.job.scraping.add_custom import add_custom_items
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
client = TestClient(app)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_proxy():
|
||||
proxy = "127.0.0.1:8080"
|
||||
async def test_add_custom_items():
|
||||
test_cookies = [{"name": "big", "value": "cookie"}]
|
||||
test_headers = {"User-Agent": "test-agent", "Accept": "application/json"}
|
||||
|
||||
async with async_playwright() as p:
|
||||
browser = await p.firefox.launch(
|
||||
headless=True, proxy={"server": f"http://{proxy}"}
|
||||
)
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
context = await browser.new_context()
|
||||
page = await context.new_page()
|
||||
|
||||
with pytest.raises(Error) as excinfo:
|
||||
await page.goto("http://example.com")
|
||||
# Set up request interception
|
||||
captured_headers: Dict[str, str] = {}
|
||||
|
||||
assert "NS_ERROR_PROXY_CONNECTION_REFUSED" in str(excinfo.value)
|
||||
async def handle_route(route: Route) -> None:
|
||||
nonlocal captured_headers
|
||||
captured_headers = route.request.headers
|
||||
await route.continue_()
|
||||
|
||||
await page.route("**/*", handle_route)
|
||||
|
||||
await add_custom_items(
|
||||
url="http://example.com",
|
||||
page=page,
|
||||
cookies=test_cookies,
|
||||
headers=test_headers,
|
||||
)
|
||||
|
||||
# Navigate to example.com
|
||||
await page.goto("http://example.com")
|
||||
|
||||
# Verify cookies were added
|
||||
cookies: list[Cookie] = await page.context.cookies()
|
||||
test_cookie = next((c for c in cookies if c.get("name") == "big"), None)
|
||||
|
||||
assert test_cookie is not None
|
||||
assert test_cookie.get("value") == "cookie"
|
||||
assert test_cookie.get("path") == "/" # Default path should be set
|
||||
assert test_cookie.get("sameSite") == "Lax" # Default sameSite should be set
|
||||
|
||||
# Verify headers were added
|
||||
assert captured_headers.get("user-agent") == "test-agent"
|
||||
|
||||
await browser.close()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_proxies(client: AsyncClient, db_session: AsyncSession):
|
||||
job = Job(
|
||||
url="https://example.com",
|
||||
elements=[Element(xpath="//div", name="test")],
|
||||
job_options=JobOptions(
|
||||
proxies=[
|
||||
Proxy(
|
||||
server="127.0.0.1:8080",
|
||||
username="user",
|
||||
password="pass",
|
||||
)
|
||||
],
|
||||
),
|
||||
time_created=datetime.now().isoformat(),
|
||||
)
|
||||
|
||||
response = await client.post("/submit-scrape-job", json=job.model_dump())
|
||||
assert response.status_code == 200
|
||||
|
||||
stmt = select(JobModel)
|
||||
result = await db_session.execute(stmt)
|
||||
jobs = result.scalars().all()
|
||||
|
||||
assert len(jobs) > 0
|
||||
job_from_db = jobs[0]
|
||||
|
||||
job_dict = job_from_db.__dict__
|
||||
job_dict.pop("_sa_instance_state", None)
|
||||
|
||||
assert job_dict is not None
|
||||
print(job_dict)
|
||||
assert job_dict["job_options"]["proxies"] == [
|
||||
{
|
||||
"server": "127.0.0.1:8080",
|
||||
"username": "user",
|
||||
"password": "pass",
|
||||
}
|
||||
]
|
||||
|
||||
# Verify the job was stored correctly in the database
|
||||
assert job_dict["url"] == "https://example.com"
|
||||
assert job_dict["status"] == "Queued"
|
||||
assert len(job_dict["elements"]) == 1
|
||||
assert job_dict["elements"][0]["xpath"] == "//div"
|
||||
assert job_dict["elements"][0]["name"] == "test"
|
||||
|
||||
api/backend/tests/utilities/database.py (new file, 17 lines)
@@ -0,0 +1,17 @@
# STL
import sqlite3

# LOCAL
from api.backend.database.schema import INIT_QUERY
from api.backend.tests.constants import TEST_DB_PATH


def connect_to_db():
    conn = sqlite3.connect(TEST_DB_PATH)
    cur = conn.cursor()

    for query in INIT_QUERY.split(";"):
        cur.execute(query)

    conn.commit()
    return conn, cur
|
||||
@@ -1,17 +1,10 @@
|
||||
from typing import Any, Optional
|
||||
# STL
|
||||
import logging
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def clean_text(text: str):
|
||||
text = text.replace("\r\n", "\n") # Normalize newlines
|
||||
text = text.replace("\n", "\\n") # Escape newlines
|
||||
text = text.replace('"', '\\"') # Escape double quotes
|
||||
return text
|
||||
|
||||
|
||||
def get_log_level(level_name: Optional[str]) -> int:
|
||||
level = logging.INFO
|
||||
|
||||
@@ -20,30 +13,3 @@ def get_log_level(level_name: Optional[str]) -> int:
|
||||
level = getattr(logging, level_name, logging.INFO)
|
||||
|
||||
return level
|
||||
|
||||
|
||||
def format_list_for_query(ids: list[str]):
|
||||
return (
|
||||
f"({','.join(['?' for _ in ids])})" # Returns placeholders, e.g., "(?, ?, ?)"
|
||||
)
|
||||
|
||||
|
||||
def format_sql_row_to_python(row: dict[str, Any]):
|
||||
new_row: dict[str, Any] = {}
|
||||
for key, value in row.items():
|
||||
if isinstance(value, str):
|
||||
try:
|
||||
new_row[key] = json.loads(value)
|
||||
except json.JSONDecodeError:
|
||||
new_row[key] = value
|
||||
else:
|
||||
new_row[key] = value
|
||||
|
||||
return new_row
|
||||
|
||||
|
||||
def format_json(items: list[Any]):
|
||||
for idx, item in enumerate(items):
|
||||
if isinstance(item, (dict, list)):
|
||||
formatted_item = json.dumps(item)
|
||||
items[idx] = formatted_item
|
||||
|
||||
api/backend/worker/constants.py (new file, 17 lines)
@@ -0,0 +1,17 @@
# STL
import os
from pathlib import Path

NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "")
NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "")
SCRAPERR_FRONTEND_URL = os.getenv("SCRAPERR_FRONTEND_URL", "")
EMAIL = os.getenv("EMAIL", "")
TO = os.getenv("TO", "")
SMTP_HOST = os.getenv("SMTP_HOST", "")
SMTP_PORT = int(os.getenv("SMTP_PORT", 587))
SMTP_USER = os.getenv("SMTP_USER", "")
SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
USE_TLS = os.getenv("USE_TLS", "false").lower() == "true"

RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
RECORDINGS_DIR = Path("/project/app/media/recordings")
|
||||
@@ -1,48 +1,88 @@
|
||||
import os
|
||||
|
||||
from api.backend.job import get_queued_job, update_job
|
||||
from api.backend.scraping import scrape
|
||||
from api.backend.models import Element
|
||||
from fastapi.encoders import jsonable_encoder
|
||||
|
||||
# STL
|
||||
import json
|
||||
import asyncio
|
||||
import traceback
|
||||
import subprocess
|
||||
|
||||
from api.backend.database.startup import init_database
|
||||
# PDM
|
||||
from fastapi.encoders import jsonable_encoder
|
||||
|
||||
from api.backend.worker.post_job_complete.post_job_complete import post_job_complete
|
||||
# LOCAL
|
||||
from api.backend.job import update_job, get_queued_job
|
||||
from api.backend.job.models import Element
|
||||
from api.backend.worker.logger import LOG
|
||||
|
||||
|
||||
NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "")
|
||||
NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "")
|
||||
SCRAPERR_FRONTEND_URL = os.getenv("SCRAPERR_FRONTEND_URL", "")
|
||||
EMAIL = os.getenv("EMAIL", "")
|
||||
TO = os.getenv("TO", "")
|
||||
SMTP_HOST = os.getenv("SMTP_HOST", "")
|
||||
SMTP_PORT = int(os.getenv("SMTP_PORT", 587))
|
||||
SMTP_USER = os.getenv("SMTP_USER", "")
|
||||
SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
|
||||
USE_TLS = os.getenv("USE_TLS", "false").lower() == "true"
|
||||
from api.backend.ai.agent.agent import scrape_with_agent
|
||||
from api.backend.worker.constants import (
|
||||
TO,
|
||||
EMAIL,
|
||||
USE_TLS,
|
||||
SMTP_HOST,
|
||||
SMTP_PORT,
|
||||
SMTP_USER,
|
||||
SMTP_PASSWORD,
|
||||
RECORDINGS_DIR,
|
||||
RECORDINGS_ENABLED,
|
||||
NOTIFICATION_CHANNEL,
|
||||
SCRAPERR_FRONTEND_URL,
|
||||
NOTIFICATION_WEBHOOK_URL,
|
||||
)
|
||||
from api.backend.job.scraping.scraping import scrape
|
||||
from api.backend.worker.post_job_complete.post_job_complete import post_job_complete
|
||||
|
||||
|
||||
async def process_job():
|
||||
job = await get_queued_job()
|
||||
ffmpeg_proc = None
|
||||
status = "Queued"
|
||||
|
||||
if job:
|
||||
LOG.info(f"Beginning processing job: {job}.")
|
||||
|
||||
try:
|
||||
output_path = RECORDINGS_DIR / f"{job['id']}.mp4"
|
||||
|
||||
if RECORDINGS_ENABLED:
|
||||
ffmpeg_proc = subprocess.Popen(
|
||||
[
|
||||
"ffmpeg",
|
||||
"-y",
|
||||
"-video_size",
|
||||
"1280x1024",
|
||||
"-framerate",
|
||||
"15",
|
||||
"-f",
|
||||
"x11grab",
|
||||
"-i",
|
||||
":99",
|
||||
"-codec:v",
|
||||
"libx264",
|
||||
"-preset",
|
||||
"ultrafast",
|
||||
output_path,
|
||||
]
|
||||
)
|
||||
|
||||
_ = await update_job([job["id"]], field="status", value="Scraping")
|
||||
scraped = await scrape(
|
||||
job["url"],
|
||||
[Element(**j) for j in job["elements"]],
|
||||
job["job_options"]["custom_headers"],
|
||||
job["job_options"]["multi_page_scrape"],
|
||||
job["job_options"]["proxies"],
|
||||
job["job_options"]["site_map"],
|
||||
job["job_options"]["collect_media"],
|
||||
)
|
||||
|
||||
proxies = job["job_options"]["proxies"]
|
||||
|
||||
if proxies and isinstance(proxies[0], str) and proxies[0].startswith("{"):
|
||||
try:
|
||||
proxies = [json.loads(p) for p in proxies]
|
||||
except json.JSONDecodeError:
|
||||
LOG.error(f"Failed to parse proxy JSON: {proxies}")
|
||||
proxies = []
|
||||
|
||||
if job["agent_mode"]:
|
||||
scraped = await scrape_with_agent(job)
|
||||
else:
|
||||
scraped = await scrape(
|
||||
job["id"],
|
||||
job["url"],
|
||||
[Element(**j) for j in job["elements"]],
|
||||
{**job["job_options"], "proxies": proxies},
|
||||
)
|
||||
|
||||
LOG.info(
|
||||
f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
|
||||
)
|
||||
@@ -75,11 +115,15 @@ async def process_job():
|
||||
},
|
||||
)
|
||||
|
||||
if ffmpeg_proc:
|
||||
ffmpeg_proc.terminate()
|
||||
ffmpeg_proc.wait()
|
||||
|
||||
|
||||
async def main():
|
||||
LOG.info("Starting job worker...")
|
||||
|
||||
init_database()
|
||||
RECORDINGS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
while True:
|
||||
await process_job()
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
# STL
|
||||
import logging
|
||||
import os
|
||||
|
||||
from api.backend.utils import get_log_level
|
||||
# LOCAL
|
||||
from api.backend.app import LOG_LEVEL
|
||||
|
||||
logging.basicConfig(
|
||||
level=get_log_level(os.getenv("LOG_LEVEL")),
|
||||
format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
|
||||
level=LOG_LEVEL,
|
||||
format="%(levelname)s: %(asctime)s - [%(name)s] - %(message)s",
|
||||
handlers=[logging.StreamHandler()],
|
||||
)
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
LOG = logging.getLogger("Job Worker")
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
# STL
|
||||
from typing import Any
|
||||
|
||||
# LOCAL
|
||||
from api.backend.worker.post_job_complete.models import PostJobCompleteOptions
|
||||
from api.backend.worker.post_job_complete.email_notifcation import (
|
||||
send_job_complete_email,
|
||||
@@ -16,9 +18,10 @@ async def post_job_complete(job: dict[str, Any], options: PostJobCompleteOptions
|
||||
if not options.values():
|
||||
return
|
||||
|
||||
if options["channel"] == "discord":
|
||||
discord_notification(job, options)
|
||||
elif options["channel"] == "email":
|
||||
send_job_complete_email(job, options)
|
||||
else:
|
||||
raise ValueError(f"Invalid channel: {options['channel']}")
|
||||
match options["channel"]:
|
||||
case "discord":
|
||||
discord_notification(job, options)
|
||||
case "email":
|
||||
send_job_complete_email(job, options)
|
||||
case _:
|
||||
raise ValueError(f"Invalid channel: {options['channel']}")
|
||||
|
||||
cypress/e2e/00-setup.cy.ts (new file, 23 lines)
@@ -0,0 +1,23 @@
|
||||
describe("Global setup", () => {
|
||||
it("signs up user once", () => {
|
||||
cy.request({
|
||||
method: "POST",
|
||||
url: "/api/signup",
|
||||
body: JSON.stringify({
|
||||
data: {
|
||||
email: "test@test.com",
|
||||
password: "password",
|
||||
full_name: "John Doe",
|
||||
},
|
||||
}),
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
failOnStatusCode: false,
|
||||
}).then((response) => {
|
||||
if (response.status !== 200 && response.status !== 201) {
|
||||
console.warn("Signup failed:", response.status, response.body);
|
||||
}
|
||||
});
|
||||
});
|
||||
});
|
||||
cypress/e2e/advanced-job-options.cy.ts (new file, 101 lines)
@@ -0,0 +1,101 @@
|
||||
import { login } from "../utilities/authentication.utils";
|
||||
import {
|
||||
addCustomHeaders,
|
||||
addElement,
|
||||
addMedia,
|
||||
addSiteMapAction,
|
||||
checkForMedia,
|
||||
cleanUpJobs,
|
||||
enterJobUrl,
|
||||
openAdvancedJobOptions,
|
||||
submitBasicJob,
|
||||
submitJob,
|
||||
waitForJobCompletion,
|
||||
} from "../utilities/job.utilities";
|
||||
import { mockSubmitJob } from "../utilities/mocks";
|
||||
|
||||
describe.only("Advanced Job Options", () => {
|
||||
beforeEach(() => {
|
||||
mockSubmitJob();
|
||||
login();
|
||||
cy.visit("/");
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
cleanUpJobs();
|
||||
});
|
||||
|
||||
it.only("should handle custom headers", () => {
|
||||
const customHeaders = {
|
||||
"User-Agent": "Test Agent",
|
||||
"Accept-Language": "en-US",
|
||||
};
|
||||
|
||||
addCustomHeaders(customHeaders);
|
||||
submitBasicJob("https://httpbin.org/headers", "headers", "//pre");
|
||||
|
||||
cy.wait("@submitScrapeJob").then((interception) => {
|
||||
expect(interception.response?.statusCode).to.eq(200);
|
||||
expect(
|
||||
interception.request?.body.data.job_options.custom_headers
|
||||
).to.deep.equal(customHeaders);
|
||||
});
|
||||
|
||||
waitForJobCompletion("https://httpbin.org/headers");
|
||||
});
|
||||
|
||||
it("should handle site map actions", () => {
|
||||
addSiteMapAction("click", "//button[contains(text(), 'Load More')]");
|
||||
addSiteMapAction("input", "//input[@type='search']", "test search");
|
||||
|
||||
submitBasicJob("https://example.com", "content", "//div[@class='content']");
|
||||
|
||||
cy.wait("@submitScrapeJob").then((interception) => {
|
||||
expect(interception.response?.statusCode).to.eq(200);
|
||||
const siteMap = interception.request?.body.data.job_options.site_map;
|
||||
expect(siteMap.actions).to.have.length(2);
|
||||
expect(siteMap.actions[0].type).to.equal("click");
|
||||
expect(siteMap.actions[1].type).to.equal("input");
|
||||
});
|
||||
|
||||
waitForJobCompletion("https://example.com");
|
||||
});
|
||||
|
||||
it("should handle multiple elements", () => {
|
||||
enterJobUrl("https://books.toscrape.com");
|
||||
|
||||
addElement("titles", "//h3");
|
||||
addElement("prices", "//p[@class='price_color']");
|
||||
|
||||
submitJob();
|
||||
|
||||
cy.wait("@submitScrapeJob").then((interception) => {
|
||||
expect(interception.response?.statusCode).to.eq(200);
|
||||
expect(interception.request?.body.data.elements).to.have.length(2);
|
||||
});
|
||||
|
||||
waitForJobCompletion("https://books.toscrape.com");
|
||||
});
|
||||
|
||||
it.only("should handle collecting media", () => {
|
||||
enterJobUrl("https://books.toscrape.com");
|
||||
|
||||
openAdvancedJobOptions();
|
||||
addMedia();
|
||||
|
||||
cy.get("body").type("{esc}");
|
||||
|
||||
addElement("images", "//img");
|
||||
|
||||
submitJob();
|
||||
|
||||
cy.wait("@submitScrapeJob").then((interception) => {
|
||||
expect(interception.response?.statusCode).to.eq(200);
|
||||
expect(interception.request?.body.data.job_options.collect_media).to.be
|
||||
.true;
|
||||
});
|
||||
|
||||
waitForJobCompletion("https://books.toscrape.com");
|
||||
checkForMedia();
|
||||
});
|
||||
});
|
||||
cypress/e2e/agent.cy.ts (new file, 38 lines)
@@ -0,0 +1,38 @@
|
||||
import { login } from "../utilities/authentication.utils";
|
||||
import {
|
||||
buildAgentJob,
|
||||
cleanUpJobs,
|
||||
submitJob,
|
||||
waitForJobCompletion,
|
||||
} from "../utilities/job.utilities";
|
||||
import { mockSubmitJob } from "../utilities/mocks";
|
||||
|
||||
describe("Agent", () => {
|
||||
beforeEach(() => {
|
||||
mockSubmitJob();
|
||||
login();
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
cleanUpJobs();
|
||||
});
|
||||
|
||||
it("should be able to scrape some data", () => {
|
||||
cy.visit("/agent");
|
||||
cy.wait(1000);
|
||||
|
||||
const url = "https://books.toscrape.com";
|
||||
const prompt = "Collect all the links on the page";
|
||||
buildAgentJob(url, prompt);
|
||||
|
||||
submitJob();
|
||||
|
||||
cy.wait("@submitScrapeJob").then((interception) => {
|
||||
expect(interception.response?.statusCode).to.eq(200);
|
||||
expect(interception.request?.body.data.url).to.eq(url);
|
||||
expect(interception.request?.body.data.prompt).to.eq(prompt);
|
||||
});
|
||||
|
||||
waitForJobCompletion("https://books.toscrape.com");
|
||||
});
|
||||
});
|
||||
@@ -1,60 +1,61 @@
|
||||
describe("Authentication", () => {
|
||||
it("should register", () => {
|
||||
cy.intercept("POST", "/api/signup").as("signup");
|
||||
import { faker } from "@faker-js/faker";
|
||||
import { mockLogin, mockSignup } from "../utilities/mocks";
|
||||
|
||||
cy.visit("/").then(() => {
|
||||
cy.get("button").contains("Login").click();
|
||||
cy.url().should("include", "/login");
|
||||
const mockEmail = faker.internet.email();
|
||||
const mockPassword = faker.internet.password();
|
||||
|
||||
cy.get("form").should("be.visible");
|
||||
cy.get("button")
|
||||
.contains("No Account? Sign up")
|
||||
.should("be.visible")
|
||||
.click();
|
||||
|
||||
cy.get("input[name='email']").type("test@test.com");
|
||||
cy.get("input[name='password']").type("password");
|
||||
cy.get("input[name='fullName']").type("John Doe");
|
||||
cy.get("button[type='submit']").contains("Signup").click();
|
||||
|
||||
cy.wait("@signup").then((interception) => {
|
||||
if (!interception.response) {
|
||||
cy.log("No response received!");
|
||||
throw new Error("signup request did not return a response");
|
||||
}
|
||||
|
||||
cy.log("Response status: " + interception.response.statusCode);
|
||||
cy.log("Response body: " + JSON.stringify(interception.response.body));
|
||||
|
||||
expect(interception.response.statusCode).to.eq(200);
|
||||
});
|
||||
});
|
||||
describe.only("Authentication", () => {
|
||||
beforeEach(() => {
|
||||
cy.visit("/");
|
||||
mockSignup();
|
||||
mockLogin();
|
||||
});
|
||||
|
||||
it("should login", () => {
|
||||
cy.intercept("POST", "/api/token").as("token");
|
||||
it("should register", () => {
|
||||
cy.get("button").contains("Login").click();
|
||||
cy.url().should("include", "/login");
|
||||
|
||||
cy.visit("/").then(() => {
|
||||
cy.get("button")
|
||||
.contains("Login")
|
||||
.click()
|
||||
.then(() => {
|
||||
cy.get("input[name='email']").type("test@test.com");
|
||||
cy.get("input[name='password']").type("password");
|
||||
cy.get("button[type='submit']").contains("Login").click();
|
||||
cy.get("form").should("be.visible");
|
||||
|
||||
cy.wait("@token").then((interception) => {
|
||||
if (!interception.response) {
|
||||
cy.log("No response received!");
|
||||
throw new Error("token request did not return a response");
|
||||
}
|
||||
cy.get("button")
|
||||
.contains("No Account? Sign up")
|
||||
.should("be.visible")
|
||||
.click();
|
||||
|
||||
cy.log("Response status: " + interception.response.statusCode);
|
||||
cy.log("Response body: " + JSON.stringify(interception.response.body));
|
||||
cy.get("input[name='email']").type(mockEmail);
|
||||
cy.get("input[name='password']").type(mockPassword);
|
||||
cy.get("input[name='fullName']").type(faker.person.fullName());
|
||||
cy.get("button[type='submit']").contains("Signup").click();
|
||||
|
||||
expect(interception.response.statusCode).to.eq(200);
|
||||
});
|
||||
});
|
||||
cy.wait("@signup").then((interception) => {
|
||||
if (!interception.response) {
|
||||
throw new Error("signup request did not return a response");
|
||||
}
|
||||
|
||||
expect(interception.response.statusCode).to.eq(200);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
it("should login", () => {
|
||||
cy.intercept("POST", "/api/token").as("token");
|
||||
|
||||
cy.visit("/").then(() => {
|
||||
cy.get("button")
|
||||
.contains("Login")
|
||||
.click()
|
||||
.then(() => {
|
||||
cy.get("input[name='email']").type(mockEmail);
|
||||
cy.get("input[name='password']").type(mockPassword);
|
||||
cy.get("button[type='submit']").contains("Login").click();
|
||||
|
||||
cy.wait("@token").then((interception) => {
|
||||
if (!interception.response) {
|
||||
throw new Error("token request did not return a response");
|
||||
}
|
||||
|
||||
expect(interception.response.statusCode).to.eq(200);
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
cypress/e2e/chat.cy.ts (new file, 34 lines)
@@ -0,0 +1,34 @@
|
||||
import { login } from "../utilities/authentication.utils";
|
||||
import {
|
||||
cleanUpJobs,
|
||||
selectJobFromSelector,
|
||||
submitBasicJob,
|
||||
waitForJobCompletion,
|
||||
} from "../utilities/job.utilities";
|
||||
import { mockLogin } from "../utilities/mocks";
|
||||
|
||||
describe.only("Chat", () => {
|
||||
beforeEach(() => {
|
||||
mockLogin();
|
||||
login();
|
||||
cy.visit("/");
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
cleanUpJobs();
|
||||
});
|
||||
|
||||
it.only("should be able to chat", () => {
|
||||
const url = "https://books.toscrape.com";
|
||||
submitBasicJob(url, "test", "//body");
|
||||
waitForJobCompletion(url);
|
||||
|
||||
cy.visit("/chat");
|
||||
selectJobFromSelector();
|
||||
|
||||
cy.get("[data-cy='message-input']").type("Hello");
|
||||
cy.get("[data-cy='send-message']").click();
|
||||
|
||||
cy.get("[data-cy='ai-message']").should("exist");
|
||||
});
|
||||
});
|
||||
@@ -1,34 +1,37 @@
|
||||
import { login } from "../utilities/authentication.utils";
|
||||
import {
|
||||
addElement,
|
||||
cleanUpJobs,
|
||||
enterJobUrl,
|
||||
submitJob,
|
||||
waitForJobCompletion,
|
||||
} from "../utilities/job.utilities";
|
||||
import { mockSubmitJob } from "../utilities/mocks";
|
||||
|
||||
describe.only("Job", () => {
|
||||
it("should create a job", () => {
|
||||
cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
|
||||
|
||||
beforeEach(() => {
|
||||
mockSubmitJob();
|
||||
login();
|
||||
cy.visit("/");
|
||||
});
|
||||
|
||||
cy.get('[data-cy="url-input"]').type("https://example.com");
|
||||
cy.get('[data-cy="name-field"]').type("example");
|
||||
cy.get('[data-cy="xpath-field"]').type("//body");
|
||||
cy.get('[data-cy="add-button"]').click();
|
||||
afterEach(() => {
|
||||
cleanUpJobs();
|
||||
});
|
||||
|
||||
cy.contains("Submit").click();
|
||||
it("should create a job", () => {
|
||||
enterJobUrl("https://books.toscrape.com");
|
||||
addElement("body", "//body");
|
||||
submitJob();
|
||||
|
||||
cy.wait("@submitScrapeJob").then((interception) => {
|
||||
if (!interception.response) {
|
||||
cy.log("No response received!");
|
||||
cy.log("Request body: " + JSON.stringify(interception.request?.body));
|
||||
throw new Error("submitScrapeJob request did not return a response");
|
||||
}
|
||||
|
||||
cy.log("Response status: " + interception.response.statusCode);
|
||||
cy.log("Response body: " + JSON.stringify(interception.response.body));
|
||||
|
||||
expect(interception.response.statusCode).to.eq(200);
|
||||
});
|
||||
|
||||
cy.get("li").contains("Jobs").click();
|
||||
|
||||
cy.contains("div", "https://example.com", { timeout: 10000 }).should(
|
||||
"exist"
|
||||
);
|
||||
cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
|
||||
waitForJobCompletion("https://books.toscrape.com");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
// ***********************************************************
|
||||
|
||||
// Import commands.js using ES2015 syntax:
|
||||
import './commands'
|
||||
import "./commands";
|
||||
|
||||
// Alternatively you can use CommonJS syntax:
|
||||
// require('./commands')
|
||||
// require('./commands')
|
||||
|
||||
cypress/utilities/authentication.utils.ts (new file, 68 lines)
@@ -0,0 +1,68 @@
|
||||
export const signup = () => {
|
||||
cy.intercept("POST", "/api/token").as("token");
|
||||
|
||||
cy.visit("/").then(() => {
|
||||
cy.get("button").contains("Login").click();
|
||||
cy.url().should("include", "/login");
|
||||
|
||||
cy.get("form").should("be.visible");
|
||||
cy.get("button")
|
||||
.contains("No Account? Sign up")
|
||||
.should("be.visible")
|
||||
.click();
|
||||
|
||||
cy.get("input[name='email']").type("test@test.com");
|
||||
cy.get("input[name='password']").type("password");
|
||||
cy.get("input[name='fullName']").type("John Doe");
|
||||
cy.get("button[type='submit']").contains("Signup").click();
|
||||
|
||||
cy.wait("@token").then((interception) => {
|
||||
if (!interception.response) {
|
||||
cy.log("No response received!");
|
||||
throw new Error("token request did not return a response");
|
||||
}
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
export const login = () => {
|
||||
cy.intercept("POST", "/api/token").as("token");
|
||||
cy.intercept("GET", "/api/me").as("me");
|
||||
cy.intercept("GET", "/api/check").as("check");
|
||||
|
||||
cy.visit("/").then(() => {
|
||||
cy.get("body").then(() => {
|
||||
cy.get("button")
|
||||
.contains("Login")
|
||||
.click()
|
||||
.then(() => {
|
||||
cy.get("input[name='email']").type("test@test.com");
|
||||
cy.get("input[name='password']").type("password");
|
||||
cy.get("button[type='submit']").contains("Login").click();
|
||||
|
||||
cy.wait("@token").then((interception) => {
|
||||
if (!interception.response) {
|
||||
cy.log("No response received!");
|
||||
throw new Error("token request did not return a response");
|
||||
}
|
||||
});
|
||||
|
||||
cy.wait("@me").then((interception) => {
|
||||
if (!interception.response) {
|
||||
cy.log("No response received!");
|
||||
throw new Error("me request did not return a response");
|
||||
}
|
||||
});
|
||||
|
||||
cy.wait("@check").then((interception) => {
|
||||
if (!interception.response) {
|
||||
cy.log("No response received!");
|
||||
throw new Error("check request did not return a response");
|
||||
}
|
||||
});
|
||||
|
||||
cy.url().should("not.include", "/login");
|
||||
});
|
||||
});
|
||||
});
|
||||
};
|
||||
cypress/utilities/job.utilities.ts (new file, 187 lines)
@@ -0,0 +1,187 @@
|
||||
export const cleanUpJobs = () => {
|
||||
cy.intercept("POST", "/api/retrieve").as("retrieve");
|
||||
cy.visit("/jobs");
|
||||
|
||||
cy.wait("@retrieve", { timeout: 15000 });
|
||||
|
||||
cy.get("tbody tr", { timeout: 20000 }).should("have.length.at.least", 1);
|
||||
|
||||
const tryClickSelectAll = (attempt = 1, maxAttempts = 5) => {
|
||||
cy.log(`Attempt ${attempt} to click Select All`);
|
||||
|
||||
cy.get('[data-testid="select-all"]')
|
||||
.closest("button")
|
||||
.then(($btn) => {
|
||||
// Retry if button is disabled
|
||||
if ($btn.is(":disabled") || $btn.css("pointer-events") === "none") {
|
||||
if (attempt < maxAttempts) {
|
||||
cy.wait(1000).then(() =>
|
||||
tryClickSelectAll(attempt + 1, maxAttempts)
|
||||
);
|
||||
} else {
|
||||
throw new Error(
|
||||
"Select All button is still disabled after max retries"
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// Click and then verify if checkbox is checked
|
||||
cy.wrap($btn)
|
||||
.click({ force: true })
|
||||
.then(() => {
|
||||
cy.get("tbody tr")
|
||||
.first()
|
||||
.find("td")
|
||||
.first()
|
||||
.find("input[type='checkbox']")
|
||||
.should("be.checked")
|
||||
.then(() => {
|
||||
cy.log("Select All successful");
|
||||
});
|
||||
});
|
||||
|
||||
// Handle failure case
|
||||
cy.on("fail", () => {
|
||||
cy.log("Error clicking Select All");
|
||||
if (attempt < maxAttempts) {
|
||||
cy.wait(1000).then(() =>
|
||||
tryClickSelectAll(attempt + 1, maxAttempts)
|
||||
);
|
||||
} else {
|
||||
throw new Error(
|
||||
"Checkbox was never checked after clicking Select All"
|
||||
);
|
||||
}
|
||||
return false; // Prevent Cypress from failing the test
|
||||
});
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
tryClickSelectAll();
|
||||
|
||||
cy.get('[data-testid="DeleteIcon"]', { timeout: 10000 })
|
||||
.closest("button")
|
||||
.should("not.be.disabled")
|
||||
.click();
|
||||
};
|
||||
export const submitBasicJob = (url: string, name: string, xpath: string) => {
  cy.get('[data-cy="url-input"]').type(url);
  cy.get('[data-cy="name-field"]').type(name);
  cy.get('[data-cy="xpath-field"]').type(xpath);
  cy.get('[data-cy="add-button"]').click();
  cy.contains("Submit").click();
};

export const waitForJobCompletion = (url: string) => {
  cy.intercept("POST", "/api/retrieve").as("retrieve");

  cy.visit("/jobs");

  cy.wait("@retrieve", { timeout: 30000 });

  cy.contains("div", url, { timeout: 30000 }).should("exist");

  // Re-checks the job status every 5 seconds until it reads "completed";
  // any status other than "scraping" or "queued" fails the test.
  const checkJobStatus = () => {
    cy.get("[data-testid='job-status']", { timeout: 120000 }).then(($el) => {
      const status = $el.text().toLowerCase().trim();

      if (status.includes("completed")) {
        return true;
      } else if (status.includes("scraping") || status.includes("queued")) {
        cy.wait(5000);
        checkJobStatus();
      } else {
        throw new Error(`Unexpected job status: ${status}`);
      }
    });
  };

  checkJobStatus();
};

export const enableMultiPageScraping = () => {
  cy.get("button").contains("Advanced Options").click();
  cy.get('[data-cy="multi-page-toggle"]').click();
  cy.get("body").type("{esc}");
};

export const addCustomHeaders = (headers: Record<string, string>) => {
  cy.get("button").contains("Advanced Options").click();
  cy.get('[name="custom_headers"]').type(JSON.stringify(headers), {
    parseSpecialCharSequences: false,
  });
  cy.get("body").type("{esc}");
};

export const addCustomCookies = (cookies: Record<string, string>) => {
  cy.get("button").contains("Advanced Options").click();
  cy.get('[name="custom_cookies"]').type(JSON.stringify(cookies));
  cy.get("body").type("{esc}");
};

export const openAdvancedJobOptions = () => {
  cy.get("button").contains("Advanced Options").click();
};

export const selectJobFromSelector = () => {
  checkAiDisabled();
  cy.get("div[id='select-job']", { timeout: 10000 }).first().click();
  cy.get("li[role='option']", { timeout: 10000 }).first().click();
};

export const addMedia = () => {
  cy.get('[data-cy="collect-media-checkbox"]').click();
};

export const checkForMedia = () => {
  cy.intercept("GET", "/api/media/get-media?id=**").as("getMedia");

  cy.visit("/media");
  selectJobFromSelector();

  cy.wait("@getMedia", { timeout: 30000 });
};

export const addSiteMapAction = (
  type: "click" | "input",
  xpath: string,
  input?: string
) => {
  cy.get("button").contains("Create Site Map").click();
  cy.get('[data-cy="site-map-select"]').select(type);
  cy.get('[data-cy="site-map-xpath"]').type(xpath);
  if (type === "input" && input) {
    cy.get('[data-cy="site-map-input"]').type(input);
  }
  cy.get('[data-cy="add-site-map-action"]').click();
};

export const addElement = (name: string, xpath: string) => {
  cy.get('[data-cy="name-field"]').type(name);
  cy.get('[data-cy="xpath-field"]').type(xpath);
  cy.get('[data-cy="add-button"]').click();
};

// Despite the name, this reads the persisted Redux settings from local storage
// and asserts that AI is enabled (settings.aiEnabled === true).
export const checkAiDisabled = () => {
  cy.getAllLocalStorage().then((result) => {
    const storage = JSON.parse(
      result["http://localhost"]["persist:root"] as string
    );
    const settings = JSON.parse(storage.settings);
    expect(settings.aiEnabled).to.equal(true);
  });
};

export const buildAgentJob = (url: string, prompt: string) => {
  checkAiDisabled();
  enterJobUrl(url);
  cy.get("[data-cy='prompt-input']").type(prompt);
};

export const submitJob = () => {
  cy.get("button").contains("Submit").click();
};

export const enterJobUrl = (url: string) => {
  cy.get('[data-cy="url-input"]').type(url);
};
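For context, a spec might combine these helpers roughly as in the sketch below; the spec filename, the import path, and the example URL/XPath values are illustrative assumptions, not taken from this diff.

// cypress/e2e/basic-job.cy.ts (hypothetical spec; illustrative values only)
// Import path is assumed; adjust to wherever these job helpers actually live.
import { submitBasicJob, waitForJobCompletion } from "../utilities/utilities";

describe("basic scrape job", () => {
  it("submits a job and waits until it completes", () => {
    cy.visit("/");
    // Fills the URL, element name, and XPath fields, then clicks Submit.
    submitBasicJob("https://example.com", "title", "//h1");
    // Polls the jobs page until the status reads "completed".
    waitForJobCompletion("https://example.com");
  });
});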
15
cypress/utilities/mocks.ts
Normal file
@@ -0,0 +1,15 @@
export const mockSubmitJob = () => {
  cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
};

export const mockToken = () => {
  cy.intercept("POST", "/api/token").as("token");
};

export const mockSignup = () => {
  cy.intercept("POST", "/api/signup").as("signup");
};

export const mockLogin = () => {
  cy.intercept("POST", "/api/token").as("token");
};
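These interceptors only register aliases; in a spec they would typically be paired with cy.wait on the alias, as in the hedged sketch below (spec name, import paths, and assertion values are assumptions, not taken from this diff).

// Illustrative pairing of a mock alias with cy.wait; not part of the diff.
import { mockSubmitJob } from "../utilities/mocks"; // import path assumed
import { submitBasicJob } from "../utilities/utilities"; // import path assumed

describe("job submission request", () => {
  it("sends the submit-scrape-job request", () => {
    mockSubmitJob(); // registers the "submitScrapeJob" alias
    cy.visit("/");
    submitBasicJob("https://example.com", "title", "//h1");
    // Waits for the intercepted POST and checks the response status.
    cy.wait("@submitScrapeJob").its("response.statusCode").should("eq", 200);
  });
});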
1
cypress/utilities/utilities.ts
Normal file
@@ -0,0 +1 @@
export * from "./authentication.utils";
@@ -1,6 +1,9 @@
version: "3"
services:
  scraperr:
    build:
      context: .
      dockerfile: docker/frontend/Dockerfile
    command: ["npm", "run", "dev"]
    volumes:
      - "$PWD/src:/app/src"
@@ -10,7 +13,12 @@ services:
      - "$PWD/package-lock.json:/app/package-lock.json"
      - "$PWD/tsconfig.json:/app/tsconfig.json"
  scraperr_api:
    build:
      context: .
      dockerfile: docker/api/Dockerfile
    environment:
      - LOG_LEVEL=INFO
    volumes:
      - "$PWD/api:/project/app/api"
    ports:
      - "5900:5900"
@@ -1,11 +1,6 @@
services:
  scraperr:
    depends_on:
      - scraperr_api
    image: jpyles0524/scraperr:latest
    build:
      context: .
      dockerfile: docker/frontend/Dockerfile
    container_name: scraperr
    command: ["npm", "run", "start"]
    environment:
@@ -18,11 +13,9 @@ services:
  scraperr_api:
    init: True
    image: jpyles0524/scraperr_api:latest
    build:
      context: .
      dockerfile: docker/api/Dockerfile
    environment:
      - LOG_LEVEL=INFO
      - OPENAI_KEY=${OPENAI_KEY}
    container_name: scraperr_api
    ports:
      - 8000:8000
@@ -3,7 +3,7 @@ FROM python:3.10.12-slim as pybuilder

RUN apt-get update && \
    apt-get install -y curl && \
    apt-get install -y uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 && \
    apt-get install -y x11vnc xvfb uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 ffmpeg pkg-config default-libmysqlclient-dev gcc && \
    curl -LsSf https://astral.sh/uv/install.sh | sh && \
    apt-get remove -y curl && \
    apt-get autoremove -y && \
@@ -14,7 +14,8 @@ RUN pdm config python.use_venv false

WORKDIR /project/app
COPY pyproject.toml pdm.lock /project/app/
RUN pdm install

RUN pdm install -v --frozen-lockfile

RUN pdm run playwright install --with-deps

@@ -30,7 +31,15 @@ EXPOSE 8000

WORKDIR /project/app

RUN mkdir -p /project/app/media
RUN mkdir -p /project/app/data
RUN touch /project/app/data/database.db

EXPOSE 5900

COPY alembic /project/app/alembic
COPY alembic.ini /project/app/alembic.ini

COPY start.sh /project/app/start.sh

CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ]
@@ -1,10 +1,14 @@
# Build next dependencies
FROM node:23.1
FROM node:23.1-slim
WORKDIR /app

COPY package*.json ./
RUN npm install
# Copy package files first to leverage Docker cache
COPY package.json yarn.lock ./

# Install dependencies in a separate layer
RUN yarn install --frozen-lockfile --network-timeout 600000

# Copy the rest of the application
COPY tsconfig.json /app/tsconfig.json
COPY tailwind.config.js /app/tailwind.config.js
COPY next.config.mjs /app/next.config.mjs
@@ -13,6 +17,7 @@ COPY postcss.config.js /app/postcss.config.js
COPY public /app/public
COPY src /app/src

RUN npm run build
# Build the application
RUN yarn build

EXPOSE 3000
Binary file not shown.
Before: 47 KiB | After: 67 KiB
@@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.0.13
version: 1.1.6

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
2
next-env.d.ts
vendored
@@ -2,4 +2,4 @@
/// <reference types="next/image-types/global" />

// NOTE: This file should not be edited
// see https://nextjs.org/docs/basic-features/typescript for more information.
// see https://nextjs.org/docs/pages/building-your-application/configuring/typescript for more information.
11371
package-lock.json
generated
File diff suppressed because it is too large
12
package.json
@@ -12,9 +12,11 @@
    "@minchat/react-chat-ui": "^0.16.2",
    "@mui/icons-material": "^5.15.3",
    "@mui/material": "^5.16.0",
    "@reduxjs/toolkit": "^2.8.2",
    "@testing-library/jest-dom": "^5.16.5",
    "@testing-library/react": "^13.4.0",
    "@testing-library/user-event": "^13.5.0",
    "@types/react": "^18.3.21",
    "axios": "^1.7.2",
    "bootstrap": "^5.3.0",
    "chart.js": "^4.4.3",
@@ -30,16 +32,19 @@
    "react-dom": "^18.3.1",
    "react-markdown": "^9.0.0",
    "react-modal-image": "^2.6.0",
    "react-redux": "^9.2.0",
    "react-router": "^6.14.1",
    "react-router-dom": "^6.14.1",
    "react-spinners": "^0.14.1",
    "react-toastify": "^11.0.5",
    "redux-persist": "^6.0.0",
    "typescript": "^4.9.5",
    "web-vitals": "^2.1.4"
  },
  "scripts": {
    "dev": "next dev",
    "build": "next build",
    "start": "next start",
    "dev": "yarn next dev",
    "build": "yarn next build",
    "start": "yarn next start",
    "serve": "serve -s ./dist",
    "cy:open": "cypress open",
    "cy:run": "cypress run"
@@ -63,6 +68,7 @@
    ]
  },
  "devDependencies": {
    "@faker-js/faker": "^9.8.0",
    "@types/cypress": "^1.1.6",
    "@types/js-cookie": "^3.0.6",
    "autoprefixer": "^10.4.21",
@@ -12,7 +12,7 @@ dependencies = [
    "asyncio>=3.4.3",
    "aiohttp>=3.9.5",
    "bs4>=0.0.2",
    "lxml[html_clean]>=5.2.2",
    "lxml>=5.2.2",
    "lxml-stubs>=0.5.1",
    "fake-useragent>=1.5.1",
    "requests-html>=0.10.0",
@@ -24,7 +24,6 @@ dependencies = [
    "python-keycloak>=4.2.0",
    "fastapi-keycloak>=1.0.11",
    "pymongo>=4.8.0",
    "motor[asyncio]>=3.5.0",
    "python-jose[cryptography]>=3.3.0",
    "passlib[bcrypt]>=1.7.4",
    "selenium-wire>=5.1.0",
@@ -41,6 +40,16 @@ dependencies = [
    "apscheduler>=3.11.0",
    "playwright>=1.52.0",
    "camoufox>=0.4.11",
    "html2text>=2025.4.15",
    "proxy-py>=2.4.10",
    "browserforge==1.2.1",
    "sqlalchemy>=2.0.41",
    "aiosqlite>=0.21.0",
    "alembic>=1.16.4",
    "asyncpg>=0.30.0",
    "aiomysql>=0.2.0",
    "psycopg2-binary>=2.9.10",
    "mysqlclient>=2.2.7",
]
requires-python = ">=3.10"
readme = "README.md"
@@ -97,9 +106,9 @@ strictSetInference = true


[tool.isort]
length_sort = "1"
length_sort = true
profile = "black"
sections = "STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER"
sections = ["STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"]
import_heading_stdlib = "STL"
import_heading_thirdparty = "PDM"
import_heading_firstparty = "LOCAL"
Some files were not shown because too many files have changed in this diff