Mirror of https://github.com/jaypyles/Scraperr.git (synced 2025-11-25 02:26:37 +00:00)
Compare commits
45 Commits
| SHA1 |
|---|
| 93b0c83381 |
| 9381ba9232 |
| 20dccc5527 |
| 02619eb184 |
| 58c6c09fc9 |
| bf896b4c6b |
| e3b9c11ab7 |
| 32da3375b3 |
| b5131cbe4c |
| 47c4c9a7d1 |
| 4352988666 |
| 00759151e6 |
| bfae00ca72 |
| e810700569 |
| 9857fa96e0 |
| b52fbc538d |
| 42c0f3ae79 |
| 9aab2f9b4f |
| e182d3e4b8 |
| 53f35989f5 |
| a67ab34cfa |
| 3bf6657191 |
| c38d19a0ca |
| a53e7e1aa1 |
| 84368b1f6d |
| ce4c1ceaa7 |
| 7e1ce58bb8 |
| 175e7d63bf |
| d2c06de247 |
| e0159bf9d4 |
| 6d574ddfd2 |
| b089d72786 |
| 9ee4d577fd |
| cddce5164d |
| bf3163bfba |
| 54b513e92c |
| 6c56f2f161 |
| d4edb9d93e |
| 5ebd96b62b |
| d602d3330a |
| 6639e8b48f |
| 263e46ba4d |
| f815a58efc |
| 50ec5df657 |
| 28de0f362c |
4  .dockerignore  (new file)
@@ -0,0 +1,4 @@
+node_modules
+npm-debug.log
+Dockerfile
+.dockerignore
8  .github/actions/push-to-helm/action.yaml  (vendored)
@@ -5,6 +5,9 @@ inputs:
   app-repo-token:
     required: true
     description: "The token for the target repository"
+  version:
+    required: true
+    description: "The version of the Helm chart"
 
 runs:
   using: 'composite'
@@ -15,6 +18,11 @@ runs:
     - name: Set up Helm
       uses: azure/setup-helm@v3
 
+    - name: Update Helm chart version
+      run: |
+        sed -i "s/^version: .*/version: ${{ inputs.version }}/" helm/Chart.yaml
+      shell: bash
+
     - name: Package Helm chart
       run: |
         mkdir -p packaged
28  .github/actions/run-cypress-tests/action.yaml  (vendored)
@@ -2,6 +2,13 @@ name: Run Cypress Tests
 
 description: Run Cypress tests
 
+inputs:
+  openai_key:
+    description: "OpenAI API key"
+    required: true
+    default: ""
+
+
 runs:
   using: "composite"
   steps:
@@ -13,13 +20,25 @@ runs:
       with:
         node-version: 22
 
+    - name: Setup yarn
+      shell: bash
+      run: npm install -g yarn
+
+    - name: Install xvfb for headless testing
+      shell: bash
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y xvfb libnss3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libasound2t64 libpango-1.0-0 libcairo2 libgtk-3-0 libgdk-pixbuf2.0-0 libx11-6 libx11-xcb1 libxcb1 libxss1 libxtst6 libnspr4
+
     - name: Setup Docker project
       shell: bash
-      run: make build up-dev
+      run: |
+        export OPENAI_KEY="${{ inputs.openai_key }}"
+        make build-ci up-ci
 
     - name: Install dependencies
       shell: bash
-      run: npm install
+      run: yarn install
 
     - name: Wait for frontend to be ready
       shell: bash
@@ -54,5 +73,8 @@ runs:
 
     - name: Run Cypress tests
       shell: bash
-      run: npm run cy:run
+      run: |
+        set -e
+        npm run cy:run
 
+
31  .github/workflows/cypress-tests.yml  (vendored, new file)
@@ -0,0 +1,31 @@
+name: Cypress Tests
+
+on:
+  workflow_call:
+    secrets:
+      openai_key:
+        required: true
+
+
+jobs:
+  cypress-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Run Cypress Tests
+        id: run-tests
+        uses: ./.github/actions/run-cypress-tests
+        with:
+          openai_key: ${{ secrets.openai_key }}
+
+      - name: Check container logs on failure
+        if: steps.run-tests.conclusion == 'failure'
+        run: |
+          echo "Cypress tests failed. Dumping container logs..."
+          docker logs scraperr_api || true
+
+      - name: Fail job if Cypress failed
+        if: steps.run-tests.conclusion == 'failure'
+        run: exit 1
+
35  .github/workflows/docker-image.yml  (vendored)
@@ -1,24 +1,30 @@
 name: Docker Image
 on:
-  workflow_run:
-    workflows: ["Unit Tests"]
-    types:
-      - completed
-  workflow_dispatch:
+  workflow_call:
+    inputs:
+      version:
+        required: true
+        type: string
+    secrets:
+      dockerhub_username:
+        required: true
+      dockerhub_token:
+        required: true
+      repo_token:
+        required: true
+      discord_webhook_url:
+        required: true
 
 jobs:
   build:
-    if: ${{ github.event.workflow_run.conclusion == 'success' && github.ref == 'refs/heads/master' }}
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
         uses: actions/checkout@v4
 
-      - name: Get version from helm chart
+      - name: Echo version
         run: |
-          VERSION=$(grep "version:" ./helm/Chart.yaml | cut -d: -f2 | tr -d ' ')
-          echo "VERSION=$VERSION" >> $GITHUB_ENV
-          echo "Version is $VERSION"
+          echo "Version is ${{ inputs.version }}"
 
       - name: Login to Docker Hub
         uses: docker/login-action@v3
@@ -37,7 +43,7 @@ jobs:
           push: true
           tags: |
             ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:latest
-            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:${{ env.VERSION }}
+            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:${{ inputs.version }}
 
       - name: Build and push api
         uses: docker/build-push-action@v5
@@ -47,7 +53,7 @@ jobs:
           push: true
           tags: |
             ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest
-            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:${{ env.VERSION }}
+            ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:${{ inputs.version }}
 
   push-helm-chart:
     runs-on: ubuntu-latest
@@ -59,7 +65,8 @@ jobs:
       - name: Push Helm Chart
         uses: ./.github/actions/push-to-helm
        with:
-          app-repo-token: ${{ secrets.GPAT_TOKEN }}
+          app-repo-token: ${{ secrets.repo_token }}
+          version: ${{ inputs.version }}
 
   success-message:
     runs-on: ubuntu-latest
@@ -71,7 +78,7 @@ jobs:
         uses: jaypyles/discord-webhook-action@v1.0.0
         with:
           webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }}
-          content: "Scraperr Successfully Built Docker Images"
+          content: "Scraperr Successfully Built Docker Images (v${{ inputs.version }})"
           username: "Scraperr CI"
           embed-title: "✅ Deployment Status"
           embed-description: "Scraperr successfully built docker images."
35  .github/workflows/merge.yml  (vendored, new file)
@@ -0,0 +1,35 @@
+name: Merge
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    types: [closed]
+    branches:
+      - master
+
+jobs:
+  tests:
+    uses: ./.github/workflows/tests.yml
+    secrets:
+      openai_key: ${{ secrets.OPENAI_KEY }}
+      discord_webhook_url: ${{ secrets.DISCORD_WEBHOOK_URL }}
+
+  version:
+    needs: tests
+    uses: ./.github/workflows/version.yml
+    secrets:
+      git_token: ${{ secrets.GPAT_TOKEN }}
+
+  build-and-deploy:
+    if: needs.version.outputs.version_bump == 'true'
+    needs: version
+    uses: ./.github/workflows/docker-image.yml
+    secrets:
+      dockerhub_username: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerhub_token: ${{ secrets.DOCKERHUB_TOKEN }}
+      repo_token: ${{ secrets.GPAT_TOKEN }}
+      discord_webhook_url: ${{ secrets.DISCORD_WEBHOOK_URL }}
+    with:
+      version: ${{ needs.version.outputs.version }}
20  .github/workflows/pr.yml  (vendored, new file)
@@ -0,0 +1,20 @@
+name: PR
+
+on:
+  pull_request:
+    branches:
+      - master
+    types: [opened, synchronize, reopened]
+  workflow_dispatch:
+
+jobs:
+  checkout:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+  tests:
+    uses: ./.github/workflows/tests.yml
+    secrets:
+      openai_key: ${{ secrets.OPENAI_KEY }}
+      discord_webhook_url: ${{ secrets.DISCORD_WEBHOOK_URL }}
27  .github/workflows/pytest.yml  (vendored, new file)
@@ -0,0 +1,27 @@
+name: Pytest
+
+on:
+  workflow_call:
+
+jobs:
+  unit-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set env
+        run: echo "ENV=test" >> $GITHUB_ENV
+
+      - name: Install pdm
+        run: pip install pdm
+
+      - name: Install project dependencies
+        run: pdm install
+
+      - name: Install playwright
+        run: pdm run playwright install
+
+      - name: Run tests
+        run: PYTHONPATH=. pdm run pytest -v -ra api/backend/tests
+
42  .github/workflows/tests.yml  (vendored, new file)
@@ -0,0 +1,42 @@
+name: Reusable PR Tests
+
+on:
+  workflow_call:
+    secrets:
+      openai_key:
+        required: true
+      discord_webhook_url:
+        required: true
+
+
+jobs:
+  pytest:
+    uses: ./.github/workflows/pytest.yml
+
+  cypress-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Run Cypress Tests
+        uses: ./.github/actions/run-cypress-tests
+        with:
+          openai_key: ${{ secrets.openai_key }}
+
+  success-message:
+    runs-on: ubuntu-latest
+    needs:
+      - pytest
+      - cypress-tests
+    steps:
+      - name: Send Discord Message
+        uses: jaypyles/discord-webhook-action@v1.0.0
+        with:
+          webhook-url: ${{ secrets.discord_webhook_url }}
+          content: "Scraperr Successfully Passed Tests"
+          username: "Scraperr CI"
+          embed-title: "✅ Deployment Status"
+          embed-description: "Scraperr successfully passed all tests."
+          embed-color: 3066993
+          embed-footer-text: "Scraperr CI"
+          embed-timestamp: ${{ github.event.head_commit.timestamp }}
57  .github/workflows/unit-tests.yml  (vendored, deleted)
@@ -1,57 +0,0 @@
-name: Unit Tests
-
-on:
-  push:
-    branches:
-      - master
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-
-  workflow_dispatch:
-
-jobs:
-  unit-tests:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Set env
-        run: echo "ENV=test" >> $GITHUB_ENV
-
-      - name: Install pdm
-        run: pip install pdm
-
-      - name: Install project dependencies
-        run: pdm install
-
-      - name: Install playwright
-        run: pdm run playwright install
-
-      - name: Run tests
-        run: PYTHONPATH=. pdm run pytest api/backend/tests
-
-  cypress-tests:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: ./.github/actions/run-cypress-tests
-
-  success-message:
-    runs-on: ubuntu-latest
-    needs:
-      - unit-tests
-      - cypress-tests
-    steps:
-      - name: Send Discord Message
-        uses: jaypyles/discord-webhook-action@v1.0.0
-        with:
-          webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }}
-          content: "Scraperr Successfully Passed Tests"
-          username: "Scraperr CI"
-          embed-title: "✅ Deployment Status"
-          embed-description: "Scraperr successfully passed all tests."
-          embed-color: 3066993 # Green
-          embed-footer-text: "Scraperr CI"
-          embed-timestamp: ${{ github.event.head_commit.timestamp }}
87  .github/workflows/version.yml  (vendored, new file)
@@ -0,0 +1,87 @@
+name: Version
+
+on:
+  workflow_call:
+    secrets:
+      git_token:
+        required: true
+    outputs:
+      version:
+        description: "The new version number"
+        value: ${{ jobs.version.outputs.version }}
+      version_bump:
+        description: "Whether the version was bumped"
+        value: ${{ jobs.version.outputs.version_bump }}
+
+jobs:
+  version:
+    runs-on: ubuntu-latest
+
+    outputs:
+      version: ${{ steps.set_version.outputs.version }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Get version bump
+        id: get_version_type
+        run: |
+          COMMIT_MSG=$(git log -1 --pretty=%B)
+
+          if [[ $COMMIT_MSG =~ ^feat\(breaking\) ]]; then
+            VERSION_TYPE="major"
+          elif [[ $COMMIT_MSG =~ ^feat! ]]; then
+            VERSION_TYPE="minor"
+          elif [[ $COMMIT_MSG =~ ^(feat|fix|chore): ]]; then
+            VERSION_TYPE="patch"
+          else
+            VERSION_TYPE="patch"
+          fi
+
+          echo "VERSION_TYPE=$VERSION_TYPE" >> $GITHUB_ENV
+
+      - name: Check for version bump
+        id: check_version_bump
+        run: |
+          COMMIT_MSG=$(git log -1 --pretty=%B)
+          if [[ $COMMIT_MSG =~ ^feat\(breaking\) ]]; then
+            echo "version_bump=true" >> $GITHUB_OUTPUT
+          elif [[ $COMMIT_MSG =~ .*\[no\ bump\].* ]]; then
+            echo "version_bump=false" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Skip version bump
+        if: steps.check_version_bump.outputs.version_bump == 'false'
+        run: |
+          echo "Skipping version bump as requested"
+          gh run cancel ${{ github.run_id }}
+          exit 0
+        env:
+          GITHUB_TOKEN: ${{ secrets.git_token }}
+
+      - name: Set version
+        if: steps.check_version_bump.outputs.version_bump != 'false'
+        id: set_version
+        run: |
+          VERSION=$(./scripts/version.sh "$VERSION_TYPE")
+          echo "VERSION=$VERSION" >> $GITHUB_ENV
+          echo "Version is $VERSION"
+          echo "version=$VERSION" >> $GITHUB_OUTPUT
+        env:
+          VERSION_TYPE: ${{ env.VERSION_TYPE }}
+
+      - name: Update chart file
+        if: steps.check_version_bump.outputs.version_bump != 'false'
+        run: |
+          sed -i "s/^version: .*/version: $VERSION/" helm/Chart.yaml
+
+          git config --local user.email "github-actions[bot]@users.noreply.github.com"
+          git config --local user.name "github-actions[bot]"
+          git add helm/Chart.yaml
+          git commit -m "chore: bump version to $VERSION"
+          git push
+        env:
+          VERSION: ${{ env.VERSION }}
16  .gitignore  (vendored)
@@ -188,4 +188,18 @@ postgres_data
 .vscode
 ollama
 data
-media
+media/images
+media/videos
+media/audio
+media/pdfs
+media/spreadsheets
+media/presentations
+media/documents
+media/recordings
+media/download_summary.txt
+
+cypress/screenshots
+cypress/videos
+
+docker-compose.dev.local.yml
15  Makefile
@@ -1,6 +1,6 @@
 .DEFAULT_GOAL := help
 
-COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.yml
+COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.local.yml
 COMPOSE_PROD = docker compose -f docker-compose.yml
 
 .PHONY: help deps build pull up up-dev down setup deploy
@@ -17,6 +17,7 @@ help:
     @echo " make down - Stop and remove containers, networks, images, and volumes"
     @echo " make setup - Setup server with dependencies and clone repo"
     @echo " make deploy - Deploy site onto server"
+    @echo " make cypress-start - Start Cypress"
     @echo ""
 
 logs:
@@ -51,3 +52,15 @@ setup:
 
 deploy:
     ansible-playbook -i ./ansible/inventory.yaml ./ansible/deploy_site.yaml -v
+
+build-ci:
+    docker compose -f docker-compose.yml -f docker-compose.dev.yml build
+
+up-ci:
+    docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d --force-recreate
+
+cypress-start:
+    DISPLAY=:0 npx cypress open
+
+cypress-run:
+    npx cypress run
@@ -13,7 +13,7 @@
 
 ## 📋 Overview
 
-Scraperr enables you to extract data from websites with precision using XPath selectors. This self-hosted application provides a clean interface to manage scraping jobs, view results, and export data.
+Scrape websites without writing a single line of code.
 
 > 📚 **[Check out the docs](https://scraperr-docs.pages.dev)** for a comprehensive quickstart guide and detailed information.
 
@@ -29,7 +29,7 @@ Scraperr enables you to extract data from websites with precision using XPath se
 - **Custom Headers**: Add JSON headers to your scraping requests
 - **Media Downloads**: Automatically download images, videos, and other media
 - **Results Visualization**: View scraped data in a structured table format
-- **Data Export**: Export your results in various formats
+- **Data Export**: Export your results in markdown and csv formats
 - **Notifcation Channels**: Send completion notifcations, through various channels
 
 ## 🚀 Getting Started
6  api/backend/ai/agent/actions.py  (new file)
@@ -0,0 +1,6 @@
+from typing_extensions import TypedDict
+
+
+class Action(TypedDict):
+    type: str
+    url: str
93  api/backend/ai/agent/agent.py  (new file)
@@ -0,0 +1,93 @@
+# STL
+import random
+from typing import Any
+
+# PDM
+from camoufox import AsyncCamoufox
+from playwright.async_api import Page
+
+# LOCAL
+from api.backend.ai.clients import ask_ollama, ask_open_ai, open_ai_key
+from api.backend.job.models import CapturedElement
+from api.backend.worker.logger import LOG
+from api.backend.ai.agent.utils import (
+    parse_response,
+    capture_elements,
+    convert_to_markdown,
+)
+from api.backend.ai.agent.prompts import (
+    EXTRACT_ELEMENTS_PROMPT,
+    ELEMENT_EXTRACTION_PROMPT,
+)
+from api.backend.job.scraping.add_custom import add_custom_items
+from api.backend.job.scraping.collect_media import collect_media
+
+ask_ai = ask_open_ai if open_ai_key else ask_ollama
+
+
+async def scrape_with_agent(agent_job: dict[str, Any]):
+    LOG.info(f"Starting work for agent job: {agent_job}")
+    pages = set()
+
+    if agent_job["job_options"]["proxies"]:
+        proxy = random.choice(agent_job["job_options"]["proxies"])
+        LOG.info(f"Using proxy: {proxy}")
+
+    async with AsyncCamoufox(headless=True) as browser:
+        page: Page = await browser.new_page()
+
+        await add_custom_items(
+            agent_job["url"],
+            page,
+            agent_job["job_options"]["custom_cookies"],
+            agent_job["job_options"]["custom_headers"],
+        )
+
+        try:
+            await page.set_viewport_size({"width": 1920, "height": 1080})
+            await page.goto(agent_job["url"], timeout=60000)
+
+            if agent_job["job_options"]["collect_media"]:
+                await collect_media(agent_job["id"], page)
+
+            html_content = await page.content()
+            markdown_content = convert_to_markdown(html_content)
+
+            response = await ask_ai(
+                ELEMENT_EXTRACTION_PROMPT.format(
+                    extraction_prompt=EXTRACT_ELEMENTS_PROMPT,
+                    webpage=markdown_content,
+                    prompt=agent_job["prompt"],
+                )
+            )
+
+            xpaths = parse_response(response)
+
+            captured_elements = await capture_elements(
+                page, xpaths, agent_job["job_options"]["return_html"]
+            )
+
+            final_url = page.url
+
+            pages.add((html_content, final_url))
+        finally:
+            await page.close()
+            await browser.close()
+
+    name_to_elements = {}
+
+    for page in pages:
+        for element in captured_elements:
+            if element.name not in name_to_elements:
+                name_to_elements[element.name] = []
+
+            name_to_elements[element.name].append(element)
+
+    scraped_elements: list[dict[str, dict[str, list[CapturedElement]]]] = [
+        {
+            page[1]: name_to_elements,
+        }
+        for page in pages
+    ]
+
+    return scraped_elements
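A short usage sketch for the new `scrape_with_agent` entrypoint. The field names come from the keys the function reads above; the concrete values (URL, prompt, option flags) are illustrative assumptions, not part of the diff:

```python
import asyncio

from api.backend.ai.agent.agent import scrape_with_agent

# Hypothetical job payload: these are the fields scrape_with_agent reads above.
agent_job = {
    "id": "job-123",                      # assumed id format
    "url": "https://example.com",         # page to visit
    "prompt": "Collect the article titles",
    "job_options": {
        "proxies": [],                    # optional proxy list
        "custom_cookies": [],             # forwarded to add_custom_items
        "custom_headers": {},             # forwarded to add_custom_items
        "collect_media": False,           # triggers collect_media when True
        "return_html": False,             # capture inner_html instead of text
    },
}

# Returns a list of {final_url: {element_name: [CapturedElement, ...]}} mappings.
results = asyncio.run(scrape_with_agent(agent_job))
print(results)
```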
58  api/backend/ai/agent/prompts.py  (new file)
@@ -0,0 +1,58 @@
+EXTRACT_ELEMENTS_PROMPT = """
+You are an assistant that extracts XPath expressions from webpages.
+
+You will receive HTML content in markdown format.
+
+Each element in the markdown has their xpath shown above them in a path like:
+<!-- //div -->
+
+Respond only with a list of general XPath expressions inside `<xpaths>...</xpaths>` tags.
+
+You will also decide the decision of what to do next. If there is no decision available, return nothing for that section.
+"""
+
+ELEMENT_EXTRACTION_PROMPT = """
+{extraction_prompt}
+
+**Guidelines:**
+- Prefer shorter, more general XPaths like `//div[...]` or `//span[...]`.
+- Avoid overly specific or deep paths like `//div[3]/ul/li[2]/a`.
+- Do **not** chain multiple elements deeply (e.g., `//div/span/a`).
+- Use XPaths further down the tree when possible.
+- Do not include any extra explanation or text.
+- One XPath is acceptable if that's all that's needed.
+- Try and limit it down to 1 - 3 xpaths.
+- Include a name for each xpath.
+
+<important>
+- USE THE MOST SIMPLE XPATHS POSSIBLE.
+- USE THE MOST GENERAL XPATHS POSSIBLE.
+- USE THE MOST SPECIFIC XPATHS POSSIBLE.
+- USE THE MOST GENERAL XPATHS POSSIBLE.
+</important>
+
+**Example Format:**
+```xml
+<xpaths>
+- <name: insert_name_here>: <xpath: //div>
+- <name: insert_name_here>: <xpath: //span>
+- <name: insert_name_here>: <xpath: //span[contains(@text, 'example')]>
+- <name: insert_name_here>: <xpath: //div[contains(@text, 'example')]>
+- <name: insert_name_here>: <xpath: //a[@href]>
+- etc
+</xpaths>
+
+<decision>
+<next_page>
+- //a[@href='next_page_url']
+</next_page>
+</decision>
+```
+
+**Input webpage:**
+{webpage}
+
+**Target content:**
+{prompt}
+
+"""
272  api/backend/ai/agent/utils.py  (new file)
@@ -0,0 +1,272 @@
+# STL
+import re
+
+# PDM
+from lxml import html, etree
+from playwright.async_api import Page
+
+# LOCAL
+from api.backend.job.models import CapturedElement
+from api.backend.job.utils.text_utils import clean_text
+
+
+def convert_to_markdown(html_str: str):
+    parser = html.HTMLParser()
+    tree = html.fromstring(html_str, parser=parser)
+    root = tree.getroottree()
+
+    def format_attributes(el: etree._Element) -> str:
+        """Convert element attributes into a string."""
+        return " ".join(f'{k}="{v}"' for k, v in el.attrib.items())
+
+    def is_visible(el: etree._Element) -> bool:
+        style = el.attrib.get("style", "").lower()
+        class_ = el.attrib.get("class", "").lower()
+
+        # Check for visibility styles
+        if "display: none" in style or "visibility: hidden" in style:
+            return False
+        if "opacity: 0" in style or "opacity:0" in style:
+            return False
+        if "height: 0" in style or "width: 0" in style:
+            return False
+
+        # Check for common hidden classes
+        if any(
+            hidden in class_
+            for hidden in ["hidden", "invisible", "truncate", "collapse"]
+        ):
+            return False
+
+        # Check for hidden attributes
+        if el.attrib.get("hidden") is not None:
+            return False
+        if el.attrib.get("aria-hidden") == "true":
+            return False
+
+        # Check for empty or whitespace-only content
+        if not el.text and len(el) == 0:
+            return False
+
+        return True
+
+    def is_layout_or_decorative(el: etree._Element) -> bool:
+        tag = el.tag.lower()
+
+        # Layout elements
+        if tag in {"nav", "footer", "header", "aside", "main", "section"}:
+            return True
+
+        # Decorative elements
+        if tag in {"svg", "path", "circle", "rect", "line", "polygon", "polyline"}:
+            return True
+
+        # Check id and class for layout/decorative keywords
+        id_class = " ".join(
+            [el.attrib.get("id", ""), el.attrib.get("class", "")]
+        ).lower()
+
+        layout_keywords = {
+            "sidebar",
+            "nav",
+            "header",
+            "footer",
+            "menu",
+            "advert",
+            "ads",
+            "breadcrumb",
+            "container",
+            "wrapper",
+            "layout",
+            "grid",
+            "flex",
+            "row",
+            "column",
+            "section",
+            "banner",
+            "hero",
+            "card",
+            "modal",
+            "popup",
+            "tooltip",
+            "dropdown",
+            "overlay",
+        }
+
+        return any(keyword in id_class for keyword in layout_keywords)
+
+    # Tags to ignore in the final markdown output
+    included_tags = {
+        "div",
+        "span",
+        "a",
+        "p",
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "h5",
+        "h6",
+        "img",
+        "button",
+        "input",
+        "textarea",
+        "ul",
+        "ol",
+        "li",
+        "table",
+        "tr",
+        "td",
+        "th",
+        "input",
+        "textarea",
+        "select",
+        "option",
+        "optgroup",
+        "fieldset",
+        "legend",
+    }
+
+    special_elements = []
+    normal_elements = []
+
+    for el in tree.iter():
+        if el.tag is etree.Comment:
+            continue
+
+        tag = el.tag.lower()
+
+        if tag not in included_tags:
+            continue
+
+        if not is_visible(el):
+            continue
+
+        if is_layout_or_decorative(el):
+            continue
+
+        path = root.getpath(el)
+        attrs = format_attributes(el)
+        attrs_str = f" {attrs}" if attrs else ""
+        text = el.text.strip() if el.text else ""
+
+        if not text and not attrs:
+            continue
+
+        # input elements
+        if tag == "button":
+            prefix = "🔘 **<button>**"
+            special_elements.append(f"<!-- {path} -->\n{prefix} {text}")
+        elif tag == "a":
+            href = el.attrib.get("href", "")
+            prefix = f"🔗 **<a href='{href}'>**"
+            special_elements.append(f"<!-- {path} -->\n{prefix} {text}")
+        elif tag == "input":
+            input_type = el.attrib.get("type", "text")
+            prefix = f"📝 **<input type='{input_type}'>**"
+            special_elements.append(f"<!-- {path} -->\n{prefix}")
+        else:
+            prefix = f"**<{tag}{attrs_str}>**"
+
+            if text:
+                normal_elements.append(f"<!-- {path} -->\n{prefix} {text}")
+
+    return "\n\n".join(normal_elements + special_elements)  # type: ignore
+
+
+def parse_response(text: str) -> list[dict[str, str]]:
+    xpaths = re.findall(r"<xpaths>(.*?)</xpaths>", text, re.DOTALL)
+    results = []
+
+    if xpaths:
+        lines = xpaths[0].strip().splitlines()
+        for line in lines:
+            if line.strip().startswith("-"):
+                name = re.findall(r"<name: (.*?)>", line)[0]
+                xpath = re.findall(r"<xpath: (.*?)>", line)[0]
+                results.append({"name": name, "xpath": xpath})
+            else:
+                results.append({"name": "", "xpath": line.strip()})
+
+    return results
+
+
+def parse_next_page(text: str) -> str | None:
+    next_page = re.findall(r"<next_page>(.*?)</next_page>", text, re.DOTALL)
+
+    if next_page:
+        lines = next_page[0].strip().splitlines()
+        next_page = [
+            line.strip().lstrip("-").strip()
+            for line in lines
+            if line.strip().startswith("-")
+        ]
+
+    return next_page[0] if next_page else None
+
+
+async def capture_elements(
+    page: Page, xpaths: list[dict[str, str]], return_html: bool
+) -> list[CapturedElement]:
+    captured_elements = []
+    seen_texts = set()
+
+    for xpath in xpaths:
+        try:
+            locator = page.locator(f"xpath={xpath['xpath']}")
+            count = await locator.count()
+
+            for i in range(count):
+                if return_html:
+                    element_text = (
+                        await page.locator(f"xpath={xpath['xpath']}")
+                        .nth(i)
+                        .inner_html()
+                    )
+
+                    seen_texts.add(element_text)
+                    captured_elements.append(
+                        CapturedElement(
+                            name=xpath["name"],
+                            text=element_text,
+                            xpath=xpath["xpath"],
+                        )
+                    )
+                    continue
+
+                element_text = ""
+
+                element_handle = await locator.nth(i).element_handle()
+
+                if not element_handle:
+                    continue
+
+                link = await element_handle.get_attribute("href") or ""
+
+                text = await element_handle.text_content()
+
+                if text:
+                    element_text += text
+
+                if link:
+                    element_text += f" ({link})"
+
+                cleaned = clean_text(element_text)
+
+                if cleaned in seen_texts:
+                    continue
+
+                seen_texts.add(cleaned)
+
+                captured_elements.append(
+                    CapturedElement(
+                        name=xpath["name"],
+                        text=cleaned,
+                        xpath=xpath["xpath"],
+                    )
+                )
+
+        except Exception as e:
+            print(f"Error processing xpath {xpath}: {e}")
+
+    return captured_elements
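To make the expected model output concrete, here is a minimal sketch that feeds the `<xpaths>` / `<decision>` response format documented in prompts.py through `parse_response` and `parse_next_page` above; the sample response text is fabricated purely for illustration:

```python
from api.backend.ai.agent.utils import parse_next_page, parse_response

# Example response in the format requested by ELEMENT_EXTRACTION_PROMPT.
sample_response = """
<xpaths>
- <name: title>: <xpath: //h1>
- <name: link>: <xpath: //a[@href]>
</xpaths>

<decision>
<next_page>
- //a[@href='next_page_url']
</next_page>
</decision>
"""

print(parse_response(sample_response))
# [{'name': 'title', 'xpath': '//h1'}, {'name': 'link', 'xpath': '//a[@href]'}]

print(parse_next_page(sample_response))
# //a[@href='next_page_url']
```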
@@ -1,32 +1,28 @@
 # STL
-import os
 import logging
 from collections.abc import Iterable, AsyncGenerator
 
 # PDM
-from openai import OpenAI
+from ollama import Message
 from fastapi import APIRouter
 from fastapi.responses import JSONResponse, StreamingResponse
 from openai.types.chat import ChatCompletionMessageParam
 
 # LOCAL
-from ollama import Message, AsyncClient
-from api.backend.models import AI
+from api.backend.ai.clients import (
+    llama_model,
+    open_ai_key,
+    llama_client,
+    open_ai_model,
+    openai_client,
+)
+from api.backend.ai.schemas import AI
+from api.backend.routers.handle_exceptions import handle_exceptions
 
-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("AI")
 
 ai_router = APIRouter()
 
-# Load environment variables
-open_ai_key = os.getenv("OPENAI_KEY")
-open_ai_model = os.getenv("OPENAI_MODEL")
-llama_url = os.getenv("OLLAMA_URL")
-llama_model = os.getenv("OLLAMA_MODEL")
-
-# Initialize clients
-openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None
-llama_client = AsyncClient(host=llama_url) if llama_url else None
-
 
 async def llama_chat(chat_messages: list[Message]) -> AsyncGenerator[str, None]:
     if llama_client and llama_model:
@@ -67,6 +63,7 @@ chat_function = llama_chat if llama_client else openai_chat
 
 
 @ai_router.post("/ai")
+@handle_exceptions(logger=LOG)
 async def ai(c: AI):
     return StreamingResponse(
         chat_function(chat_messages=c.messages), media_type="text/plain"
@@ -74,5 +71,6 @@ async def ai(c: AI):
 
 
 @ai_router.get("/ai/check")
+@handle_exceptions(logger=LOG)
 async def check():
     return JSONResponse(content={"ai_enabled": bool(open_ai_key or llama_model)})
39  api/backend/ai/clients.py  (new file)
@@ -0,0 +1,39 @@
+# STL
+import os
+
+# PDM
+from ollama import AsyncClient
+from openai import OpenAI
+
+# Load environment variables
+open_ai_key = os.getenv("OPENAI_KEY")
+open_ai_model = os.getenv("OPENAI_MODEL")
+llama_url = os.getenv("OLLAMA_URL")
+llama_model = os.getenv("OLLAMA_MODEL")
+
+# Initialize clients
+openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None
+llama_client = AsyncClient(host=llama_url) if llama_url else None
+
+
+async def ask_open_ai(prompt: str) -> str:
+    if not openai_client:
+        raise ValueError("OpenAI client not initialized")
+
+    response = openai_client.chat.completions.create(
+        model=open_ai_model or "gpt-4.1-mini",
+        messages=[{"role": "user", "content": prompt}],
+    )
+
+    return response.choices[0].message.content or ""
+
+
+async def ask_ollama(prompt: str) -> str:
+    if not llama_client:
+        raise ValueError("Ollama client not initialized")
+
+    response = await llama_client.chat(
+        model=llama_model or "", messages=[{"role": "user", "content": prompt}]
+    )
+
+    return response.message.content or ""
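The clients module only instantiates whichever backend is configured. A minimal sketch of selecting and calling one of them, mirroring the `ask_ai` fallback used in agent.py above; the prompt text is an illustrative assumption:

```python
import asyncio

from api.backend.ai.clients import ask_ollama, ask_open_ai, open_ai_key

# Prefer OpenAI when OPENAI_KEY is set, otherwise fall back to Ollama.
ask_ai = ask_open_ai if open_ai_key else ask_ollama


async def main() -> None:
    # Raises ValueError when the selected backend has no client configured.
    answer = await ask_ai("Summarize what Scraperr does in one sentence.")
    print(answer)


asyncio.run(main())
```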
4  api/backend/ai/schemas/__init__.py  (new file)
@@ -0,0 +1,4 @@
+# LOCAL
+from .ai import AI
+
+__all__ = ["AI"]
9  api/backend/ai/schemas/ai.py  (new file)
@@ -0,0 +1,9 @@
+# STL
+from typing import Any
+
+# PDM
+import pydantic
+
+
+class AI(pydantic.BaseModel):
+    messages: list[Any]
@@ -1,39 +1,60 @@
 # STL
 import os
 import logging
-import apscheduler  # type: ignore
+from contextlib import asynccontextmanager
 
 # PDM
-import apscheduler.schedulers
-import apscheduler.schedulers.background
 from fastapi import FastAPI, Request, status
+from fastapi.responses import JSONResponse
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 
 # LOCAL
-from api.backend.ai.ai_router import ai_router
-from api.backend.auth.auth_router import auth_router
 from api.backend.utils import get_log_level
-from api.backend.routers.job_router import job_router
-from api.backend.routers.stats_router import stats_router
-from api.backend.database.startup import init_database
-from fastapi.responses import JSONResponse
-
-from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler
 from api.backend.scheduler import scheduler
+from api.backend.ai.ai_router import ai_router
+from api.backend.job.job_router import job_router
+from api.backend.auth.auth_router import auth_router
+from api.backend.database.startup import init_database
+from api.backend.stats.stats_router import stats_router
+from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler
 
 log_level = os.getenv("LOG_LEVEL")
 LOG_LEVEL = get_log_level(log_level)
 
 logging.basicConfig(
     level=LOG_LEVEL,
-    format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
+    format="%(levelname)s: %(asctime)s - [%(name)s] - %(message)s",
     handlers=[logging.StreamHandler()],
 )
 
 LOG = logging.getLogger(__name__)
 
-app = FastAPI(title="api", root_path="/api")
+
+@asynccontextmanager
+async def lifespan(_: FastAPI):
+    # Startup
+    LOG.info("Starting application...")
+
+    init_database()
+
+    LOG.info("Starting cron scheduler...")
+    start_cron_scheduler(scheduler)
+    scheduler.start()
+
+    LOG.info("Cron scheduler started successfully")
+
+    yield
+
+    # Shutdown
+    LOG.info("Shutting down application...")
+    LOG.info("Stopping cron scheduler...")
+    scheduler.shutdown(wait=False)  # Set wait=False to not block shutdown
+    LOG.info("Cron scheduler stopped")
+    LOG.info("Application shutdown complete")
+
+
+app = FastAPI(title="api", root_path="/api", lifespan=lifespan)
 
 app.add_middleware(
     CORSMiddleware,
@@ -43,28 +64,12 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
 
 app.include_router(auth_router)
 app.include_router(ai_router)
 app.include_router(job_router)
 app.include_router(stats_router)
 
 
-@app.on_event("startup")
-async def startup_event():
-    start_cron_scheduler(scheduler)
-    scheduler.start()
-
-    if os.getenv("ENV") != "test":
-        init_database()
-    LOG.info("Starting up...")
-
-
-@app.on_event("shutdown")
-def shutdown_scheduler():
-    scheduler.shutdown(wait=False)  # Set wait=False to not block shutdown
-
-
 @app.exception_handler(RequestValidationError)
 async def validation_exception_handler(request: Request, exc: RequestValidationError):
     exc_str = f"{exc}".replace("\n", " ").replace(" ", " ")
@@ -1,13 +1,14 @@
 # STL
-from datetime import timedelta
 import os
+import logging
+from datetime import timedelta
 
 # PDM
 from fastapi import Depends, APIRouter, HTTPException, status
 from fastapi.security import OAuth2PasswordRequestForm
 
 # LOCAL
-from api.backend.schemas import User, Token, UserCreate
+from api.backend.auth.schemas import User, Token, UserCreate
 from api.backend.auth.auth_utils import (
     ACCESS_TOKEN_EXPIRE_MINUTES,
     get_current_user,
@@ -15,18 +16,19 @@ from api.backend.auth.auth_utils import (
     get_password_hash,
     create_access_token,
 )
-import logging
 
 from api.backend.database.common import update
+from api.backend.routers.handle_exceptions import handle_exceptions
 
 auth_router = APIRouter()
 
-LOG = logging.getLogger("auth_router")
+LOG = logging.getLogger("Auth")
 
 
 @auth_router.post("/auth/token", response_model=Token)
+@handle_exceptions(logger=LOG)
 async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
     user = await authenticate_user(form_data.username, form_data.password)
 
     if not user:
         raise HTTPException(
             status_code=status.HTTP_401_UNAUTHORIZED,
@@ -47,6 +49,7 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(
 
 
 @auth_router.post("/auth/signup", response_model=User)
+@handle_exceptions(logger=LOG)
 async def create_user(user: UserCreate):
     hashed_password = get_password_hash(user.password)
     user_dict = user.model_dump()
@@ -60,10 +63,16 @@ async def create_user(user: UserCreate):
 
 
 @auth_router.get("/auth/users/me", response_model=User)
+@handle_exceptions(logger=LOG)
 async def read_users_me(current_user: User = Depends(get_current_user)):
     return current_user
 
 
 @auth_router.get("/auth/check")
+@handle_exceptions(logger=LOG)
 async def check_auth():
-    return {"registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True"}
+    return {
+        "registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True",
+        "recordings_enabled": os.environ.get("RECORDINGS_ENABLED", "true").lower()
+        == "true",
+    }
@@ -1,8 +1,8 @@
 # STL
 import os
+import logging
 from typing import Any, Optional
 from datetime import datetime, timedelta
-import logging
 
 # PDM
 from jose import JWTError, jwt
@@ -12,11 +12,10 @@ from passlib.context import CryptContext
 from fastapi.security import OAuth2PasswordBearer
 
 # LOCAL
-from api.backend.schemas import User, UserInDB, TokenData
-
+from api.backend.auth.schemas import User, UserInDB, TokenData
 from api.backend.database.common import query
 
-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("Auth")
 
 _ = load_dotenv()
 
@@ -118,7 +117,8 @@ async def get_current_user(token: str = Depends(oauth2_scheme)):
         LOG.error(f"Exception occurred: {e}")
         return EMPTY_USER
 
-    user = await get_user(email=token_data.email)
+    user = await get_user(email=token_data.email or "")
+
     if user is None:
         return EMPTY_USER
 
@@ -136,6 +136,7 @@ async def require_user(token: str = Depends(oauth2_scheme)):
         payload: Optional[dict[str, Any]] = jwt.decode(
             token, SECRET_KEY, algorithms=[ALGORITHM]
         )
+
         if not payload:
             raise credentials_exception
 
@@ -149,7 +150,7 @@ async def require_user(token: str = Depends(oauth2_scheme)):
     except JWTError:
         raise credentials_exception
 
-    user = await get_user(email=token_data.email)
+    user = await get_user(email=token_data.email or "")
 
     if user is None:
         raise credentials_exception
4  api/backend/auth/schemas/__init__.py  (new file)
@@ -0,0 +1,4 @@
+# LOCAL
+from .auth import User, Token, UserInDB, TokenData, UserCreate
+
+__all__ = ["User", "Token", "UserInDB", "TokenData", "UserCreate"]
@@ -1 +1,24 @@
+# STL
+import os
+from pathlib import Path
+
 DATABASE_PATH = "data/database.db"
+RECORDINGS_DIR = Path("media/recordings")
+RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
+MEDIA_DIR = Path("media")
+MEDIA_TYPES = [
+    "audio",
+    "documents",
+    "images",
+    "pdfs",
+    "presentations",
+    "spreadsheets",
+    "videos",
+]
+
+REGISTRATION_ENABLED = os.getenv("REGISTRATION_ENABLED", "true").lower() == "true"
+DEFAULT_USER_EMAIL = os.getenv("DEFAULT_USER_EMAIL")
+DEFAULT_USER_PASSWORD = os.getenv("DEFAULT_USER_PASSWORD")
+DEFAULT_USER_FULL_NAME = os.getenv("DEFAULT_USER_FULL_NAME")
+
+LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
@@ -1,3 +1,5 @@
-from .common import insert, QUERIES, update
+# LOCAL
+from .common import insert, update, connect
+from .schema import INIT_QUERY
 
-__all__ = ["insert", "QUERIES", "update"]
+__all__ = ["insert", "update", "INIT_QUERY", "connect"]
@@ -1,12 +1,13 @@
+# STL
+import logging
 import sqlite3
 from typing import Any, Optional
-from api.backend.constants import DATABASE_PATH
-from api.backend.utils import format_json, format_sql_row_to_python
-from api.backend.database.schema import INIT_QUERY
-from api.backend.database.queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
-import logging
 
-LOG = logging.getLogger(__name__)
+# LOCAL
+from api.backend.constants import DATABASE_PATH
+from api.backend.database.utils import format_json, format_sql_row_to_python
+
+LOG = logging.getLogger("Database")
 
 
 def connect():
@@ -25,8 +26,10 @@ def insert(query: str, values: tuple[Any, ...]):
     try:
         _ = cursor.execute(query, copy)
         connection.commit()
+
     except sqlite3.Error as e:
         LOG.error(f"An error occurred: {e}")
+
     finally:
         cursor.close()
         connection.close()
@@ -78,15 +81,9 @@ def update(query: str, values: Optional[tuple[Any, ...]] = None):
         return res.rowcount
     except sqlite3.Error as e:
         LOG.error(f"An error occurred: {e}")
+
     finally:
         cursor.close()
         connection.close()
+
     return 0
-
-
-QUERIES = {
-    "init": INIT_QUERY,
-    "insert_job": JOB_INSERT_QUERY,
-    "delete_job": DELETE_JOB_QUERY,
-}
@@ -1,3 +1,4 @@
-from .queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
+# LOCAL
+from .job.job_queries import DELETE_JOB_QUERY, JOB_INSERT_QUERY
 
 __all__ = ["JOB_INSERT_QUERY", "DELETE_JOB_QUERY"]

api/backend/database/queries/job/job_queries.py  (new file, 68 lines)

# STL
import logging
from typing import Any

# LOCAL
from api.backend.database.utils import format_list_for_query
from api.backend.database.common import query, insert, update

JOB_INSERT_QUERY = """
INSERT INTO jobs
(id, url, elements, user, time_created, result, status, chat, job_options, agent_mode, prompt)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""

DELETE_JOB_QUERY = """
DELETE FROM jobs WHERE id IN ()
"""

LOG = logging.getLogger("Database")


def insert_job(item: dict[str, Any]) -> None:
    insert(
        JOB_INSERT_QUERY,
        (
            item["id"],
            item["url"],
            item["elements"],
            item["user"],
            item["time_created"],
            item["result"],
            item["status"],
            item["chat"],
            item["job_options"],
            item["agent_mode"],
            item["prompt"],
        ),
    )
    LOG.info(f"Inserted item: {item}")


async def get_queued_job():
    queued_job_query = (
        "SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
    )

    res = query(queued_job_query)
    LOG.info(f"Got queued job: {res}")
    return res[0] if res else None


async def update_job(ids: list[str], field: str, value: Any):
    query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
    res = update(query, tuple([value] + ids))
    LOG.info(f"Updated job: {res}")


async def delete_jobs(jobs: list[str]):
    if not jobs:
        LOG.info("No jobs to delete.")
        return False

    query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}"
    res = update(query, tuple(jobs))

    LOG.info(f"Deleted jobs: {res}")

    return res
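
Illustrative sketch (not part of the diff): the delete/update queries above are written with an empty IN () clause because the placeholder list is generated per call by format_list_for_query; the ids below are made up.

# Illustrative only: how the placeholder list is expanded before the query runs.
from api.backend.database.utils import format_list_for_query

ids = ["job-1", "job-2", "job-3"]
sql = f"DELETE FROM jobs WHERE id IN {format_list_for_query(ids)}"
# sql == "DELETE FROM jobs WHERE id IN (?,?,?)"; the ids tuple is passed separately as bind values.
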
@@ -1,9 +0,0 @@
-JOB_INSERT_QUERY = """
-INSERT INTO jobs
-(id, url, elements, user, time_created, result, status, chat, job_options)
-VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
-"""
-
-DELETE_JOB_QUERY = """
-DELETE FROM jobs WHERE id IN ()
-"""

api/backend/database/queries/statistics/statistic_queries.py  (new file, 41 lines)

# LOCAL
from api.backend.database.common import query


async def average_elements_per_link(user: str):
    job_query = """
    SELECT
        DATE(time_created) AS date,
        AVG(json_array_length(elements)) AS average_elements,
        COUNT(*) AS count
    FROM
        jobs
    WHERE
        status = 'Completed' AND user = ?
    GROUP BY
        DATE(time_created)
    ORDER BY
        date ASC;
    """
    results = query(job_query, (user,))

    return results


async def get_jobs_per_day(user: str):
    job_query = """
    SELECT
        DATE(time_created) AS date,
        COUNT(*) AS job_count
    FROM
        jobs
    WHERE
        status = 'Completed' AND user = ?
    GROUP BY
        DATE(time_created)
    ORDER BY
        date ASC;
    """
    results = query(job_query, (user,))

    return results
@@ -27,4 +27,8 @@ CREATE TABLE IF NOT EXISTS cron_jobs (
     time_updated DATETIME NOT NULL,
     FOREIGN KEY (job_id) REFERENCES jobs(id)
 );
+
+ALTER TABLE jobs ADD COLUMN agent_mode BOOLEAN NOT NULL DEFAULT FALSE;
+ALTER TABLE jobs ADD COLUMN prompt STRING;
+ALTER TABLE jobs ADD COLUMN favorite BOOLEAN NOT NULL DEFAULT FALSE;
 """
@@ -1,24 +1,52 @@
-import os
-from api.backend.database.common import connect, QUERIES, insert
+# STL
 import logging
+import sqlite3
+
+# LOCAL
+from api.backend.constants import (
+    DEFAULT_USER_EMAIL,
+    REGISTRATION_ENABLED,
+    DEFAULT_USER_PASSWORD,
+    DEFAULT_USER_FULL_NAME,
+)
 from api.backend.auth.auth_utils import get_password_hash
+from api.backend.database.common import insert, connect
+from api.backend.database.schema import INIT_QUERY
 
-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("Database")
 
 
-def init_database():
+def execute_startup_query():
     cursor = connect()
 
-    for query in QUERIES["init"].strip().split(";"):
-        if query.strip():
+    for query in INIT_QUERY.strip().split(";"):
+        query = query.strip()
+
+        if not query:
+            continue
+
+        try:
             LOG.info(f"Executing query: {query}")
             _ = cursor.execute(query)
 
-    if os.environ.get("REGISTRATION_ENABLED", "True") == "False":
-        default_user_email = os.environ.get("DEFAULT_USER_EMAIL")
-        default_user_password = os.environ.get("DEFAULT_USER_PASSWORD")
-        default_user_full_name = os.environ.get("DEFAULT_USER_FULL_NAME")
+        except sqlite3.OperationalError as e:
+            if "duplicate column name" in str(e).lower():
+                LOG.warning(f"Skipping duplicate column error: {e}")
+                continue
+            else:
+                LOG.error(f"Error executing query: {query}")
+                raise
+
+    cursor.close()
+
+
+def init_database():
+    execute_startup_query()
+
+    if not REGISTRATION_ENABLED:
+        default_user_email = DEFAULT_USER_EMAIL
+        default_user_password = DEFAULT_USER_PASSWORD
+        default_user_full_name = DEFAULT_USER_FULL_NAME
 
         if (
             not default_user_email
@@ -39,5 +67,3 @@ def init_database():
                 default_user_full_name,
             ),
         )
-
-        cursor.close()
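
Illustrative sketch (not part of the diff): the duplicate-column branch above exists because the ALTER TABLE statements in the schema re-run on every startup; the snippet below, using an in-memory database and an illustrative column, shows the sqlite3 error that gets downgraded to a warning.

# Illustrative only: re-adding a column raises the OperationalError tolerated by execute_startup_query().
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE jobs (id TEXT)")
conn.execute("ALTER TABLE jobs ADD COLUMN agent_mode BOOLEAN NOT NULL DEFAULT FALSE")
try:
    conn.execute("ALTER TABLE jobs ADD COLUMN agent_mode BOOLEAN NOT NULL DEFAULT FALSE")
except sqlite3.OperationalError as e:
    print(e)  # duplicate column name: agent_mode
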

api/backend/database/utils.py  (new file, 30 lines)

# STL
import json
from typing import Any


def format_list_for_query(ids: list[str]):
    return (
        f"({','.join(['?' for _ in ids])})"  # Returns placeholders, e.g., "(?, ?, ?)"
    )


def format_sql_row_to_python(row: dict[str, Any]):
    new_row: dict[str, Any] = {}
    for key, value in row.items():
        if isinstance(value, str):
            try:
                new_row[key] = json.loads(value)
            except json.JSONDecodeError:
                new_row[key] = value
        else:
            new_row[key] = value

    return new_row


def format_json(items: list[Any]):
    for idx, item in enumerate(items):
        if isinstance(item, (dict, list)):
            formatted_item = json.dumps(item)
            items[idx] = formatted_item
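
Illustrative sketch (not part of the diff): the two converters above round-trip JSON columns between SQLite strings and Python objects; the row values below are made up.

# Illustrative only: JSON columns go out as strings and come back as Python objects.
from api.backend.database.utils import format_json, format_sql_row_to_python

values = ["job-1", {"multi_page_scrape": False}, ["elem1", "elem2"]]
format_json(values)  # mutates in place -> ['job-1', '{"multi_page_scrape": false}', '["elem1", "elem2"]']

row = {"id": "job-1", "job_options": '{"multi_page_scrape": false}', "status": "Queued"}
print(format_sql_row_to_python(row)["job_options"])  # {'multi_page_scrape': False}
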
@@ -1,17 +1,9 @@
-from .job import (
-    insert,
-    update_job,
-    delete_jobs,
-    get_jobs_per_day,
-    get_queued_job,
-    average_elements_per_link,
-)
+# LOCAL
+from .job import insert, update_job, delete_jobs, get_queued_job
 
 __all__ = [
     "insert",
     "update_job",
     "delete_jobs",
-    "get_jobs_per_day",
     "get_queued_job",
-    "average_elements_per_link",
 ]
@@ -1,15 +1,19 @@
+# STL
+import uuid
+import logging
 import datetime
 from typing import Any
-import uuid
-from api.backend.database.common import insert, query
-from api.backend.models import CronJob
-from apscheduler.schedulers.background import BackgroundScheduler  # type: ignore
-from apscheduler.triggers.cron import CronTrigger  # type: ignore
 
+# PDM
+from apscheduler.triggers.cron import CronTrigger
+from apscheduler.schedulers.background import BackgroundScheduler
+
+# LOCAL
 from api.backend.job import insert as insert_job
-import logging
+from api.backend.schemas.cron import CronJob
+from api.backend.database.common import query, insert
 
-LOG = logging.getLogger("Cron Scheduler")
+LOG = logging.getLogger("Cron")
 
 
 def insert_cron_job(cron_job: CronJob):
@@ -17,6 +21,7 @@ def insert_cron_job(cron_job: CronJob):
     INSERT INTO cron_jobs (id, user_email, job_id, cron_expression, time_created, time_updated)
     VALUES (?, ?, ?, ?, ?, ?)
     """
+
     values = (
         cron_job.id,
         cron_job.user_email,
@@ -36,6 +41,7 @@ def delete_cron_job(id: str, user_email: str):
    DELETE FROM cron_jobs
    WHERE id = ? AND user_email = ?
    """
+
    values = (id, user_email)
    insert(query, values)
 
@@ -3,20 +3,18 @@ import logging
 from typing import Any
 
 # LOCAL
-from api.backend.utils import format_list_for_query
-from api.backend.database.common import (
-    insert as common_insert,
-    query as common_query,
-    QUERIES,
-    update as common_update,
-)
+from api.backend.database.utils import format_list_for_query
+from api.backend.database.common import query as common_query
+from api.backend.database.common import insert as common_insert
+from api.backend.database.common import update as common_update
+from api.backend.database.queries.job.job_queries import JOB_INSERT_QUERY
 
-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("Job")
 
 
 def insert(item: dict[str, Any]) -> None:
     common_insert(
-        QUERIES["insert_job"],
+        JOB_INSERT_QUERY,
         (
             item["id"],
             item["url"],
@@ -27,9 +25,12 @@ def insert(item: dict[str, Any]) -> None:
             item["status"],
             item["chat"],
             item["job_options"],
+            item["agent_mode"],
+            item["prompt"],
         ),
     )
-    LOG.info(f"Inserted item: {item}")
+
+    LOG.debug(f"Inserted item: {item}")
 
 
 async def get_queued_job():
@@ -37,61 +38,22 @@ async def get_queued_job():
         "SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
     )
     res = common_query(query)
-    LOG.info(f"Got queued job: {res}")
+    LOG.debug(f"Got queued job: {res}")
     return res[0] if res else None
 
 
 async def update_job(ids: list[str], field: str, value: Any):
     query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
     res = common_update(query, tuple([value] + ids))
-    LOG.info(f"Updated job: {res}")
+    LOG.debug(f"Updated job: {res}")
 
 
 async def delete_jobs(jobs: list[str]):
     if not jobs:
-        LOG.info("No jobs to delete.")
+        LOG.debug("No jobs to delete.")
         return False
 
     query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}"
     res = common_update(query, tuple(jobs))
 
     return res > 0
-
-
-async def average_elements_per_link(user: str):
-    job_query = """
-    SELECT
-        DATE(time_created) AS date,
-        AVG(json_array_length(elements)) AS average_elements,
-        COUNT(*) AS count
-    FROM
-        jobs
-    WHERE
-        status = 'Completed' AND user = ?
-    GROUP BY
-        DATE(time_created)
-    ORDER BY
-        date ASC;
-    """
-    results = common_query(job_query, (user,))
-
-    return results
-
-
-async def get_jobs_per_day(user: str):
-    job_query = """
-    SELECT
-        DATE(time_created) AS date,
-        COUNT(*) AS job_count
-    FROM
-        jobs
-    WHERE
-        status = 'Completed' AND user = ?
-    GROUP BY
-        DATE(time_created)
-    ORDER BY
-        date ASC;
-    """
-    results = common_query(job_query, (user,))
-
-    return results

api/backend/job/job_router.py  (new file, 248 lines)

# STL
import csv
import uuid
import random
import logging
import datetime
from io import StringIO

# PDM
from fastapi import Depends, APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from apscheduler.triggers.cron import CronTrigger  # type: ignore

# LOCAL
from api.backend.job import insert, update_job, delete_jobs
from api.backend.constants import MEDIA_DIR, MEDIA_TYPES, RECORDINGS_DIR
from api.backend.scheduler import scheduler
from api.backend.schemas.job import Job, UpdateJobs, DownloadJob, DeleteScrapeJobs
from api.backend.auth.schemas import User
from api.backend.schemas.cron import CronJob, DeleteCronJob
from api.backend.database.utils import format_list_for_query
from api.backend.auth.auth_utils import get_current_user
from api.backend.database.common import query
from api.backend.job.utils.text_utils import clean_text
from api.backend.job.models.job_options import FetchOptions
from api.backend.routers.handle_exceptions import handle_exceptions
from api.backend.job.utils.clean_job_format import clean_job_format
from api.backend.job.cron_scheduling.cron_scheduling import (
    get_cron_jobs,
    delete_cron_job,
    insert_cron_job,
    get_cron_job_trigger,
    insert_job_from_cron_job,
)
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results

LOG = logging.getLogger("Job")

job_router = APIRouter()


@job_router.post("/update")
@handle_exceptions(logger=LOG)
async def update(update_jobs: UpdateJobs, _: User = Depends(get_current_user)):
    """Used to update jobs"""
    await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)

    return JSONResponse(content={"message": "Jobs updated successfully."})


@job_router.post("/submit-scrape-job")
@handle_exceptions(logger=LOG)
async def submit_scrape_job(job: Job):
    LOG.info(f"Recieved job: {job}")

    job.id = uuid.uuid4().hex
    job_dict = job.model_dump()
    insert(job_dict)

    return JSONResponse(
        content={"id": job.id, "message": "Job submitted successfully."}
    )


@job_router.post("/retrieve-scrape-jobs")
@handle_exceptions(logger=LOG)
async def retrieve_scrape_jobs(
    fetch_options: FetchOptions, user: User = Depends(get_current_user)
):
    LOG.info(f"Retrieving jobs for account: {user.email}")
    ATTRIBUTES = "chat" if fetch_options.chat else "*"
    job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?"
    results = query(job_query, (user.email,))
    return JSONResponse(content=jsonable_encoder(results[::-1]))


@job_router.get("/job/{id}")
@handle_exceptions(logger=LOG)
async def job(id: str, user: User = Depends(get_current_user)):
    LOG.info(f"Retrieving jobs for account: {user.email}")
    job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?"
    results = query(job_query, (user.email, id))
    return JSONResponse(content=jsonable_encoder(results))


@job_router.post("/download")
@handle_exceptions(logger=LOG)
async def download(download_job: DownloadJob):
    LOG.info(f"Downloading job with ids: {download_job.ids}")
    job_query = (
        f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}"
    )
    results = query(job_query, tuple(download_job.ids))

    if download_job.job_format == "csv":
        csv_buffer = StringIO()
        csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)

        headers = [
            "id",
            "url",
            "element_name",
            "xpath",
            "text",
            "user",
            "time_created",
        ]
        csv_writer.writerow(headers)

        for result in results:
            for res in result["result"]:
                for url, elements in res.items():
                    for element_name, values in elements.items():
                        for value in values:
                            text = clean_text(value.get("text", "")).strip()
                            if text:
                                csv_writer.writerow(
                                    [
                                        result.get("id", "")
                                        + "-"
                                        + str(random.randint(0, 1000000)),
                                        url,
                                        element_name,
                                        value.get("xpath", ""),
                                        text,
                                        result.get("user", ""),
                                        result.get("time_created", ""),
                                    ]
                                )

        _ = csv_buffer.seek(0)
        response = StreamingResponse(
            csv_buffer,
            media_type="text/csv",
        )
        response.headers["Content-Disposition"] = "attachment; filename=export.csv"
        return response

    elif download_job.job_format == "md":
        response = StreamingResponse(
            stream_md_from_job_results(results),
            media_type="text/markdown",
        )

        response.headers["Content-Disposition"] = "attachment; filename=export.md"
        return response


@job_router.get("/job/{id}/convert-to-csv")
@handle_exceptions(logger=LOG)
async def convert_to_csv(id: str):
    job_query = f"SELECT * FROM jobs WHERE id = ?"
    results = query(job_query, (id,))
    return JSONResponse(content=clean_job_format(results))


@job_router.post("/delete-scrape-jobs")
@handle_exceptions(logger=LOG)
async def delete(delete_scrape_jobs: DeleteScrapeJobs):
    result = await delete_jobs(delete_scrape_jobs.ids)
    return (
        JSONResponse(content={"message": "Jobs successfully deleted."})
        if result
        else JSONResponse(content={"error": "Jobs not deleted."})
    )


@job_router.post("/schedule-cron-job")
@handle_exceptions(logger=LOG)
async def schedule_cron_job(cron_job: CronJob):
    if not cron_job.id:
        cron_job.id = uuid.uuid4().hex

    if not cron_job.time_created:
        cron_job.time_created = datetime.datetime.now()

    if not cron_job.time_updated:
        cron_job.time_updated = datetime.datetime.now()

    insert_cron_job(cron_job)

    queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,))

    scheduler.add_job(
        insert_job_from_cron_job,
        get_cron_job_trigger(cron_job.cron_expression),
        id=cron_job.id,
        args=[queried_job[0]],
    )

    return JSONResponse(content={"message": "Cron job scheduled successfully."})


@job_router.post("/delete-cron-job")
@handle_exceptions(logger=LOG)
async def delete_cron_job_request(request: DeleteCronJob):
    if not request.id:
        return JSONResponse(
            content={"error": "Cron job id is required."}, status_code=400
        )

    delete_cron_job(request.id, request.user_email)
    scheduler.remove_job(request.id)

    return JSONResponse(content={"message": "Cron job deleted successfully."})


@job_router.get("/cron-jobs")
@handle_exceptions(logger=LOG)
async def get_cron_jobs_request(user: User = Depends(get_current_user)):
    cron_jobs = get_cron_jobs(user.email)
    return JSONResponse(content=jsonable_encoder(cron_jobs))


@job_router.get("/recordings/{id}")
@handle_exceptions(logger=LOG)
async def get_recording(id: str):
    path = RECORDINGS_DIR / f"{id}.mp4"
    if not path.exists():
        return JSONResponse(content={"error": "Recording not found."}, status_code=404)

    return FileResponse(
        path, headers={"Content-Type": "video/mp4", "Accept-Ranges": "bytes"}
    )


@job_router.get("/get-media")
@handle_exceptions(logger=LOG)
async def get_media(id: str):
    files: dict[str, list[str]] = {}

    for media_type in MEDIA_TYPES:
        path = MEDIA_DIR / media_type / f"{id}"
        files[media_type] = [file.name for file in path.glob("*")]

    return JSONResponse(content={"files": files})


@job_router.get("/media")
@handle_exceptions(logger=LOG)
async def get_media_file(id: str, type: str, file: str):
    path = MEDIA_DIR / type / f"{id}" / file

    if not path.exists():
        return JSONResponse(content={"error": "Media file not found."}, status_code=404)

    return FileResponse(path)
@@ -1,3 +1,5 @@
-from .job_options import JobOptions
+# LOCAL
+from .job import Element, CapturedElement
+from .job_options import Proxy, JobOptions
 
-__all__ = ["JobOptions"]
+__all__ = ["JobOptions", "CapturedElement", "Element", "Proxy"]

api/backend/job/models/job.py  (new file, 15 lines)

from typing import Optional

import pydantic


class Element(pydantic.BaseModel):
    name: str
    xpath: str
    url: Optional[str] = None


class CapturedElement(pydantic.BaseModel):
    xpath: str
    text: str
    name: str
@@ -1,8 +1,19 @@
-from pydantic import BaseModel
+# STL
 from typing import Any, Optional
+
+# PDM
+from pydantic import BaseModel
+
+# LOCAL
 from api.backend.job.models.site_map import SiteMap
 
 
+class Proxy(BaseModel):
+    server: str
+    username: Optional[str] = None
+    password: Optional[str] = None
+
+
 class FetchOptions(BaseModel):
     chat: Optional[bool] = None
 
@@ -10,7 +21,8 @@ class FetchOptions(BaseModel):
 class JobOptions(BaseModel):
     multi_page_scrape: bool = False
     custom_headers: dict[str, Any] = {}
-    proxies: list[str] = []
+    proxies: list[Proxy] = []
     site_map: Optional[SiteMap] = None
     collect_media: bool = False
     custom_cookies: list[dict[str, Any]] = []
+    return_html: bool = False
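
Illustrative sketch (not part of the diff): proxies are now typed as list[Proxy] instead of bare strings, so job options can carry per-proxy credentials; the server address and credentials below are placeholders.

# Illustrative only: building JobOptions with the new Proxy model and return_html flag.
from api.backend.job.models.job_options import JobOptions, Proxy

options = JobOptions(
    multi_page_scrape=True,
    proxies=[Proxy(server="http://proxy.example.com:8080", username="user", password="secret")],
    return_html=False,
)
print(options.model_dump()["proxies"][0]["server"])  # nested models serialize to plain dicts
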
@@ -1,11 +1,12 @@
+# STL
+import logging
 from typing import Any, Optional
 from urllib.parse import urlparse
 
+# PDM
 from playwright.async_api import Page, BrowserContext
 
-import logging
-
-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("Job")
 
 
 async def add_custom_cookies(
@@ -18,8 +19,8 @@ async def add_custom_cookies(
 
     for cookie in custom_cookies:
         cookie_dict = {
-            "name": cookie.get("name", "default_name"),
-            "value": cookie.get("value", "default_value"),
+            "name": cookie.get("name", ""),
+            "value": cookie.get("value", ""),
             "domain": domain,
             "path": "/",
         }
@@ -1,20 +1,24 @@
+# STL
 import os
-from pathlib import Path
-from urllib.parse import urlparse
+import re
+import logging
 from typing import Dict, List
+from pathlib import Path
+from urllib.parse import urljoin, urlparse
 
+# PDM
 import aiohttp
 from playwright.async_api import Page
 
-from api.backend.utils import LOG
+LOG = logging.getLogger("Job")
 
 
-async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
+async def collect_media(id: str, page: Page) -> dict[str, list[dict[str, str]]]:
     media_types = {
         "images": "img",
         "videos": "video",
         "audio": "audio",
-        "pdfs": 'a[href$=".pdf"]',
+        "pdfs": 'a[href$=".pdf"], a[href*=".pdf#page="]',
         "documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]',
         "presentations": 'a[href$=".ppt"], a[href$=".pptx"]',
         "spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]',
@@ -48,6 +52,11 @@ async def collect_media(id: str, page: Page) -> dict[str, list[dict[str, str]]]:
                     root_domain = f"{root_url.scheme}://{root_url.netloc}"
                     url = f"{root_domain}{url}"
 
+                if url and re.match(r"^[\w\-]+/", url):
+                    root_url = urlparse(page.url)
+                    root_domain = f"{root_url.scheme}://{root_url.netloc}"
+                    url = urljoin(root_domain + "/", url)
+
                 if url and url.startswith(("http://", "https://")):
                     try:
                         parsed = urlparse(url)
@@ -67,15 +76,20 @@ async def collect_media(id: str, page: Page) -> dict[str, list[dict[str, str]]]:
                        }.get(media_type, "")
                        filename += ext
 
-                    file_path = media_dir / filename
+                    if not os.path.exists(media_dir / id):
+                        os.makedirs(media_dir / id, exist_ok=True)
+
+                    file_path = media_dir / id / f"{filename}"
 
                    async with session.get(url) as response:
                        response.raise_for_status()
 
                        with open(file_path, "wb") as f:
                            while True:
                                chunk = await response.content.read(8192)
                                if not chunk:
                                    break
 
                                f.write(chunk)
 
                    urls.append({"url": url, "local_path": str(file_path)})
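
Illustrative sketch (not part of the diff): the new re.match / urljoin branch above covers media links such as assets/logo.png that are relative to the current page rather than to the site root; the URLs below are placeholders, and only the standard library is used.

# Illustrative only: root-relative vs page-relative links resolve differently.
from urllib.parse import urljoin

root_domain = "https://example.com"
print(f"{root_domain}/assets/logo.png")                # direct concatenation works for hrefs starting with "/"
print(urljoin(root_domain + "/", "assets/logo.png"))   # https://example.com/assets/logo.png
print(urljoin(root_domain + "/", "/assets/logo.png"))  # https://example.com/assets/logo.png
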
@@ -1,53 +1,45 @@
-import logging
+# STL
 import random
-from typing import Any, Optional, cast
+import logging
+from typing import Any, cast
+from urllib.parse import urljoin, urlparse
 
-from bs4 import BeautifulSoup, Tag
+# PDM
+from bs4 import Tag, BeautifulSoup
 from lxml import etree
 from camoufox import AsyncCamoufox
 from playwright.async_api import Page
-from urllib.parse import urlparse, urljoin
 
-from api.backend.models import Element, CapturedElement
-from api.backend.job.scraping.scraping_utils import scrape_content
+# LOCAL
+from api.backend.constants import RECORDINGS_ENABLED
+from api.backend.job.models import Element, CapturedElement
+from api.backend.job.utils.text_utils import clean_text
+from api.backend.job.scraping.add_custom import add_custom_items
+from api.backend.job.scraping.scraping_utils import (
+    sxpath,
+    is_same_domain,
+    scrape_content,
+)
 from api.backend.job.site_mapping.site_mapping import handle_site_mapping
 
-from api.backend.job.scraping.add_custom import add_custom_items
-
-LOG = logging.getLogger(__name__)
-
-
-def is_same_domain(url: str, original_url: str) -> bool:
-    parsed_url = urlparse(url)
-    parsed_original_url = urlparse(original_url)
-    return parsed_url.netloc == parsed_original_url.netloc or parsed_url.netloc == ""
-
-
-def clean_xpath(xpath: str) -> str:
-    parts = xpath.split("/")
-    clean_parts = ["/" if part == "" else part for part in parts]
-    clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'")
-    LOG.info(f"Cleaned xpath: {clean_xpath}")
-
-    return clean_xpath
-
-
-def sxpath(context: etree._Element, xpath: str):
-    return context.xpath(xpath)
+LOG = logging.getLogger("Job")
 
 
 async def make_site_request(
+    id: str,
     url: str,
-    headers: Optional[dict[str, Any]],
-    multi_page_scrape: bool = False,
+    job_options: dict[str, Any],
     visited_urls: set[str] = set(),
     pages: set[tuple[str, str]] = set(),
     original_url: str = "",
-    proxies: Optional[list[str]] = None,
-    site_map: Optional[dict[str, Any]] = None,
-    collect_media: bool = False,
-    custom_cookies: Optional[list[dict[str, Any]]] = None,
 ):
+    headers = job_options["custom_headers"]
+    multi_page_scrape = job_options["multi_page_scrape"]
+    proxies = job_options["proxies"]
+    site_map = job_options["site_map"]
+    collect_media = job_options["collect_media"]
+    custom_cookies = job_options["custom_cookies"]
+
     if url in visited_urls:
         return
 
@@ -57,8 +49,9 @@ async def make_site_request(
         proxy = random.choice(proxies)
         LOG.info(f"Using proxy: {proxy}")
 
-    async with AsyncCamoufox(headless=True, proxy=proxy) as browser:
+    async with AsyncCamoufox(headless=not RECORDINGS_ENABLED, proxy=proxy) as browser:
         page: Page = await browser.new_page()
+        await page.set_viewport_size({"width": 1920, "height": 1080})
 
         # Add cookies and headers
         await add_custom_items(url, page, custom_cookies, headers)
@@ -67,21 +60,21 @@ async def make_site_request(
 
         try:
             await page.goto(url, timeout=60000)
-            await page.wait_for_load_state("networkidle", timeout=10000)
+            await page.wait_for_load_state("networkidle")
 
             final_url = page.url
 
             visited_urls.add(url)
             visited_urls.add(final_url)
 
-            html_content = await scrape_content(page, pages, collect_media)
+            html_content = await scrape_content(id, page, pages, collect_media)
 
             html_content = await page.content()
             pages.add((html_content, final_url))
 
             if site_map:
                 await handle_site_mapping(
-                    site_map, page, pages, collect_media=collect_media
+                    id, site_map, page, pages, collect_media=collect_media
                 )
 
         finally:
@@ -108,20 +101,18 @@ async def make_site_request(
 
             if link not in visited_urls and is_same_domain(link, original_url):
                 await make_site_request(
+                    id,
                     link,
-                    headers=headers,
-                    multi_page_scrape=multi_page_scrape,
+                    job_options=job_options,
                     visited_urls=visited_urls,
                     pages=pages,
                    original_url=original_url,
-                    proxies=proxies,
-                    site_map=site_map,
-                    collect_media=collect_media,
-                    custom_cookies=custom_cookies,
                 )
 
 
-async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element]):
+async def collect_scraped_elements(
+    page: tuple[str, str], xpaths: list[Element], return_html: bool
+):
     soup = BeautifulSoup(page[0], "lxml")
     root = etree.HTML(str(soup))
 
@@ -131,12 +122,24 @@ async def collect_scraped_elements(
         el = sxpath(root, elem.xpath)
 
         for e in el:  # type: ignore
+            if return_html:
+                elements[elem.name] = [
+                    CapturedElement(
+                        xpath=elem.xpath,
+                        text=page[0],
+                        name=elem.name,
+                    )
+                ]
+                continue
+
             text = (
-                "\t".join(str(t) for t in e.itertext())
+                " ".join(str(t) for t in e.itertext())
                 if isinstance(e, etree._Element)
                 else str(e)  # type: ignore
             )
+
+            text = clean_text(text)
+
             captured_element = CapturedElement(
                 xpath=elem.xpath, text=text, name=elem.name
             )
@@ -150,34 +153,28 @@ async def collect_scraped_elements(
 
 
 async def scrape(
+    id: str,
     url: str,
     xpaths: list[Element],
-    headers: Optional[dict[str, Any]] = None,
-    multi_page_scrape: bool = False,
-    proxies: Optional[list[str]] = None,
-    site_map: Optional[dict[str, Any]] = None,
-    collect_media: bool = False,
-    custom_cookies: Optional[list[dict[str, Any]]] = None,
+    job_options: dict[str, Any],
 ):
     visited_urls: set[str] = set()
     pages: set[tuple[str, str]] = set()
 
     await make_site_request(
+        id,
        url,
-        headers=headers,
-        multi_page_scrape=multi_page_scrape,
+        job_options=job_options,
        visited_urls=visited_urls,
        pages=pages,
        original_url=url,
-        proxies=proxies,
-        site_map=site_map,
-        collect_media=collect_media,
-        custom_cookies=custom_cookies,
    )
 
    elements: list[dict[str, dict[str, list[CapturedElement]]]] = []
 
    for page in pages:
-        elements.append(await collect_scraped_elements(page, xpaths))
+        elements.append(
+            await collect_scraped_elements(page, xpaths, job_options["return_html"])
+        )
 
    return elements
@@ -1,14 +1,21 @@
+# STL
 import asyncio
+import logging
 from typing import Set, Tuple
+from urllib.parse import urlparse
 
+# PDM
+from lxml import etree
 from playwright.async_api import Page
 
-from api.backend.utils import LOG
+# LOCAL
 from api.backend.job.scraping.collect_media import collect_media as collect_media_utils
 
+LOG = logging.getLogger("Job")
+
 
 async def scrape_content(
-    page: Page, pages: Set[Tuple[str, str]], collect_media: bool
+    id: str, page: Page, pages: Set[Tuple[str, str]], collect_media: bool
 ) -> str:
     last_height = await page.evaluate("document.body.scrollHeight")
 
@@ -27,6 +34,25 @@ async def scrape_content(
 
     if collect_media:
         LOG.info("Collecting media")
-        await collect_media_utils(page)
+        await collect_media_utils(id, page)
 
     return html
+
+
+def is_same_domain(url: str, original_url: str) -> bool:
+    parsed_url = urlparse(url)
+    parsed_original_url = urlparse(original_url)
+    return parsed_url.netloc == parsed_original_url.netloc or parsed_url.netloc == ""
+
+
+def clean_xpath(xpath: str) -> str:
+    parts = xpath.split("/")
+    clean_parts = ["/" if part == "" else part for part in parts]
+    clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'")
+    LOG.info(f"Cleaned xpath: {clean_xpath}")
+
+    return clean_xpath
+
+
+def sxpath(context: etree._Element, xpath: str):
+    return context.xpath(xpath)
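
Illustrative sketch (not part of the diff): a self-contained check of the same-domain helper that moved into scraping_utils above, with made-up URLs.

# Illustrative only: relative links (empty netloc) count as same-domain.
from urllib.parse import urlparse

def is_same_domain(url: str, original_url: str) -> bool:
    parsed_url = urlparse(url)
    parsed_original_url = urlparse(original_url)
    return parsed_url.netloc == parsed_original_url.netloc or parsed_url.netloc == ""

print(is_same_domain("https://example.com/page/2", "https://example.com/"))  # True
print(is_same_domain("/relative/link", "https://example.com/"))              # True (no netloc)
print(is_same_domain("https://other.org/", "https://example.com/"))          # False
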
@@ -1,14 +1,17 @@
-import logging
+# STL
 import asyncio
+import logging
 from copy import deepcopy
 from typing import Any
 
+# PDM
 from playwright.async_api import Page
 
+# LOCAL
 from api.backend.job.models.site_map import Action, SiteMap
 from api.backend.job.scraping.scraping_utils import scrape_content
 
-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("Job")
 
 
 def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]:
@@ -24,7 +27,6 @@ def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]:
 async def handle_input(action: Action, page: Page) -> bool:
     try:
         element = page.locator(f"xpath={action.xpath}")
-        await element.wait_for(state="visible", timeout=10000)
         LOG.info(f"Sending keys: {action.input} to element: {action.xpath}")
         await element.fill(action.input)
         return True
@@ -36,7 +38,6 @@ async def handle_input(action: Action, page: Page) -> bool:
 async def handle_click(action: Action, page: Page) -> bool:
     try:
         element = page.locator(f"xpath={action.xpath}")
-        await element.wait_for(state="visible", timeout=10000)
         LOG.info(f"Clicking element: {action.xpath}")
         await element.click()
         return True
@@ -52,6 +53,7 @@ ACTION_MAP = {
 
 
 async def handle_site_mapping(
+    id: str,
     site_map_dict: dict[str, Any],
     page: Page,
     pages: set[tuple[str, str]],
@@ -68,11 +70,11 @@ async def handle_site_mapping(
 
     await asyncio.sleep(2)
 
-    await scrape_content(page, pages, collect_media=collect_media)
+    await scrape_content(id, page, pages, collect_media=collect_media)
 
     cleared_site_map_dict = clear_done_actions(site_map_dict)
 
     if cleared_site_map_dict["actions"]:
         await handle_site_mapping(
-            cleared_site_map_dict, page, pages, collect_media=collect_media
+            id, cleared_site_map_dict, page, pages, collect_media=collect_media
         )
@@ -1,6 +1,8 @@
+# STL
 from typing import Any
 
-from api.backend.utils import clean_text
+# LOCAL
+from api.backend.job.utils.text_utils import clean_text
 
 
 def clean_job_format(jobs: list[dict[str, Any]]) -> dict[str, Any]:
@@ -1,6 +1,8 @@
+# STL
 from typing import Any
 
-from api.backend.utils import clean_text
+# LOCAL
+from api.backend.job.utils.text_utils import clean_text
 
 
 def stream_md_from_job_results(jobs: list[dict[str, Any]]):

api/backend/job/utils/text_utils.py  (new file, 10 lines)

def clean_text(text: str):
    text = text.strip()
    text = text.replace("\n", " ")
    text = text.replace("\t", " ")
    text = text.replace("\r", " ")
    text = text.replace("\f", " ")
    text = text.replace("\v", " ")
    text = text.replace("\b", " ")
    text = text.replace("\a", " ")
    return text
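
Illustrative sketch (not part of the diff): clean_text above strips the ends and flattens control whitespace to single spaces; the sample string is made up.

# Illustrative only: newlines, tabs and other control characters collapse to spaces.
from api.backend.job.utils.text_utils import clean_text

print(clean_text("  Product name\n\tprice:\r$10  "))  # "Product name  price: $10"
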

api/backend/routers/handle_exceptions.py  (new file, 31 lines)

# STL
import logging
import traceback
from typing import Any, Union, Callable, Awaitable
from functools import wraps

# PDM
from fastapi.responses import JSONResponse


def handle_exceptions(
    logger: logging.Logger,
) -> Callable[
    [Callable[..., Awaitable[Any]]], Callable[..., Awaitable[Union[Any, JSONResponse]]]
]:
    def decorator(
        func: Callable[..., Awaitable[Any]],
    ) -> Callable[..., Awaitable[Union[Any, JSONResponse]]]:
        @wraps(func)
        async def wrapper(*args: Any, **kwargs: Any) -> Union[Any, JSONResponse]:
            try:
                return await func(*args, **kwargs)

            except Exception as e:
                logger.error(f"Exception occurred: {e}")
                traceback.print_exc()
                return JSONResponse(content={"error": str(e)}, status_code=500)

        return wrapper

    return decorator
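
Illustrative sketch (not part of the diff): the decorator above replaces the per-route try/except blocks removed from the old router; the route path and logger name below are placeholders.

# Illustrative only: any uncaught exception in the route body becomes a JSON 500 response.
import logging
from fastapi import APIRouter
from api.backend.routers.handle_exceptions import handle_exceptions

LOG = logging.getLogger("Example")
router = APIRouter()

@router.get("/example")
@handle_exceptions(logger=LOG)
async def example_route():
    raise RuntimeError("boom")  # returned to the client as {"error": "boom"} with status 500
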
@@ -1,233 +0,0 @@
-# STL
-import datetime
-import uuid
-import traceback
-from io import StringIO
-import csv
-import logging
-import random
-
-# PDM
-from fastapi import Depends, APIRouter
-from fastapi.encoders import jsonable_encoder
-from fastapi.responses import JSONResponse, StreamingResponse
-from api.backend.scheduler import scheduler
-from apscheduler.triggers.cron import CronTrigger  # type: ignore
-
-# LOCAL
-from api.backend.job import insert, update_job, delete_jobs
-from api.backend.models import (
-    DeleteCronJob,
-    UpdateJobs,
-    DownloadJob,
-    DeleteScrapeJobs,
-    Job,
-    CronJob,
-)
-from api.backend.schemas import User
-from api.backend.auth.auth_utils import get_current_user
-from api.backend.utils import clean_text, format_list_for_query
-from api.backend.job.models.job_options import FetchOptions
-
-from api.backend.database.common import query
-
-from api.backend.job.cron_scheduling.cron_scheduling import (
-    delete_cron_job,
-    get_cron_job_trigger,
-    insert_cron_job,
-    get_cron_jobs,
-    insert_job_from_cron_job,
-)
-
-from api.backend.job.utils.clean_job_format import clean_job_format
-from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results
-
-LOG = logging.getLogger(__name__)
-
-job_router = APIRouter()
-
-
-@job_router.post("/update")
-async def update(update_jobs: UpdateJobs, _: User = Depends(get_current_user)):
-    """Used to update jobs"""
-    await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)
-
-
-@job_router.post("/submit-scrape-job")
-async def submit_scrape_job(job: Job):
-    LOG.info(f"Recieved job: {job}")
-    try:
-        job.id = uuid.uuid4().hex
-
-        job_dict = job.model_dump()
-        insert(job_dict)
-
-        return JSONResponse(content={"id": job.id})
-    except Exception as e:
-        LOG.error(f"Exception occurred: {traceback.format_exc()}")
-        return JSONResponse(content={"error": str(e)}, status_code=500)
-
-
-@job_router.post("/retrieve-scrape-jobs")
-async def retrieve_scrape_jobs(
-    fetch_options: FetchOptions, user: User = Depends(get_current_user)
-):
-    LOG.info(f"Retrieving jobs for account: {user.email}")
-    ATTRIBUTES = "chat" if fetch_options.chat else "*"
-
-    try:
-        job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?"
-        results = query(job_query, (user.email,))
-        return JSONResponse(content=jsonable_encoder(results[::-1]))
-    except Exception as e:
-        LOG.error(f"Exception occurred: {e}")
-        return JSONResponse(content=[], status_code=500)
-
-
-@job_router.get("/job/{id}")
-async def job(id: str, user: User = Depends(get_current_user)):
-    LOG.info(f"Retrieving jobs for account: {user.email}")
-
-    try:
-        job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?"
-        results = query(job_query, (user.email, id))
-        return JSONResponse(content=jsonable_encoder(results))
-    except Exception as e:
-        LOG.error(f"Exception occurred: {e}")
-        return JSONResponse(content={"error": str(e)}, status_code=500)
-
-
-@job_router.post("/download")
-async def download(download_job: DownloadJob):
-    LOG.info(f"Downloading job with ids: {download_job.ids}")
-
-    try:
-        job_query = (
-            f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}"
-        )
-        results = query(job_query, tuple(download_job.ids))
-
-        if download_job.job_format == "csv":
-            csv_buffer = StringIO()
-            csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
-
-            headers = [
-                "id",
-                "url",
-                "element_name",
-                "xpath",
-                "text",
-                "user",
-                "time_created",
-            ]
-            csv_writer.writerow(headers)
-
-            for result in results:
-                for res in result["result"]:
-                    for url, elements in res.items():
-                        for element_name, values in elements.items():
-                            for value in values:
-                                text = clean_text(value.get("text", "")).strip()
-                                if text:
-                                    csv_writer.writerow(
-                                        [
-                                            result.get("id", "")
-                                            + "-"
-                                            + str(random.randint(0, 1000000)),
-                                            url,
-                                            element_name,
-                                            value.get("xpath", ""),
-                                            text,
-                                            result.get("user", ""),
-                                            result.get("time_created", ""),
-                                        ]
-                                    )
-
-            _ = csv_buffer.seek(0)
-            response = StreamingResponse(
-                csv_buffer,
-                media_type="text/csv",
-            )
-            response.headers["Content-Disposition"] = "attachment; filename=export.csv"
-            return response
-
-        elif download_job.job_format == "md":
-            response = StreamingResponse(
-                stream_md_from_job_results(results),
-                media_type="text/markdown",
-            )
-
-            response.headers["Content-Disposition"] = "attachment; filename=export.md"
-            return response
-
-    except Exception as e:
-        LOG.error(f"Exception occurred: {e}")
-        traceback.print_exc()
-        return {"error": str(e)}
-
-
-@job_router.get("/job/{id}/convert-to-csv")
-async def convert_to_csv(id: str):
-    try:
-        job_query = f"SELECT * FROM jobs WHERE id = ?"
-        results = query(job_query, (id,))
-
-        return JSONResponse(content=clean_job_format(results))
-    except Exception as e:
-        LOG.error(f"Exception occurred: {e}")
-        traceback.print_exc()
-        return {"error": str(e)}
-
-
-@job_router.post("/delete-scrape-jobs")
-async def delete(delete_scrape_jobs: DeleteScrapeJobs):
-    result = await delete_jobs(delete_scrape_jobs.ids)
-    return (
-        JSONResponse(content={"message": "Jobs successfully deleted."})
-        if result
-        else JSONResponse({"error": "Jobs not deleted."})
-    )
-
-
-@job_router.post("/schedule-cron-job")
-async def schedule_cron_job(cron_job: CronJob):
-    if not cron_job.id:
-        cron_job.id = uuid.uuid4().hex
-
-    if not cron_job.time_created:
-        cron_job.time_created = datetime.datetime.now()
-
-    if not cron_job.time_updated:
-        cron_job.time_updated = datetime.datetime.now()
-
-    insert_cron_job(cron_job)
-
-    queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,))
-
-    scheduler.add_job(
-        insert_job_from_cron_job,
-        get_cron_job_trigger(cron_job.cron_expression),
-        id=cron_job.id,
-        args=[queried_job[0]],
-    )
-
-    return JSONResponse(content={"message": "Cron job scheduled successfully."})
-
-
-@job_router.post("/delete-cron-job")
-async def delete_cron_job_request(request: DeleteCronJob):
-    if not request.id:
-        return JSONResponse(
-            content={"error": "Cron job id is required."}, status_code=400
-        )
-
-    delete_cron_job(request.id, request.user_email)
-    scheduler.remove_job(request.id)
-
-    return JSONResponse(content={"message": "Cron job deleted successfully."})
-
-
-@job_router.get("/cron-jobs")
-async def get_cron_jobs_request(user: User = Depends(get_current_user)):
-    cron_jobs = get_cron_jobs(user.email)
-    return JSONResponse(content=jsonable_encoder(cron_jobs))
@@ -1,3 +1,4 @@
-from apscheduler.schedulers.background import BackgroundScheduler  # type: ignore
+# PDM
+from apscheduler.schedulers.background import BackgroundScheduler
 
 scheduler = BackgroundScheduler()

api/backend/schemas/cron.py  (new file, 17 lines)

from typing import Optional, Union
from datetime import datetime
import pydantic


class CronJob(pydantic.BaseModel):
    id: Optional[str] = None
    user_email: str
    job_id: str
    cron_expression: str
    time_created: Optional[Union[datetime, str]] = None
    time_updated: Optional[Union[datetime, str]] = None


class DeleteCronJob(pydantic.BaseModel):
    id: str
    user_email: str
@@ -1,24 +1,24 @@
-# STL
 from typing import Any, Literal, Optional, Union
 from datetime import datetime
 
-# LOCAL
 from api.backend.job.models.job_options import JobOptions
 
-# PDM
 import pydantic
 
-
-class Element(pydantic.BaseModel):
-    name: str
-    xpath: str
-    url: Optional[str] = None
+from api.backend.job.models import Element, CapturedElement
 
 
-class CapturedElement(pydantic.BaseModel):
-    xpath: str
-    text: str
-    name: str
+class Job(pydantic.BaseModel):
+    id: Optional[str] = None
+    url: str
+    elements: list[Element]
+    user: str = ""
+    time_created: Optional[Union[datetime, str]] = None
+    result: list[dict[str, dict[str, list[CapturedElement]]]] = []
+    job_options: JobOptions
+    status: str = "Queued"
+    chat: Optional[str] = None
+    agent_mode: bool = False
+    prompt: Optional[str] = None
+    favorite: bool = False
 
 
 class RetrieveScrapeJobs(pydantic.BaseModel):
@@ -34,41 +34,7 @@ class DeleteScrapeJobs(pydantic.BaseModel):
     ids: list[str]
 
 
-class GetStatistics(pydantic.BaseModel):
-    user: str
-
-
 class UpdateJobs(pydantic.BaseModel):
     ids: list[str]
     field: str
     value: Any
-
-
-class AI(pydantic.BaseModel):
-    messages: list[Any]
-
-
-class Job(pydantic.BaseModel):
-    id: Optional[str] = None
-    url: str
-    elements: list[Element]
-    user: str = ""
-    time_created: Optional[Union[datetime, str]] = None
-    result: list[dict[str, dict[str, list[CapturedElement]]]] = []
-    job_options: JobOptions
-    status: str = "Queued"
-    chat: Optional[str] = None
-
-
-class CronJob(pydantic.BaseModel):
-    id: Optional[str] = None
-    user_email: str
-    job_id: str
-    cron_expression: str
-    time_created: Optional[Union[datetime, str]] = None
-    time_updated: Optional[Union[datetime, str]] = None
-
-
-class DeleteCronJob(pydantic.BaseModel):
-    id: str
-    user_email: str
@@ -2,28 +2,30 @@
 import logging
 
 # PDM
-from fastapi import APIRouter, Depends
+from fastapi import Depends, APIRouter
 
 # LOCAL
-from api.backend.job import (
+from api.backend.auth.schemas import User
+from api.backend.auth.auth_utils import get_current_user
+from api.backend.routers.handle_exceptions import handle_exceptions
+from api.backend.database.queries.statistics.statistic_queries import (
     get_jobs_per_day,
     average_elements_per_link,
 )
-from api.backend.auth.auth_utils import get_current_user
-from api.backend.schemas import User
 
-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("Statistics")
 
 stats_router = APIRouter()
 
 
 @stats_router.get("/statistics/get-average-element-per-link")
+@handle_exceptions(logger=LOG)
 async def get_average_element_per_link(user: User = Depends(get_current_user)):
     return await average_elements_per_link(user.email)
 
 
 @stats_router.get("/statistics/get-average-jobs-per-day")
+@handle_exceptions(logger=LOG)
 async def average_jobs_per_day(user: User = Depends(get_current_user)):
     data = await get_jobs_per_day(user.email)
     return data
api/backend/tests/conftest.py (new file, 63 lines)
@@ -0,0 +1,63 @@
# STL
import os
import sqlite3
from typing import Generator
from unittest.mock import patch

# PDM
import pytest
from proxy import Proxy

# LOCAL
from api.backend.database.schema import INIT_QUERY
from api.backend.tests.constants import TEST_DB_PATH


@pytest.fixture(scope="session", autouse=True)
def running_proxy():
    proxy = Proxy(["--hostname", "127.0.0.1", "--port", "8080"])
    proxy.setup()
    yield proxy
    proxy.shutdown()


@pytest.fixture(scope="session", autouse=True)
def patch_database_path():
    with patch("api.backend.database.common.DATABASE_PATH", TEST_DB_PATH):
        yield


@pytest.fixture(scope="session", autouse=True)
def patch_recordings_enabled():
    with patch("api.backend.job.scraping.scraping.RECORDINGS_ENABLED", False):
        yield


@pytest.fixture(scope="session")
def test_db_path() -> str:
    return TEST_DB_PATH


@pytest.fixture(scope="session", autouse=True)
def test_db(test_db_path: str) -> Generator[str, None, None]:
    """Create a fresh test database for each test function."""
    os.makedirs(os.path.dirname(test_db_path), exist_ok=True)

    if os.path.exists(test_db_path):
        os.remove(test_db_path)

    conn = sqlite3.connect(test_db_path)
    cursor = conn.cursor()

    for query in INIT_QUERY.strip().split(";"):
        query = query.strip()
        if query:
            cursor.execute(query)

    conn.commit()
    conn.close()

    yield test_db_path

    if os.path.exists(test_db_path):
        os.remove(test_db_path)

api/backend/tests/constants.py (new file, 1 line)
@@ -0,0 +1 @@
TEST_DB_PATH = "tests/test_db.sqlite"
@@ -1,7 +1,13 @@
-from api.backend.models import Element, Job, JobOptions, CapturedElement
+# STL
 import uuid
 
+# PDM
 from faker import Faker
 
+# LOCAL
+from api.backend.job.models import Element, JobOptions, CapturedElement
+from api.backend.schemas.job import Job
+
 fake = Faker()
 
 
@@ -1,8 +1,13 @@
+# STL
+from unittest.mock import AsyncMock, patch
+
+# PDM
 import pytest
 from fastapi.testclient import TestClient
-from unittest.mock import AsyncMock, patch
 
+# LOCAL
 from api.backend.app import app
-from api.backend.models import DownloadJob
+from api.backend.schemas.job import DownloadJob
 from api.backend.tests.factories.job_factory import create_completed_job
 
 client = TestClient(app)
@@ -13,8 +18,8 @@ mocked_random_int = 123456
 
 
 @pytest.mark.asyncio
-@patch("api.backend.routers.job_router.query")
-@patch("api.backend.routers.job_router.random.randint")
+@patch("api.backend.job.job_router.query")
+@patch("api.backend.job.job_router.random.randint")
 async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
     # Ensure the mock returns immediately
     mock_query.return_value = mock_results
@@ -1,12 +1,26 @@
-import pytest
+# STL
 import logging
 from typing import Dict
-from playwright.async_api import async_playwright, Cookie, Route
+from datetime import datetime
 
+# PDM
+import pytest
+from fastapi.testclient import TestClient
+from playwright.async_api import Route, Cookie, async_playwright
+
+# LOCAL
+from api.backend.app import app
+from api.backend.job.models import Proxy, Element, JobOptions
+from api.backend.schemas.job import Job
+from api.backend.database.common import query
+from api.backend.job.scraping.scraping import scrape
 from api.backend.job.scraping.add_custom import add_custom_items
 
 logging.basicConfig(level=logging.DEBUG)
 LOG = logging.getLogger(__name__)
 
+client = TestClient(app)
+
 
 @pytest.mark.asyncio
 async def test_add_custom_items():
@@ -51,3 +65,46 @@ async def test_add_custom_items():
     assert captured_headers.get("user-agent") == "test-agent"
 
     await browser.close()
+
+
+@pytest.mark.asyncio
+async def test_proxies():
+    job = Job(
+        url="https://example.com",
+        elements=[Element(xpath="//div", name="test")],
+        job_options=JobOptions(
+            proxies=[
+                Proxy(
+                    server="127.0.0.1:8080",
+                    username="user",
+                    password="pass",
+                )
+            ],
+        ),
+        time_created=datetime.now().isoformat(),
+    )
+
+    response = client.post("/submit-scrape-job", json=job.model_dump())
+    assert response.status_code == 200
+
+    jobs = query("SELECT * FROM jobs")
+    job = jobs[0]
+
+    assert job is not None
+    assert job["job_options"]["proxies"] == [
+        {
+            "server": "127.0.0.1:8080",
+            "username": "user",
+            "password": "pass",
+        }
+    ]
+
+    response = await scrape(
+        id=job["id"],
+        url=job["url"],
+        xpaths=[Element(**e) for e in job["elements"]],
+        job_options=job["job_options"],
+    )
+
+    example_response = response[0]["https://example.com/"]
+    assert example_response is not {}
api/backend/tests/utilities/database.py (new file, 17 lines)
@@ -0,0 +1,17 @@
# STL
import sqlite3

# LOCAL
from api.backend.database.schema import INIT_QUERY
from api.backend.tests.constants import TEST_DB_PATH


def connect_to_db():
    conn = sqlite3.connect(TEST_DB_PATH)
    cur = conn.cursor()

    for query in INIT_QUERY.split(";"):
        cur.execute(query)

    conn.commit()
    return conn, cur
@@ -1,17 +1,10 @@
-from typing import Any, Optional
+# STL
 import logging
-import json
+from typing import Optional
 
 LOG = logging.getLogger(__name__)
 
 
-def clean_text(text: str):
-    text = text.replace("\r\n", "\n")  # Normalize newlines
-    text = text.replace("\n", "\\n")  # Escape newlines
-    text = text.replace('"', '\\"')  # Escape double quotes
-    return text
-
-
 def get_log_level(level_name: Optional[str]) -> int:
     level = logging.INFO
 
@@ -20,30 +13,3 @@ def get_log_level(level_name: Optional[str]) -> int:
         level = getattr(logging, level_name, logging.INFO)
 
     return level
-
-
-def format_list_for_query(ids: list[str]):
-    return (
-        f"({','.join(['?' for _ in ids])})"  # Returns placeholders, e.g., "(?, ?, ?)"
-    )
-
-
-def format_sql_row_to_python(row: dict[str, Any]):
-    new_row: dict[str, Any] = {}
-    for key, value in row.items():
-        if isinstance(value, str):
-            try:
-                new_row[key] = json.loads(value)
-            except json.JSONDecodeError:
-                new_row[key] = value
-        else:
-            new_row[key] = value
-
-    return new_row
-
-
-def format_json(items: list[Any]):
-    for idx, item in enumerate(items):
-        if isinstance(item, (dict, list)):
-            formatted_item = json.dumps(item)
-            items[idx] = formatted_item
api/backend/worker/constants.py (new file, 17 lines)
@@ -0,0 +1,17 @@
# STL
import os
from pathlib import Path

NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "")
NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "")
SCRAPERR_FRONTEND_URL = os.getenv("SCRAPERR_FRONTEND_URL", "")
EMAIL = os.getenv("EMAIL", "")
TO = os.getenv("TO", "")
SMTP_HOST = os.getenv("SMTP_HOST", "")
SMTP_PORT = int(os.getenv("SMTP_PORT", 587))
SMTP_USER = os.getenv("SMTP_USER", "")
SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
USE_TLS = os.getenv("USE_TLS", "false").lower() == "true"

RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
RECORDINGS_DIR = Path("/project/app/media/recordings")
@@ -1,39 +1,68 @@
-import os
+# STL
 import json
-
-from api.backend.job import get_queued_job, update_job
-from api.backend.scraping import scrape
-from api.backend.models import Element
-from fastapi.encoders import jsonable_encoder
-
 import asyncio
 import traceback
+import subprocess
 
-from api.backend.database.startup import init_database
-
-from api.backend.worker.post_job_complete.post_job_complete import post_job_complete
+# PDM
+from fastapi.encoders import jsonable_encoder
+
+# LOCAL
+from api.backend.job import update_job, get_queued_job
+from api.backend.job.models import Element
 from api.backend.worker.logger import LOG
+from api.backend.ai.agent.agent import scrape_with_agent
+from api.backend.database.startup import init_database
+from api.backend.worker.constants import (
+    TO,
+    EMAIL,
+    USE_TLS,
+    SMTP_HOST,
+    SMTP_PORT,
+    SMTP_USER,
+    SMTP_PASSWORD,
+    RECORDINGS_DIR,
+    RECORDINGS_ENABLED,
+    NOTIFICATION_CHANNEL,
+    SCRAPERR_FRONTEND_URL,
+    NOTIFICATION_WEBHOOK_URL,
+)
+from api.backend.job.scraping.scraping import scrape
+from api.backend.worker.post_job_complete.post_job_complete import post_job_complete
-NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "")
-NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "")
-SCRAPERR_FRONTEND_URL = os.getenv("SCRAPERR_FRONTEND_URL", "")
-EMAIL = os.getenv("EMAIL", "")
-TO = os.getenv("TO", "")
-SMTP_HOST = os.getenv("SMTP_HOST", "")
-SMTP_PORT = int(os.getenv("SMTP_PORT", 587))
-SMTP_USER = os.getenv("SMTP_USER", "")
-SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
-USE_TLS = os.getenv("USE_TLS", "false").lower() == "true"
 
 
 async def process_job():
     job = await get_queued_job()
+    ffmpeg_proc = None
     status = "Queued"
 
     if job:
         LOG.info(f"Beginning processing job: {job}.")
 
         try:
+            output_path = RECORDINGS_DIR / f"{job['id']}.mp4"
+
+            if RECORDINGS_ENABLED:
+                ffmpeg_proc = subprocess.Popen(
+                    [
+                        "ffmpeg",
+                        "-y",
+                        "-video_size",
+                        "1280x1024",
+                        "-framerate",
+                        "15",
+                        "-f",
+                        "x11grab",
+                        "-i",
+                        ":99",
+                        "-codec:v",
+                        "libx264",
+                        "-preset",
+                        "ultrafast",
+                        output_path,
+                    ]
+                )
+
             _ = await update_job([job["id"]], field="status", value="Scraping")
 
             proxies = job["job_options"]["proxies"]
@@ -45,16 +74,16 @@ async def process_job():
                 LOG.error(f"Failed to parse proxy JSON: {proxies}")
                 proxies = []
 
-            scraped = await scrape(
-                job["url"],
-                [Element(**j) for j in job["elements"]],
-                job["job_options"]["custom_headers"],
-                job["job_options"]["multi_page_scrape"],
-                proxies,
-                job["job_options"]["site_map"],
-                job["job_options"]["collect_media"],
-                job["job_options"]["custom_cookies"],
-            )
+            if job["agent_mode"]:
+                scraped = await scrape_with_agent(job)
+            else:
+                scraped = await scrape(
+                    job["id"],
+                    job["url"],
+                    [Element(**j) for j in job["elements"]],
+                    {**job["job_options"], "proxies": proxies},
+                )
             LOG.info(
                 f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
             )
@@ -87,12 +116,18 @@ async def process_job():
                 },
             )
 
+        if ffmpeg_proc:
+            ffmpeg_proc.terminate()
+            ffmpeg_proc.wait()
+
 
 async def main():
     LOG.info("Starting job worker...")
 
     init_database()
 
+    RECORDINGS_DIR.mkdir(parents=True, exist_ok=True)
+
     while True:
         await process_job()
         await asyncio.sleep(5)
@@ -1,12 +1,13 @@
+# STL
 import logging
-import os
 
-from api.backend.utils import get_log_level
+# LOCAL
+from api.backend.app import LOG_LEVEL
 
 logging.basicConfig(
-    level=get_log_level(os.getenv("LOG_LEVEL")),
-    format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
+    level=LOG_LEVEL,
+    format="%(levelname)s: %(asctime)s - [%(name)s] - %(message)s",
     handlers=[logging.StreamHandler()],
 )
 
-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("Job Worker")
@@ -1,5 +1,7 @@
+# STL
 from typing import Any
 
+# LOCAL
 from api.backend.worker.post_job_complete.models import PostJobCompleteOptions
 from api.backend.worker.post_job_complete.email_notifcation import (
     send_job_complete_email,
@@ -16,9 +18,10 @@ async def post_job_complete(job: dict[str, Any], options: PostJobCompleteOptions
     if not options.values():
         return
 
-    if options["channel"] == "discord":
-        discord_notification(job, options)
-    elif options["channel"] == "email":
-        send_job_complete_email(job, options)
-    else:
-        raise ValueError(f"Invalid channel: {options['channel']}")
+    match options["channel"]:
+        case "discord":
+            discord_notification(job, options)
+        case "email":
+            send_job_complete_email(job, options)
+        case _:
+            raise ValueError(f"Invalid channel: {options['channel']}")
cypress/e2e/00-setup.cy.ts (new file, 23 lines)
@@ -0,0 +1,23 @@
describe("Global setup", () => {
  it("signs up user once", () => {
    cy.request({
      method: "POST",
      url: "/api/signup",
      body: JSON.stringify({
        data: {
          email: "test@test.com",
          password: "password",
          full_name: "John Doe",
        },
      }),
      headers: {
        "Content-Type": "application/json",
      },
      failOnStatusCode: false,
    }).then((response) => {
      if (response.status !== 200 && response.status !== 201) {
        console.warn("Signup failed:", response.status, response.body);
      }
    });
  });
});
cypress/e2e/advanced-job-options.cy.ts (new file, 101 lines)
@@ -0,0 +1,101 @@
import { login } from "../utilities/authentication.utils";
import {
  addCustomHeaders,
  addElement,
  addMedia,
  addSiteMapAction,
  checkForMedia,
  cleanUpJobs,
  enterJobUrl,
  openAdvancedJobOptions,
  submitBasicJob,
  submitJob,
  waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockSubmitJob } from "../utilities/mocks";

describe.only("Advanced Job Options", () => {
  beforeEach(() => {
    mockSubmitJob();
    login();
    cy.visit("/");
  });

  afterEach(() => {
    cleanUpJobs();
  });

  it.only("should handle custom headers", () => {
    const customHeaders = {
      "User-Agent": "Test Agent",
      "Accept-Language": "en-US",
    };

    addCustomHeaders(customHeaders);
    submitBasicJob("https://httpbin.org/headers", "headers", "//pre");

    cy.wait("@submitScrapeJob").then((interception) => {
      expect(interception.response?.statusCode).to.eq(200);
      expect(
        interception.request?.body.data.job_options.custom_headers
      ).to.deep.equal(customHeaders);
    });

    waitForJobCompletion("https://httpbin.org/headers");
  });

  it("should handle site map actions", () => {
    addSiteMapAction("click", "//button[contains(text(), 'Load More')]");
    addSiteMapAction("input", "//input[@type='search']", "test search");

    submitBasicJob("https://example.com", "content", "//div[@class='content']");

    cy.wait("@submitScrapeJob").then((interception) => {
      expect(interception.response?.statusCode).to.eq(200);
      const siteMap = interception.request?.body.data.job_options.site_map;
      expect(siteMap.actions).to.have.length(2);
      expect(siteMap.actions[0].type).to.equal("click");
      expect(siteMap.actions[1].type).to.equal("input");
    });

    waitForJobCompletion("https://example.com");
  });

  it("should handle multiple elements", () => {
    enterJobUrl("https://books.toscrape.com");

    addElement("titles", "//h3");
    addElement("prices", "//p[@class='price_color']");

    submitJob();

    cy.wait("@submitScrapeJob").then((interception) => {
      expect(interception.response?.statusCode).to.eq(200);
      expect(interception.request?.body.data.elements).to.have.length(2);
    });

    waitForJobCompletion("https://books.toscrape.com");
  });

  it.only("should handle collecting media", () => {
    enterJobUrl("https://books.toscrape.com");

    openAdvancedJobOptions();
    addMedia();

    cy.get("body").type("{esc}");

    addElement("images", "//img");

    submitJob();

    cy.wait("@submitScrapeJob").then((interception) => {
      expect(interception.response?.statusCode).to.eq(200);
      expect(interception.request?.body.data.job_options.collect_media).to.be
        .true;
    });

    waitForJobCompletion("https://books.toscrape.com");
    checkForMedia();
  });
});
cypress/e2e/agent.cy.ts (new file, 38 lines)
@@ -0,0 +1,38 @@
import { login } from "../utilities/authentication.utils";
import {
  buildAgentJob,
  cleanUpJobs,
  submitJob,
  waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockSubmitJob } from "../utilities/mocks";

describe("Agent", () => {
  beforeEach(() => {
    mockSubmitJob();
    login();
  });

  afterEach(() => {
    cleanUpJobs();
  });

  it("should be able to scrape some data", () => {
    cy.visit("/agent");
    cy.wait(1000);

    const url = "https://books.toscrape.com";
    const prompt = "Collect all the links on the page";
    buildAgentJob(url, prompt);

    submitJob();

    cy.wait("@submitScrapeJob").then((interception) => {
      expect(interception.response?.statusCode).to.eq(200);
      expect(interception.request?.body.data.url).to.eq(url);
      expect(interception.request?.body.data.prompt).to.eq(prompt);
    });

    waitForJobCompletion("https://books.toscrape.com");
  });
});
@@ -1,60 +1,61 @@
-describe("Authentication", () => {
-  it("should register", () => {
-    cy.intercept("POST", "/api/signup").as("signup");
-
-    cy.visit("/").then(() => {
-      cy.get("button").contains("Login").click();
-      cy.url().should("include", "/login");
-
-      cy.get("form").should("be.visible");
-      cy.get("button")
-        .contains("No Account? Sign up")
-        .should("be.visible")
-        .click();
-
-      cy.get("input[name='email']").type("test@test.com");
-      cy.get("input[name='password']").type("password");
-      cy.get("input[name='fullName']").type("John Doe");
-      cy.get("button[type='submit']").contains("Signup").click();
-
-      cy.wait("@signup").then((interception) => {
-        if (!interception.response) {
-          cy.log("No response received!");
-          throw new Error("signup request did not return a response");
-        }
-
-        cy.log("Response status: " + interception.response.statusCode);
-        cy.log("Response body: " + JSON.stringify(interception.response.body));
-
-        expect(interception.response.statusCode).to.eq(200);
-      });
-    });
-  });
-
-  it("should login", () => {
-    cy.intercept("POST", "/api/token").as("token");
-
-    cy.visit("/").then(() => {
-      cy.get("button")
-        .contains("Login")
-        .click()
-        .then(() => {
-          cy.get("input[name='email']").type("test@test.com");
-          cy.get("input[name='password']").type("password");
-          cy.get("button[type='submit']").contains("Login").click();
-
-          cy.wait("@token").then((interception) => {
-            if (!interception.response) {
-              cy.log("No response received!");
-              throw new Error("token request did not return a response");
-            }
-
-            cy.log("Response status: " + interception.response.statusCode);
-            cy.log("Response body: " + JSON.stringify(interception.response.body));
-
-            expect(interception.response.statusCode).to.eq(200);
-          });
-        });
-    });
-  });
-});
+import { faker } from "@faker-js/faker";
+import { mockLogin, mockSignup } from "../utilities/mocks";
+
+const mockEmail = faker.internet.email();
+const mockPassword = faker.internet.password();
+
+describe.only("Authentication", () => {
+  beforeEach(() => {
+    cy.visit("/");
+    mockSignup();
+    mockLogin();
+  });
+
+  it("should register", () => {
+    cy.get("button").contains("Login").click();
+    cy.url().should("include", "/login");
+
+    cy.get("form").should("be.visible");
+
+    cy.get("button")
+      .contains("No Account? Sign up")
+      .should("be.visible")
+      .click();
+
+    cy.get("input[name='email']").type(mockEmail);
+    cy.get("input[name='password']").type(mockPassword);
+    cy.get("input[name='fullName']").type(faker.person.fullName());
+    cy.get("button[type='submit']").contains("Signup").click();
+
+    cy.wait("@signup").then((interception) => {
+      if (!interception.response) {
+        throw new Error("signup request did not return a response");
+      }
+
+      expect(interception.response.statusCode).to.eq(200);
+    });
+  });
+
+  it("should login", () => {
+    cy.intercept("POST", "/api/token").as("token");
+
+    cy.visit("/").then(() => {
+      cy.get("button")
+        .contains("Login")
+        .click()
+        .then(() => {
+          cy.get("input[name='email']").type(mockEmail);
+          cy.get("input[name='password']").type(mockPassword);
+          cy.get("button[type='submit']").contains("Login").click();
+
+          cy.wait("@token").then((interception) => {
+            if (!interception.response) {
+              throw new Error("token request did not return a response");
+            }
+
+            expect(interception.response.statusCode).to.eq(200);
+          });
+        });
+    });
+  });
+});
cypress/e2e/chat.cy.ts (new file, 34 lines)
@@ -0,0 +1,34 @@
import { login } from "../utilities/authentication.utils";
import {
  cleanUpJobs,
  selectJobFromSelector,
  submitBasicJob,
  waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockLogin } from "../utilities/mocks";

describe.only("Chat", () => {
  beforeEach(() => {
    mockLogin();
    login();
    cy.visit("/");
  });

  afterEach(() => {
    cleanUpJobs();
  });

  it.only("should be able to chat", () => {
    const url = "https://books.toscrape.com";
    submitBasicJob(url, "test", "//body");
    waitForJobCompletion(url);

    cy.visit("/chat");
    selectJobFromSelector();

    cy.get("[data-cy='message-input']").type("Hello");
    cy.get("[data-cy='send-message']").click();

    cy.get("[data-cy='ai-message']").should("exist");
  });
});
@@ -1,34 +1,37 @@
+import { login } from "../utilities/authentication.utils";
+import {
+  addElement,
+  cleanUpJobs,
+  enterJobUrl,
+  submitJob,
+  waitForJobCompletion,
+} from "../utilities/job.utilities";
+import { mockSubmitJob } from "../utilities/mocks";
+
 describe.only("Job", () => {
-  it("should create a job", () => {
-    cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
-
+  beforeEach(() => {
+    mockSubmitJob();
+    login();
     cy.visit("/");
+  });
+
+  afterEach(() => {
+    cleanUpJobs();
+  });
 
-    cy.get('[data-cy="url-input"]').type("https://example.com");
-    cy.get('[data-cy="name-field"]').type("example");
-    cy.get('[data-cy="xpath-field"]').type("//body");
-    cy.get('[data-cy="add-button"]').click();
-
-    cy.contains("Submit").click();
+  it("should create a job", () => {
+    enterJobUrl("https://books.toscrape.com");
+    addElement("body", "//body");
+    submitJob();
 
     cy.wait("@submitScrapeJob").then((interception) => {
       if (!interception.response) {
-        cy.log("No response received!");
-        cy.log("Request body: " + JSON.stringify(interception.request?.body));
         throw new Error("submitScrapeJob request did not return a response");
       }
 
-      cy.log("Response status: " + interception.response.statusCode);
-      cy.log("Response body: " + JSON.stringify(interception.response.body));
-
       expect(interception.response.statusCode).to.eq(200);
     });
 
-    cy.get("li").contains("Jobs").click();
-
-    cy.contains("div", "https://example.com", { timeout: 10000 }).should(
-      "exist"
-    );
-    cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
+    waitForJobCompletion("https://books.toscrape.com");
   });
 });
@@ -14,7 +14,7 @@
 // ***********************************************************
 
 // Import commands.js using ES2015 syntax:
-import './commands'
+import "./commands";
 
 // Alternatively you can use CommonJS syntax:
 // require('./commands')
cypress/utilities/authentication.utils.ts (new file, 68 lines)
@@ -0,0 +1,68 @@
export const signup = () => {
  cy.intercept("POST", "/api/token").as("token");

  cy.visit("/").then(() => {
    cy.get("button").contains("Login").click();
    cy.url().should("include", "/login");

    cy.get("form").should("be.visible");
    cy.get("button")
      .contains("No Account? Sign up")
      .should("be.visible")
      .click();

    cy.get("input[name='email']").type("test@test.com");
    cy.get("input[name='password']").type("password");
    cy.get("input[name='fullName']").type("John Doe");
    cy.get("button[type='submit']").contains("Signup").click();

    cy.wait("@token").then((interception) => {
      if (!interception.response) {
        cy.log("No response received!");
        throw new Error("token request did not return a response");
      }
    });
  });
};

export const login = () => {
  cy.intercept("POST", "/api/token").as("token");
  cy.intercept("GET", "/api/me").as("me");
  cy.intercept("GET", "/api/check").as("check");

  cy.visit("/").then(() => {
    cy.get("body").then(() => {
      cy.get("button")
        .contains("Login")
        .click()
        .then(() => {
          cy.get("input[name='email']").type("test@test.com");
          cy.get("input[name='password']").type("password");
          cy.get("button[type='submit']").contains("Login").click();

          cy.wait("@token").then((interception) => {
            if (!interception.response) {
              cy.log("No response received!");
              throw new Error("token request did not return a response");
            }
          });

          cy.wait("@me").then((interception) => {
            if (!interception.response) {
              cy.log("No response received!");
              throw new Error("me request did not return a response");
            }
          });

          cy.wait("@check").then((interception) => {
            if (!interception.response) {
              cy.log("No response received!");
              throw new Error("check request did not return a response");
            }
          });

          cy.url().should("not.include", "/login");
        });
    });
  });
};
cypress/utilities/job.utilities.ts (new file, 187 lines)
@@ -0,0 +1,187 @@
export const cleanUpJobs = () => {
  cy.intercept("POST", "/api/retrieve").as("retrieve");
  cy.visit("/jobs");

  cy.wait("@retrieve", { timeout: 15000 });

  cy.get("tbody tr", { timeout: 20000 }).should("have.length.at.least", 1);

  const tryClickSelectAll = (attempt = 1, maxAttempts = 5) => {
    cy.log(`Attempt ${attempt} to click Select All`);

    cy.get('[data-testid="select-all"]')
      .closest("button")
      .then(($btn) => {
        // Retry if button is disabled
        if ($btn.is(":disabled") || $btn.css("pointer-events") === "none") {
          if (attempt < maxAttempts) {
            cy.wait(1000).then(() =>
              tryClickSelectAll(attempt + 1, maxAttempts)
            );
          } else {
            throw new Error(
              "Select All button is still disabled after max retries"
            );
          }
        } else {
          // Click and then verify if checkbox is checked
          cy.wrap($btn)
            .click({ force: true })
            .then(() => {
              cy.get("tbody tr")
                .first()
                .find("td")
                .first()
                .find("input[type='checkbox']")
                .should("be.checked")
                .then(() => {
                  cy.log("Select All successful");
                });
            });

          // Handle failure case
          cy.on("fail", () => {
            cy.log("Error clicking Select All");
            if (attempt < maxAttempts) {
              cy.wait(1000).then(() =>
                tryClickSelectAll(attempt + 1, maxAttempts)
              );
            } else {
              throw new Error(
                "Checkbox was never checked after clicking Select All"
              );
            }
            return false; // Prevent Cypress from failing the test
          });
        }
      });
  };

  tryClickSelectAll();

  cy.get('[data-testid="DeleteIcon"]', { timeout: 10000 })
    .closest("button")
    .should("not.be.disabled")
    .click();
};

export const submitBasicJob = (url: string, name: string, xpath: string) => {
  cy.get('[data-cy="url-input"]').type(url);
  cy.get('[data-cy="name-field"]').type(name);
  cy.get('[data-cy="xpath-field"]').type(xpath);
  cy.get('[data-cy="add-button"]').click();
  cy.contains("Submit").click();
};

export const waitForJobCompletion = (url: string) => {
  cy.intercept("POST", "/api/retrieve").as("retrieve");

  cy.visit("/jobs");

  cy.wait("@retrieve", { timeout: 30000 });

  cy.contains("div", url, { timeout: 30000 }).should("exist");

  const checkJobStatus = () => {
    cy.get("[data-testid='job-status']", { timeout: 120000 }).then(($el) => {
      const status = $el.text().toLowerCase().trim();

      if (status.includes("completed")) {
        return true;
      } else if (status.includes("scraping") || status.includes("queued")) {
        cy.wait(5000);
        checkJobStatus();
      } else {
        throw new Error(`Unexpected job status: ${status}`);
      }
    });
  };

  checkJobStatus();
};

export const enableMultiPageScraping = () => {
  cy.get("button").contains("Advanced Options").click();
  cy.get('[data-cy="multi-page-toggle"]').click();
  cy.get("body").type("{esc}");
};

export const addCustomHeaders = (headers: Record<string, string>) => {
  cy.get("button").contains("Advanced Options").click();
  cy.get('[name="custom_headers"]').type(JSON.stringify(headers), {
    parseSpecialCharSequences: false,
  });
  cy.get("body").type("{esc}");
};

export const addCustomCookies = (cookies: Record<string, string>) => {
  cy.get("button").contains("Advanced Options").click();
  cy.get('[name="custom_cookies"]').type(JSON.stringify(cookies));
  cy.get("body").type("{esc}");
};

export const openAdvancedJobOptions = () => {
  cy.get("button").contains("Advanced Options").click();
};

export const selectJobFromSelector = () => {
  checkAiDisabled();
  cy.get("div[id='select-job']", { timeout: 10000 }).first().click();
  cy.get("li[role='option']", { timeout: 10000 }).first().click();
};

export const addMedia = () => {
  cy.get('[data-cy="collect-media-checkbox"]').click();
};

export const checkForMedia = () => {
  cy.intercept("GET", "/api/media/get-media?id=**").as("getMedia");

  cy.visit("/media");
  selectJobFromSelector();

  cy.wait("@getMedia", { timeout: 30000 });
};

export const addSiteMapAction = (
  type: "click" | "input",
  xpath: string,
  input?: string
) => {
  cy.get("button").contains("Create Site Map").click();
  cy.get('[data-cy="site-map-select"]').select(type);
  cy.get('[data-cy="site-map-xpath"]').type(xpath);
  if (type === "input" && input) {
    cy.get('[data-cy="site-map-input"]').type(input);
  }
  cy.get('[data-cy="add-site-map-action"]').click();
};

export const addElement = (name: string, xpath: string) => {
  cy.get('[data-cy="name-field"]').type(name);
  cy.get('[data-cy="xpath-field"]').type(xpath);
  cy.get('[data-cy="add-button"]').click();
};

export const checkAiDisabled = () => {
  cy.getAllLocalStorage().then((result) => {
    const storage = JSON.parse(
      result["http://localhost"]["persist:root"] as string
    );
    const settings = JSON.parse(storage.settings);
    expect(settings.aiEnabled).to.equal(true);
  });
};

export const buildAgentJob = (url: string, prompt: string) => {
  checkAiDisabled();
  enterJobUrl(url);
  cy.get("[data-cy='prompt-input']").type(prompt);
};

export const submitJob = () => {
  cy.get("button").contains("Submit").click();
};

export const enterJobUrl = (url: string) => {
  cy.get('[data-cy="url-input"]').type(url);
};
cypress/utilities/mocks.ts (new file, 15 lines)
@@ -0,0 +1,15 @@
export const mockSubmitJob = () => {
  cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
};

export const mockToken = () => {
  cy.intercept("POST", "/api/token").as("token");
};

export const mockSignup = () => {
  cy.intercept("POST", "/api/signup").as("signup");
};

export const mockLogin = () => {
  cy.intercept("POST", "/api/token").as("token");
};

cypress/utilities/utilities.ts (new file, 1 line)
@@ -0,0 +1 @@
export * from "./authentication.utils";
@@ -1,6 +1,9 @@
 version: "3"
 services:
   scraperr:
+    build:
+      context: .
+      dockerfile: docker/frontend/Dockerfile
     command: ["npm", "run", "dev"]
     volumes:
       - "$PWD/src:/app/src"
@@ -10,7 +13,12 @@ services:
       - "$PWD/package-lock.json:/app/package-lock.json"
       - "$PWD/tsconfig.json:/app/tsconfig.json"
   scraperr_api:
+    build:
+      context: .
+      dockerfile: docker/api/Dockerfile
     environment:
       - LOG_LEVEL=INFO
     volumes:
       - "$PWD/api:/project/app/api"
+    ports:
+      - "5900:5900"
@@ -1,11 +1,6 @@
 services:
   scraperr:
-    depends_on:
-      - scraperr_api
-    image: jpyles0524/scraperr:1.0.13
-    build:
-      context: .
-      dockerfile: docker/frontend/Dockerfile
+    image: jpyles0524/scraperr:latest
     container_name: scraperr
     command: ["npm", "run", "start"]
     environment:
@@ -18,11 +13,9 @@ services:
   scraperr_api:
     init: True
     image: jpyles0524/scraperr_api:latest
-    build:
-      context: .
-      dockerfile: docker/api/Dockerfile
     environment:
       - LOG_LEVEL=INFO
+      - OPENAI_KEY=${OPENAI_KEY}
     container_name: scraperr_api
     ports:
       - 8000:8000
@@ -3,7 +3,7 @@ FROM python:3.10.12-slim as pybuilder
 
 RUN apt-get update && \
     apt-get install -y curl && \
-    apt-get install -y uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 && \
+    apt-get install -y x11vnc xvfb uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 ffmpeg && \
     curl -LsSf https://astral.sh/uv/install.sh | sh && \
     apt-get remove -y curl && \
    apt-get autoremove -y && \
@@ -14,7 +14,8 @@ RUN pdm config python.use_venv false
 
 WORKDIR /project/app
 COPY pyproject.toml pdm.lock /project/app/
-RUN pdm install
+RUN pdm install -v --frozen-lockfile
+
 RUN pdm run playwright install --with-deps
 
@@ -30,7 +31,12 @@ EXPOSE 8000
 
 WORKDIR /project/app
 
+RUN mkdir -p /project/app/media
 RUN mkdir -p /project/app/data
 RUN touch /project/app/data/database.db
 
+EXPOSE 5900
+
+COPY start.sh /project/app/start.sh
+
 CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ]
@@ -1,10 +1,14 @@
 # Build next dependencies
-FROM node:23.1
+FROM node:23.1-slim
 WORKDIR /app
 
-COPY package*.json ./
-RUN npm install
+# Copy package files first to leverage Docker cache
+COPY package.json yarn.lock ./
+
+# Install dependencies in a separate layer
+RUN yarn install --frozen-lockfile
+
+# Copy the rest of the application
 COPY tsconfig.json /app/tsconfig.json
 COPY tailwind.config.js /app/tailwind.config.js
 COPY next.config.mjs /app/next.config.mjs
@@ -13,6 +17,7 @@ COPY postcss.config.js /app/postcss.config.js
 COPY public /app/public
 COPY src /app/src
 
-RUN npm run build
+# Build the application
+RUN yarn build
 
 EXPOSE 3000
Binary file not shown (image). Before: 47 KiB, After: 67 KiB.
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.0.14
+version: 1.1.2
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
package-lock.json (generated, 11371 lines) — diff suppressed because it is too large.
package.json (12 changes)
@@ -12,9 +12,11 @@
     "@minchat/react-chat-ui": "^0.16.2",
     "@mui/icons-material": "^5.15.3",
     "@mui/material": "^5.16.0",
+    "@reduxjs/toolkit": "^2.8.2",
     "@testing-library/jest-dom": "^5.16.5",
     "@testing-library/react": "^13.4.0",
     "@testing-library/user-event": "^13.5.0",
+    "@types/react": "^18.3.21",
     "axios": "^1.7.2",
     "bootstrap": "^5.3.0",
     "chart.js": "^4.4.3",
@@ -30,16 +32,19 @@
     "react-dom": "^18.3.1",
     "react-markdown": "^9.0.0",
     "react-modal-image": "^2.6.0",
+    "react-redux": "^9.2.0",
     "react-router": "^6.14.1",
     "react-router-dom": "^6.14.1",
     "react-spinners": "^0.14.1",
+    "react-toastify": "^11.0.5",
+    "redux-persist": "^6.0.0",
     "typescript": "^4.9.5",
     "web-vitals": "^2.1.4"
   },
   "scripts": {
-    "dev": "next dev",
-    "build": "next build",
-    "start": "next start",
+    "dev": "yarn next dev",
+    "build": "yarn next build",
+    "start": "yarn next start",
     "serve": "serve -s ./dist",
     "cy:open": "cypress open",
     "cy:run": "cypress run"
@@ -63,6 +68,7 @@
     ]
   },
   "devDependencies": {
+    "@faker-js/faker": "^9.8.0",
     "@types/cypress": "^1.1.6",
     "@types/js-cookie": "^3.0.6",
     "autoprefixer": "^10.4.21",
pdm.lock (generated, 24 changed lines)
@@ -5,7 +5,7 @@
 groups = ["default", "dev"]
 strategy = ["inherit_metadata"]
 lock_version = "4.5.0"
-content_hash = "sha256:cb37fedd6d022515dde14e475588a8da2144ba22e41dfdfacfe3f7a7d14486ca"
+content_hash = "sha256:1a65c1e288d2c6827fc6866d3bfe6a9b8707b2ca895d488f4a9b11cd579c4359"
 
 [[metadata.targets]]
 requires_python = ">=3.10"
@@ -1174,6 +1174,17 @@ files = [
     {file = "hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca"},
 ]
 
+[[package]]
+name = "html2text"
+version = "2025.4.15"
+requires_python = ">=3.9"
+summary = "Turn HTML into equivalent Markdown-structured text."
+groups = ["default"]
+files = [
+    {file = "html2text-2025.4.15-py3-none-any.whl", hash = "sha256:00569167ffdab3d7767a4cdf589b7f57e777a5ed28d12907d8c58769ec734acc"},
+    {file = "html2text-2025.4.15.tar.gz", hash = "sha256:948a645f8f0bc3abe7fd587019a2197a12436cd73d0d4908af95bfc8da337588"},
+]
+
 [[package]]
 name = "httpcore"
 version = "1.0.9"
@@ -2303,6 +2314,17 @@ files = [
     {file = "propcache-0.3.1.tar.gz", hash = "sha256:40d980c33765359098837527e18eddefc9a24cea5b45e078a7f3bb5b032c6ecf"},
 ]
 
+[[package]]
+name = "proxy-py"
+version = "2.4.10"
+requires_python = ">=3.6"
+summary = "\\u26a1 Fast \\u2022 \\U0001fab6 Lightweight \\u2022 \\U0001f51f Dependency \\u2022 \\U0001f50c Pluggable \\u2022 \\U0001f608 TLS interception \\u2022 \\U0001f512 DNS-over-HTTPS \\u2022 \\U0001f525 Poor Mans VPN \\u2022 \\u23ea Reverse & \\u23e9 Forward \\u2022 \\U0001f46e\\U0001f3ff Proxy Server framework \\u2022 \\U0001f310 Web Server framework \\u2022 \\u27b5 \\u27b6 \\u27b7 \\u27a0 PubSub framework \\u2022 \\U0001f477 Work acceptor & executor framework."
+groups = ["default"]
+files = [
+    {file = "proxy.py-2.4.10-py3-none-any.whl", hash = "sha256:ef3a31f6ef3be6ff78559c0e68198523bfe2fb1e820bb16686750c1bb5baf9e8"},
+    {file = "proxy_py-2.4.10.tar.gz", hash = "sha256:41b9e9d3aae6f80e2304d3726e8e9c583a510d8de224eada53d115f48a63a9ce"},
+]
+
 [[package]]
 name = "ptyprocess"
 version = "0.7.0"
@@ -41,6 +41,8 @@ dependencies = [
     "apscheduler>=3.11.0",
     "playwright>=1.52.0",
     "camoufox>=0.4.11",
+    "html2text>=2025.4.15",
+    "proxy-py>=2.4.10",
 ]
 requires-python = ">=3.10"
 readme = "README.md"
@@ -97,9 +99,9 @@ strictSetInference = true
 
 
 [tool.isort]
-length_sort = "1"
+length_sort = true
 profile = "black"
-sections = "STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER"
+sections = ["STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"]
 import_heading_stdlib = "STL"
 import_heading_thirdparty = "PDM"
 import_heading_firstparty = "LOCAL"
scripts/version.sh (new executable file, 19 lines)
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# ARGS
+VERSION_TYPE=$1 # patch, minor, major
+
+# Get the current version from the Chart.yaml file
+current_version=$(grep -oP 'version:\s*\K[0-9]+\.[0-9]+\.[0-9]+' helm/Chart.yaml | tr -d '[:space:]')
+
+# Increment the version number
+if [ "$VERSION_TYPE" == "patch" ]; then
+  new_version=$(echo $current_version | awk -F. -v OFS=. '{$NF++; print}')
+elif [ "$VERSION_TYPE" == "minor" ]; then
+  new_version=$(echo $current_version | awk -F. -v OFS=. '{$2++; $3=0; print}')
+elif [ "$VERSION_TYPE" == "major" ]; then
+  new_version=$(echo $current_version | awk -F. -v OFS=. '{$1++; $2=0; $3=0; print}')
+fi
+
+# Output the new version
+echo "$new_version"
@@ -1,5 +0,0 @@
-import React from "react";
-
-export const Chat = () => {
-  return <h1>Chat</h1>;
-};
@@ -1,133 +0,0 @@
-import React, { useState, useEffect, Dispatch, useRef } from "react";
-import { Job } from "../../types";
-import { fetchJobs } from "../../lib";
-import Box from "@mui/material/Box";
-import InputLabel from "@mui/material/InputLabel";
-import FormControl from "@mui/material/FormControl";
-import Select from "@mui/material/Select";
-import Popover from "@mui/material/Popover";
-import { Typography, MenuItem, useTheme } from "@mui/material";
-import { SxProps } from "@mui/material";
-
-interface Props {
-  sxProps: SxProps;
-  setSelectedJob: Dispatch<React.SetStateAction<Job | null>>;
-  selectedJob: Job | null;
-  setJobs: Dispatch<React.SetStateAction<Job[]>>;
-  jobs: Job[];
-}
-
-export const JobSelector = ({
-  sxProps,
-  selectedJob,
-  setSelectedJob,
-  setJobs,
-  jobs,
-}: Props) => {
-  const [anchorEl, setAnchorEl] = useState<HTMLElement | null>(null);
-  const [popoverJob, setPopoverJob] = useState<Job | null>(null);
-  const theme = useTheme();
-
-  const handlePopoverOpen = (
-    event: React.MouseEvent<HTMLElement>,
-    job: Job
-  ) => {
-    setAnchorEl(event.currentTarget);
-    setPopoverJob(job);
-  };
-
-  const handlePopoverClose = () => {
-    setAnchorEl(null);
-    setPopoverJob(null);
-  };
-
-  const open = Boolean(anchorEl);
-
-  return (
-    <Box sx={sxProps}>
-      <FormControl fullWidth>
-        {jobs.length ? (
-          <>
-            <InputLabel id="select-job">Job</InputLabel>
-            <Select
-              labelId="select-job"
-              id="select-job"
-              value={selectedJob?.id || ""}
-              label="Job"
-              onChange={(e) => {
-                setSelectedJob(
-                  jobs.find((job) => job.id === e.target.value) || null
-                );
-              }}
-            >
-              {jobs.map((job) => (
-                <MenuItem
-                  key={job.id}
-                  value={job.id}
-                  aria-owns={open ? "mouse-over-popover" : undefined}
-                  aria-haspopup="true"
-                  onMouseEnter={(e) => handlePopoverOpen(e, job)}
-                  onMouseLeave={handlePopoverClose}
-                  onClick={handlePopoverClose}
-                >
-                  {job.id}
-                </MenuItem>
-              ))}
-            </Select>
-          </>
-        ) : null}
-      </FormControl>
-      <Popover
-        id="mouse-over-popover"
-        sx={{
-          pointerEvents: "none",
-          padding: 0,
-        }}
-        open={open}
-        anchorEl={anchorEl}
-        anchorOrigin={{
-          vertical: "bottom",
-          horizontal: "left",
-        }}
-        transformOrigin={{
-          vertical: "top",
-          horizontal: "left",
-        }}
-        onClose={handlePopoverClose}
-      >
-        {popoverJob && (
-          <Box
-            sx={{
-              border:
-                theme.palette.mode === "light"
-                  ? "2px solid black"
-                  : "2px solid white",
-            }}
-          >
-            <Typography
-              variant="body1"
-              sx={{ paddingLeft: 1, paddingRight: 1 }}
-            >
-              {popoverJob.url}
-            </Typography>
-            <div className="flex flex-row w-full justify-end mb-1">
-              <Typography
-                variant="body2"
-                sx={{
-                  paddingLeft: 1,
-                  paddingRight: 1,
-                  color: theme.palette.mode === "dark" ? "#d3d7e6" : "#5b5d63",
-                  fontStyle: "italic",
-                }}
-              >
-                {popoverJob.time_created
-                  ? new Date(popoverJob.time_created).toLocaleString()
-                  : "Unknown"}
-              </Typography>
-            </div>
-          </Box>
-        )}
-      </Popover>
-    </Box>
-  );
-};
@@ -1,2 +0,0 @@
-export * from "./Chat";
-export * from "./JobSelector";
@@ -1,44 +1,51 @@
-import { Box, Link, Typography } from "@mui/material";
-import { SetStateAction, Dispatch, useState } from "react";
-import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog";
 import { RawJobOptions } from "@/types";
+import SettingsIcon from "@mui/icons-material/Settings";
+import { Box, Button, Typography } from "@mui/material";
+import { Dispatch, SetStateAction, useState } from "react";
+import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog";
 
 export type AdvancedJobOptionsProps = {
   jobOptions: RawJobOptions;
   setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
+  multiPageScrapeEnabled?: boolean;
 };
 
 export const AdvancedJobOptions = ({
   jobOptions,
   setJobOptions,
+  multiPageScrapeEnabled = true,
 }: AdvancedJobOptionsProps) => {
   const [open, setOpen] = useState(false);
 
   return (
-    <Box sx={{ mb: 2 }}>
-      <Link
-        component="button"
-        variant="body2"
+    <Box sx={{ display: "flex", alignItems: "center", gap: 1 }}>
+      <Button
+        variant="outlined"
         onClick={() => setOpen(true)}
+        startIcon={<SettingsIcon />}
         sx={{
-          textDecoration: "none",
-          color: "primary.main",
+          textTransform: "none",
+          borderRadius: 2,
+          px: 2,
+          py: 1,
+          borderColor: "divider",
+          color: "text.secondary",
           "&:hover": {
-            color: "primary.dark",
-            textDecoration: "underline",
+            borderColor: "primary.main",
+            color: "primary.main",
+            bgcolor: "action.hover",
          },
-          paddingLeft: 1,
-          display: "inline-flex",
-          alignItems: "center",
-          gap: 0.5,
        }}
      >
-        <Typography variant="body2">Advanced Job Options</Typography>
-      </Link>
+        <Typography variant="body2">Advanced Options</Typography>
+      </Button>
 
       <AdvancedJobOptionsDialog
         open={open}
         onClose={() => setOpen(false)}
         jobOptions={jobOptions}
         setJobOptions={setJobOptions}
+        multiPageScrapeEnabled={multiPageScrapeEnabled}
       />
     </Box>
   );
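For reference, a minimal call-site sketch for the reworked component: the new multiPageScrapeEnabled prop is optional and defaults to true, so existing callers are unaffected. The import path, the wrapper component, and the placeholder initial state below are assumptions for illustration only and are not taken from this diff.

import { useState } from "react";
import { RawJobOptions } from "@/types";
// Assumed import path for the component shown in the hunk above.
import { AdvancedJobOptions } from "@/components/common/advanced-job-options";

export const ExampleJobForm = () => {
  // Placeholder initial value; the real defaults live in the form that owns this state.
  const [jobOptions, setJobOptions] = useState<RawJobOptions>(
    {} as RawJobOptions
  );

  return (
    <AdvancedJobOptions
      jobOptions={jobOptions}
      setJobOptions={setJobOptions}
      // Disables the "Multi Page Scrape" checkbox inside the dialog.
      multiPageScrapeEnabled={false}
    />
  );
};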
@@ -1,3 +1,11 @@
+import { ExpandedTableInput } from "@/components/common/expanded-table-input";
+import { RawJobOptions } from "@/types";
+import {
+  Code as CodeIcon,
+  ExpandMore as ExpandMoreIcon,
+  InfoOutlined,
+  Settings,
+} from "@mui/icons-material";
 import {
   Accordion,
   AccordionDetails,
@@ -17,21 +25,14 @@ import {
   Typography,
   useTheme,
 } from "@mui/material";
-import {
-  ExpandMore as ExpandMoreIcon,
-  InfoOutlined,
-  Code as CodeIcon,
-  Settings,
-} from "@mui/icons-material";
-import { Dispatch, SetStateAction } from "react";
-import { RawJobOptions } from "@/types";
-import { ExpandedTableInput } from "../../expanded-table-input";
+import { Dispatch, SetStateAction, useEffect, useState } from "react";
 
 export type AdvancedJobOptionsDialogProps = {
   open: boolean;
   onClose: () => void;
   jobOptions: RawJobOptions;
   setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
+  multiPageScrapeEnabled?: boolean;
 };
 
 export const AdvancedJobOptionsDialog = ({
@@ -39,33 +40,39 @@ export const AdvancedJobOptionsDialog = ({
   onClose,
   jobOptions,
   setJobOptions,
+  multiPageScrapeEnabled = true,
 }: AdvancedJobOptionsDialogProps) => {
   const theme = useTheme();
-  const handleMultiPageScrapeChange = () => {
-    setJobOptions((prevJobOptions) => ({
+  const [localJobOptions, setLocalJobOptions] =
+    useState<RawJobOptions>(jobOptions);
+
+  useEffect(() => {
+    setLocalJobOptions(jobOptions);
+  }, [jobOptions]);
+
+  const handleCheckboxChange = (key: keyof RawJobOptions) => {
+    setLocalJobOptions((prevJobOptions) => ({
       ...prevJobOptions,
-      multi_page_scrape: !prevJobOptions.multi_page_scrape,
+      [key]: !prevJobOptions[key],
     }));
   };
 
   const handleProxiesChange = (e: React.ChangeEvent<HTMLInputElement>) => {
-    setJobOptions((prevJobOptions) => ({
+    setLocalJobOptions((prevJobOptions) => ({
       ...prevJobOptions,
       proxies: e.target.value,
     }));
   };
 
-  const handleCollectMediaChange = () => {
-    setJobOptions((prevJobOptions) => ({
-      ...prevJobOptions,
-      collect_media: !prevJobOptions.collect_media,
-    }));
+  const handleClose = () => {
+    setJobOptions(localJobOptions);
+    onClose();
   };
 
   return (
     <Dialog
       open={open}
-      onClose={onClose}
+      onClose={handleClose}
       maxWidth="md"
       fullWidth
       PaperProps={{
@@ -120,14 +127,21 @@ export const AdvancedJobOptionsDialog = ({
            <FormControlLabel
              control={
                <Checkbox
-                  checked={jobOptions.multi_page_scrape}
-                  onChange={handleMultiPageScrapeChange}
+                  checked={localJobOptions.multi_page_scrape}
+                  onChange={() => handleCheckboxChange("multi_page_scrape")}
+                  disabled={!multiPageScrapeEnabled}
                />
              }
              label={
                <Box sx={{ display: "flex", alignItems: "center" }}>
                  <Typography>Multi Page Scrape</Typography>
-                  <Tooltip title="Enable crawling through multiple pages">
+                  <Tooltip
+                    title={
+                      multiPageScrapeEnabled
+                        ? "Enable crawling through multiple pages"
+                        : "Multi page scrape is disabled"
+                    }
+                  >
                    <IconButton size="small">
                      <InfoOutlined fontSize="small" />
                    </IconButton>
@@ -135,11 +149,13 @@ export const AdvancedJobOptionsDialog = ({
                </Box>
              }
            />
 
            <FormControlLabel
              control={
                <Checkbox
-                  checked={jobOptions.collect_media}
-                  onChange={handleCollectMediaChange}
+                  checked={localJobOptions.collect_media}
+                  onChange={() => handleCheckboxChange("collect_media")}
+                  data-cy="collect-media-checkbox"
                />
              }
              label={
@@ -153,6 +169,26 @@ export const AdvancedJobOptionsDialog = ({
                </Box>
              }
            />
+
+            <FormControlLabel
+              control={
+                <Checkbox
+                  checked={localJobOptions.return_html}
+                  onChange={() => handleCheckboxChange("return_html")}
+                  data-cy="return-html-checkbox"
+                />
+              }
+              label={
+                <Box sx={{ display: "flex", alignItems: "center" }}>
+                  <Typography>Return HTML</Typography>
+                  <Tooltip title="Return the HTML of the page">
+                    <IconButton size="small">
+                      <InfoOutlined fontSize="small" />
+                    </IconButton>
+                  </Tooltip>
+                </Box>
+              }
+            />
          </FormGroup>
        </Box>
 
@@ -223,7 +259,7 @@ export const AdvancedJobOptionsDialog = ({
              fullWidth
              variant="outlined"
              size="small"
-              value={jobOptions.proxies}
+              value={localJobOptions.proxies}
              onChange={handleProxiesChange}
              InputProps={{
                startAdornment: (
@@ -241,8 +277,9 @@ export const AdvancedJobOptionsDialog = ({
              label="Custom Headers"
              placeholder='{"User-Agent": "CustomAgent", "Accept": "*/*"}'
              urlParam="custom_headers"
+              name="custom_headers"
              onChange={(value) => {
-                setJobOptions((prevJobOptions) => ({
+                setLocalJobOptions((prevJobOptions) => ({
                  ...prevJobOptions,
                  custom_headers: value,
                }));
@@ -254,8 +291,9 @@ export const AdvancedJobOptionsDialog = ({
              label="Custom Cookies"
              placeholder='[{"name": "value", "name2": "value2"}]'
              urlParam="custom_cookies"
+              name="custom_cookies"
              onChange={(value) => {
-                setJobOptions((prevJobOptions) => ({
+                setLocalJobOptions((prevJobOptions) => ({
                  ...prevJobOptions,
                  custom_cookies: value,
                }));
@@ -1,17 +1,17 @@
-import React, { useState } from "react";
 import {
+  alpha,
+  Box,
+  Paper,
   Table,
   TableBody,
   TableCell,
   TableContainer,
   TableHead,
   TableRow,
-  Paper,
-  Box,
   Typography,
   useTheme,
-  alpha,
 } from "@mui/material";
+import React, { useState } from "react";
 
 export type CsvRow = {
   [key: string]: string;
@@ -131,8 +131,9 @@ export const CsvTable: React.FC<CsvTableProps> = ({ csv, className }) => {
              <Typography variant="body2" color="text.secondary">
                {row.text
                  ? row.text
-                      .replace(/(\r\n|\n|\r)/g, " ")
-                      .replace(/\t/g, " ")
+                      .replace(/[\n\t\r]+/g, " ")
+                      .replace(/\s+/g, " ")
+                      .trim()
                  : "No text available"}
              </Typography>
            </Paper>
src/components/common/disabled/disabled.tsx (new file, 30 lines)
@@ -0,0 +1,30 @@
+import { Box } from "@mui/material";
+
+export type DisabledProps = {
+  message: string;
+};
+
+export const Disabled = ({ message }: DisabledProps) => {
+  return (
+    <Box
+      bgcolor="background.default"
+      minHeight="100vh"
+      display="flex"
+      justifyContent="center"
+      alignItems="center"
+      data-testid="disabled-message"
+    >
+      <h4
+        style={{
+          color: "#fff",
+          padding: "20px",
+          borderRadius: "8px",
+          background: "rgba(0, 0, 0, 0.6)",
+          boxShadow: "0 4px 8px rgba(0, 0, 0, 0.2)",
+        }}
+      >
+        {message}
+      </h4>
+    </Box>
+  );
+};
src/components/common/disabled/index.ts (new file, 1 line)
@@ -0,0 +1 @@
+export * from "./disabled";
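A short usage sketch for the new Disabled component. The barrel import path follows the index.ts added above (with "@/" mapping to src/, as in the other imports in this diff); the guard component and the message text are hypothetical.

import { Disabled } from "@/components/common/disabled";

// Hypothetical page-level guard: show the full-screen notice instead of the page content.
export const ExampleGuardedPage = ({ featureEnabled }: { featureEnabled: boolean }) => {
  if (!featureEnabled) {
    return <Disabled message="This feature is currently disabled" />;
  }
  return <div>Feature content goes here</div>;
};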
@@ -1,28 +1,29 @@
+import { parseJsonToEntries } from "@/lib/helpers/parse-json-to-entries";
+import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
 import {
   Accordion,
-  AccordionSummary,
-  TableCell,
-  TableRow,
-  Paper,
-  TableBody,
-  useTheme,
-  TextField,
-  Box,
-  Typography,
   AccordionDetails,
-  TableHead,
-  TableContainer,
+  AccordionSummary,
+  Box,
+  Paper,
   Table,
+  TableBody,
+  TableCell,
+  TableContainer,
+  TableHead,
+  TableRow,
+  TextField,
+  Typography,
+  useTheme,
 } from "@mui/material";
 import { useEffect, useState } from "react";
-import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
-import { parseJsonToEntries } from "@/lib/helpers/parse-json-to-entries";
 
 export type ExpandedTableInputProps = {
   label: string;
   onChange: (value: any) => void;
   placeholder: string;
   urlParam: string;
+  name: string;
 };
 
 export const ExpandedTableInput = ({
@@ -30,6 +31,7 @@ export const ExpandedTableInput = ({
   onChange,
   placeholder,
   urlParam,
+  name,
 }: ExpandedTableInputProps) => {
   const theme = useTheme();
   const [value, setValue] = useState("");
@@ -150,6 +152,7 @@ export const ExpandedTableInput = ({
        size="small"
        error={jsonError !== null}
        helperText={jsonError ?? ""}
+        name={name}
      />
 
      {parsedHeaders && parsedHeaders.length > 0 && (
Some files were not shown because too many files have changed in this diff.