40 Commits

Author SHA1 Message Date
Jayden Pyles
8cd30599fa feat: add in media downloading (#62)
* feat: add in media downloading

* fix: build issue
2025-05-10 15:14:54 -05:00
Jayden Pyles
a58212b214 feat: add authentication test 2025-05-10 14:22:06 -05:00
Jayden Pyles
a6ab6ec71d fix: vulns 2025-05-10 12:04:39 -05:00
Jayden Pyles
c5c9427af4 fix: vulns 2025-05-10 11:49:24 -05:00
Jayden Pyles
e8d80c1a77 fix: add cypress tests to CI [skip ci] 2025-05-10 11:29:20 -05:00
Jayden Pyles
ee8047ac78 fix: add cypress tests to CI [skip ci] 2025-05-10 10:46:05 -05:00
Jayden Pyles
e74c4f392c fix: add cypress tests to CI [skip ci] 2025-05-10 10:41:54 -05:00
Jayden Pyles
6b484952a3 fix: add cypress tests to CI [skip ci] 2025-05-10 10:35:31 -05:00
Jayden Pyles
2283808605 fix: add cypress tests to CI [skip ci] 2025-05-10 10:17:22 -05:00
Jayden Pyles
ee5ada70f7 fix: add cypress tests to CI [skip ci] 2025-05-10 10:04:55 -05:00
Jayden Pyles
56cc457e6e fix: add cypress tests to CI [skip ci] 2025-05-10 09:48:54 -05:00
Jayden Pyles
21a38181de fix: add cypress tests to CI [skip ci] 2025-05-10 09:44:43 -05:00
Jayden Pyles
3063bc0d53 fix: add cypress tests to CI [skip ci] 2025-05-10 09:41:43 -05:00
Jayden Pyles
f42e7ed531 fix: add cypress tests to CI [skip ci] 2025-05-10 09:39:44 -05:00
Jayden Pyles
c197f2becd fix: add cypress tests to CI [skip ci] 2025-05-10 09:38:11 -05:00
Jayden Pyles
a534129702 fix: swap to using chrome driver manager [skip ci] 2025-05-10 09:24:48 -05:00
Jayden Pyles
455ed049c9 fix: allow workflow dispatch [skip ci] 2025-05-10 09:16:41 -05:00
Jayden Pyles
de4ccfbf3a fix: only allow cron on logged in
2025-04-24 22:14:00 -05:00
Jayden Pyles
3475d66995 Add cron jobs (#60)
* feat: finish up cron jobs

* feat: clean up
2025-04-24 22:03:28 -05:00
Jayden Pyles
186b4a0231 Merge branch 'master' of github.com:jaypyles/Scraperr 2025-04-24 22:02:06 -05:00
Jayden Pyles
0af0ebf5b5 feat: fix authentication 2025-04-24 18:24:19 -05:00
c3Nz
ef35db00d7 fix: Python handler Fixed (#51)
* Fix: Python handler Fixed

* fix: Python handler Fixed without comment
2024-11-26 10:05:43 -06:00
Jayden Pyles
d65e600ec3 Merge branch 'master' of github.com:jaypyles/Scraperr
2024-11-21 18:13:18 -06:00
Jayden Pyles
6fe145f649 chore: remove uneeded files [skip ci] 2024-11-21 18:12:46 -06:00
Jayden Pyles
563ca2245e Refactor: Drop MongoDB (#48)
* feat: replace mongodb with sqllite

* feat: update docker compose to drop mongo

* chore: drop logs

* chore: cleanup

* fix: unit tests

* fix: workflow

* fix: workflow run
2024-11-21 18:11:46 -06:00
Jayden Pyles
d54fdbd405 fix: workflow ruin [skip ci] 2024-11-21 18:11:31 -06:00
Jayden Pyles
7169755cd2 fix: workflow 2024-11-21 18:03:40 -06:00
Jayden Pyles
15b56b5704 fix: unit tests 2024-11-21 18:00:57 -06:00
Jayden Pyles
bf6b740005 chore: cleanup 2024-11-21 17:43:20 -06:00
Jayden Pyles
c339e75e06 chore: drop logs 2024-11-21 17:36:47 -06:00
Jayden Pyles
b6ed40e6cf feat: update docker compose to drop mongo 2024-11-21 17:36:22 -06:00
Jayden Pyles
3085f9d31a feat: replace mongodb with sqllite 2024-11-20 21:32:27 -06:00
Jayden Pyles
7d80ff5c7f Feat: Site Mapping (#46)
* wip: add site mapping

* chore: cleanup
2024-11-16 20:55:23 -06:00
Jayden Pyles
3a0762f1e3 fix: headers
2024-11-12 22:07:30 -06:00
Jayden Pyles
dc4d219205 fix: make calls to next server 2024-11-12 21:34:47 -06:00
Jayden Pyles
b3bf780eda Refactor: Remove Proxy Dependency (#44)
2024-11-12 17:30:07 -06:00
Jayden Pyles
1dfd3ca92a Update issue templates [skip ci] 2024-11-10 16:33:29 -06:00
Jayden Pyles
fe51140a0e fix: ci
2024-11-09 21:38:03 -06:00
Jayden Pyles
dd6cec6679 fix: ci 2024-11-09 21:34:41 -06:00
Jayden Pyles
2339ba1b77 fix: ci 2024-11-09 21:33:21 -06:00
111 changed files with 6586 additions and 15127 deletions

.github/ISSUE_TEMPLATE/bug_report.md (new file)

@@ -0,0 +1,32 @@
---
name: Bug report
about: 'Bug reporting '
title: ''
labels: ''
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error
**Expected behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.
**Desktop (please complete the following information):**
- OS: [e.g. iOS]
- Browser [e.g. chrome, safari]
- Version [e.g. 22]
**Additional context**
Add any other context about the problem here.


@@ -0,0 +1,58 @@
name: Run Cypress Tests
description: Run Cypress tests
runs:
using: "composite"
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 22
- name: Setup Docker project
shell: bash
run: make build up-dev
- name: Install dependencies
shell: bash
run: npm install
- name: Wait for frontend to be ready
shell: bash
run: |
for i in {1..10}; do
curl -s http://127.0.0.1:80 && echo "Frontend is ready" && exit 0
echo "Waiting for frontend to be ready... attempt $i"
sleep 1
done
echo "Frontend failed to be ready after 10 retries"
exit 1
- name: Wait for backend to be ready
shell: bash
run: |
for i in {1..10}; do
curl -s http://127.0.0.1:8000 && echo "Backend is ready" && exit 0
echo "Waiting for backend to be ready... attempt $i"
sleep 1
done
echo "Backend failed to be ready after 10 retries"
exit 1
- name: Show backend logs on failure
if: failure()
shell: bash
run: |
echo "== Docker Containers =="
docker ps -a
echo "== Backend Logs =="
docker logs $(docker ps -a --filter "name=scraperr_api" --format "{{.Names}}") || echo "Could not get backend logs"
- name: Run Cypress tests
shell: bash
run: npm run cy:run


@@ -1,12 +1,14 @@
name: ci
requires:
- unit-tests
name: Docker Image
on:
push:
branches: ["master"]
workflow_run:
workflows: ["Unit Tests"]
types:
- completed
workflow_dispatch:
jobs:
build:
if: ${{ github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.head_branch == 'master' }}
runs-on: ubuntu-latest
steps:
- name: Checkout
@@ -36,3 +38,20 @@ jobs:
file: ./docker/api/Dockerfile
push: true
tags: ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest
success-message:
runs-on: ubuntu-latest
needs:
- build
steps:
- name: Send Discord Message
uses: jaypyles/discord-webhook-action@v1.0.0
with:
webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }}
content: "Scraperr Successfully Built Docker Images"
username: "Scraperr CI"
embed-title: "✅ Deployment Status"
embed-description: "Scraperr successfully built docker images."
embed-color: 3066993 # Green
embed-footer-text: "Scraperr CI"
embed-timestamp: ${{ github.event.head_commit.timestamp }}


@@ -4,9 +4,11 @@ on:
push:
branches:
- master
pull_request:
branches:
- master
types: [opened, synchronize, reopened]
workflow_dispatch:
jobs:
unit-tests:
@@ -15,6 +17,9 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
- name: Set env
run: echo "ENV=test" >> $GITHUB_ENV
- name: Install pdm
run: pip install pdm
@@ -23,3 +28,27 @@ jobs:
- name: Run tests
run: PYTHONPATH=. pdm run pytest api/backend/tests
cypress-tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/run-cypress-tests
success-message:
runs-on: ubuntu-latest
needs:
- unit-tests
- cypress-tests
steps:
- name: Send Discord Message
uses: jaypyles/discord-webhook-action@v1.0.0
with:
webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }}
content: "Scraperr Successfully Passed Tests"
username: "Scraperr CI"
embed-title: "✅ Deployment Status"
embed-description: "Scraperr successfully passed all tests."
embed-color: 3066993 # Green
embed-footer-text: "Scraperr CI"
embed-timestamp: ${{ github.event.head_commit.timestamp }}

.gitignore

@@ -187,3 +187,5 @@ cython_debug/
postgres_data
.vscode
ollama
data
media

.python-version (new file)

@@ -0,0 +1 @@
3.10.12

README.md

@@ -13,7 +13,7 @@ Scraperr is a self-hosted web application that allows users to scrape data from
From the table, users can download an Excel sheet of the job's results, along with an option to rerun the job.
View the [docs](https://scraperr-docs.pages.dev).
View the [docs](https://scraperr-docs.pages.dev) for a quickstart guide and more information.
## Features
@@ -64,87 +64,12 @@ View the [docs](https://scraperr-docs.pages.dev).
![chat](https://github.com/jaypyles/www-scrape/blob/master/docs/chat_page.png)
## Installation
1. Clone the repository:
```sh
git clone https://github.com/jaypyles/scraperr.git
```
2. Set environmental variables and labels in `docker-compose.yml`.
```yaml
scraperr:
labels:
- "traefik.enable=true"
- "traefik.http.routers.scraperr.rule=Host(`localhost`)" # change this to your domain, if not running on localhost
- "traefik.http.routers.scraperr.entrypoints=web" # websecure if using https
- "traefik.http.services.scraperr.loadbalancer.server.port=3000"
scraperr_api:
environment:
- LOG_LEVEL=INFO
- MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB
- SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string)
- ALGORITHM=HS256 # authentication encoding algorithm
- ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token expire minutes
labels:
- "traefik.enable=true"
- "traefik.http.routers.scraperr_api.rule=Host(`localhost`) && PathPrefix(`/api`)" # change this to your domain, if not running on localhost
- "traefik.http.routers.scraperr_api.entrypoints=web" # websecure if using https
- "traefik.http.middlewares.api-stripprefix.stripprefix.prefixes=/api"
- "traefik.http.routers.scraperr_api.middlewares=api-stripprefix"
- "traefik.http.services.scraperr_api.loadbalancer.server.port=8000"
mongo:
environment:
MONGO_INITDB_ROOT_USERNAME: root
MONGO_INITDB_ROOT_PASSWORD: example
```
Don't want to use `traefik`? This configuration can be used with other reverse proxies, as long as the API is proxied to `/api` of the frontend container. The app currently cannot run without a reverse proxy, due to limitations around runtime client-side environment variables in `next.js`.
3. Deploy
```sh
make up
```
The app ships with its own `traefik` configuration that can be used independently, but it can just as easily sit behind any other app's reverse proxy, or your own.
## Usage
1. Open the application in your browser at `http://localhost`.
2. Enter the URL you want to scrape in the URL field.
3. Add elements to scrape by specifying a name and the corresponding XPath.
4. Click the "Submit" button to queue the URL to be scraped.
5. View the queue in the "Previous Jobs" section.
## API Endpoints
Use this service as an API for your own projects. Because the API is built with FastAPI, interactive documentation is available at `/docs`.
![docs](https://github.com/jaypyles/www-scrape/blob/master/docs/docs_page.png)
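
For orientation, a minimal sketch of driving that API from a script, assuming the default reverse-proxied setup on `localhost` and the payload shape of the `Job`/`JobOptions` models in this changeset (all field values are illustrative):

```python
# Minimal sketch: submit a scrape job to the Scraperr API.
# Endpoint and payload shape follow the job router in this changeset;
# field values are illustrative and may need adjusting.
import requests

BASE = "http://localhost/api"

job = {
    "url": "https://example.com",
    "elements": [{"name": "body", "xpath": "//body"}],
    "user": "test@test.com",
    "time_created": "2025-05-10T00:00:00",
    "result": {},
    "job_options": {"multi_page_scrape": False, "custom_headers": {}},
}

resp = requests.post(f"{BASE}/submit-scrape-job", json=job, timeout=30)
resp.raise_for_status()
print(resp.json())  # {"id": "<generated hex id>"}
```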
## AI
Currently supports either an Ollama instance or OpenAI's ChatGPT, using your own API key. Setup is as simple as setting either the Ollama URL or the OpenAI API key in the API's environment variables in the `docker-compose.yml` file:
```yaml
scraperr_api:
environment:
- OLLAMA_URL=http://ollama:11434
- OLLAMA_MODEL=llama3.1
# or
- OPENAI_KEY=<your_key>
- OPENAI_MODEL=gpt3.5-turbo
```
The model names are taken from the documentation of their respective providers.
## Troubleshooting
Q: When running Scraperr, I'm met with "404 Page not found".


@@ -1,3 +0,0 @@
github_repo: https://github.com/jaypyles/webapp-template.git
deploy_path: /home/admin/site-test6
deploy_command: make pull up-prd


@@ -1,10 +0,0 @@
- name: Deploy site
hosts: all
become: true
vars_files:
- ./config.yaml
tasks:
- name: Deploy
command: "{{deploy_command}}"
args:
chdir: "{{deploy_path}}"


@@ -1,6 +0,0 @@
all:
hosts:
host1:
ansible_host: 192.168.0.1
ansible_user: admin
ansible_ssh_private_key_file: private_key.pem


@@ -1,54 +0,0 @@
- name: Install Docker and run make pull up
hosts: all
become: true
vars_files:
- ./config.yaml
tasks:
- name: Update apt cache
apt:
update_cache: yes
- name: Install required packages
apt:
name:
- apt-transport-https
- ca-certificates
- curl
- gnupg-agent
- software-properties-common
- rsync
- make
state: present
- name: Add Dockers official GPG key
apt_key:
url: https://download.docker.com/linux/ubuntu/gpg
state: present
- name: Add Docker APT repository
apt_repository:
repo: deb [arch=amd64] https://download.docker.com/linux/ubuntu focal stable
state: present
- name: Update apt cache again after adding Docker repo
apt:
update_cache: yes
- name: Install Docker
apt:
name: docker-ce
state: present
- name: Start and enable Docker service
systemd:
name: docker
enabled: yes
state: started
- name: Install Docker Compose
apt:
name: docker-compose-plugin
state: present
- name: Verify Docker is installed
command: docker --version
register: docker_version
- name: Display Docker version
debug:
msg: "Docker version: {{ docker_version.stdout }}"
- name: Clone repo
ansible.builtin.git:
repo: "{{github_repo}}"
dest: "{{deploy_path}}"


@@ -1,9 +1,13 @@
# STL
import os
import logging
import apscheduler # type: ignore
# PDM
from fastapi import FastAPI
import apscheduler.schedulers
import apscheduler.schedulers.background
from fastapi import FastAPI, Request, status
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
# LOCAL
@@ -13,6 +17,11 @@ from api.backend.utils import get_log_level
from api.backend.routers.job_router import job_router
from api.backend.routers.log_router import log_router
from api.backend.routers.stats_router import stats_router
from api.backend.database.startup import init_database
from fastapi.responses import JSONResponse
from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler
from api.backend.scheduler import scheduler
log_level = os.getenv("LOG_LEVEL")
LOG_LEVEL = get_log_level(log_level)
@@ -25,7 +34,7 @@ logging.basicConfig(
LOG = logging.getLogger(__name__)
app = FastAPI(title="api")
app = FastAPI(title="api", root_path="/api")
app.add_middleware(
CORSMiddleware,
@@ -41,3 +50,28 @@ app.include_router(ai_router)
app.include_router(job_router)
app.include_router(log_router)
app.include_router(stats_router)
@app.on_event("startup")
async def startup_event():
start_cron_scheduler(scheduler)
scheduler.start()
if os.getenv("ENV") != "test":
init_database()
LOG.info("Starting up...")
@app.on_event("shutdown")
def shutdown_scheduler():
scheduler.shutdown(wait=False) # Set wait=False to not block shutdown
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
exc_str = f"{exc}".replace("\n", " ").replace("   ", " ")
logging.error(f"{request}: {exc_str}")
content = {"status_code": 10422, "message": exc_str, "data": None}
return JSONResponse(
content=content, status_code=status.HTTP_422_UNPROCESSABLE_ENTITY
)


@@ -7,7 +7,6 @@ from fastapi.security import OAuth2PasswordRequestForm
# LOCAL
from api.backend.schemas import User, Token, UserCreate
from api.backend.database import get_user_collection
from api.backend.auth.auth_utils import (
ACCESS_TOKEN_EXPIRE_MINUTES,
get_current_user,
@@ -15,9 +14,14 @@ from api.backend.auth.auth_utils import (
get_password_hash,
create_access_token,
)
import logging
from api.backend.database.common import update
auth_router = APIRouter()
LOG = logging.getLogger("auth_router")
@auth_router.post("/auth/token", response_model=Token)
async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
@@ -43,12 +47,14 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(
@auth_router.post("/auth/signup", response_model=User)
async def create_user(user: UserCreate):
users_collection = get_user_collection()
hashed_password = get_password_hash(user.password)
user_dict = user.model_dump()
user_dict["hashed_password"] = hashed_password
del user_dict["password"]
_ = await users_collection.insert_one(user_dict)
query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)"
_ = update(query, (user_dict["email"], hashed_password, user_dict["full_name"]))
return user_dict


@@ -1,7 +1,5 @@
# STL
import os
from gc import disable
from queue import Empty
from typing import Any, Optional
from datetime import datetime, timedelta
import logging
@@ -15,7 +13,8 @@ from fastapi.security import OAuth2PasswordBearer
# LOCAL
from api.backend.schemas import User, UserInDB, TokenData
from api.backend.database import get_user_collection
from api.backend.database.common import query
LOG = logging.getLogger(__name__)
@@ -40,8 +39,8 @@ def get_password_hash(password: str):
async def get_user(email: str):
user_collection = get_user_collection()
user = await user_collection.find_one({"email": email})
user_query = "SELECT * FROM users WHERE email = ?"
user = query(user_query, (email,))[0]
if not user:
return
@@ -77,27 +76,42 @@ def create_access_token(
async def get_current_user(token: str = Depends(oauth2_scheme)):
LOG.info(f"Getting current user with token: {token}")
LOG.debug(f"Getting current user with token: {token}")
if not token:
LOG.debug("No token provided")
return EMPTY_USER
if len(token.split(".")) != 3:
LOG.error(f"Malformed token: {token}")
return EMPTY_USER
try:
LOG.debug(
f"Decoding token: {token} with secret key: {SECRET_KEY} and algorithm: {ALGORITHM}"
)
if token.startswith("Bearer "):
token = token.split(" ")[1]
payload: Optional[dict[str, Any]] = jwt.decode(
token, SECRET_KEY, algorithms=[ALGORITHM]
)
if not payload:
LOG.error("No payload found in token")
return EMPTY_USER
email = payload.get("sub")
if email is None:
LOG.error("No email found in payload")
return EMPTY_USER
token_data = TokenData(email=email)
except JWTError:
except JWTError as e:
LOG.error(f"JWTError occurred: {e}")
return EMPTY_USER
except Exception as e:
@@ -105,7 +119,6 @@ async def get_current_user(token: str = Depends(oauth2_scheme)):
return EMPTY_USER
user = await get_user(email=token_data.email)
if user is None:
return EMPTY_USER
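
A minimal sketch of the token round trip these helpers implement, using the same `python-jose` calls seen above (the secret here is a stand-in for the `SECRET_KEY` environment variable):

```python
# Sketch of the JWT round trip behind create_access_token / get_current_user.
# SECRET_KEY and ALGORITHM stand in for the values read from the environment.
from jose import jwt

SECRET_KEY = "your_secret_key"
ALGORITHM = "HS256"

token = jwt.encode({"sub": "test@test.com"}, SECRET_KEY, algorithm=ALGORITHM)
payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
assert payload["sub"] == "test@test.com"  # the email get_current_user looks up
```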

api/backend/constants.py (new file)

@@ -0,0 +1 @@
DATABASE_PATH = "data/database.db"


@@ -1,23 +0,0 @@
# STL
import os
from typing import Any
# PDM
from dotenv import load_dotenv
from motor.motor_asyncio import AsyncIOMotorClient
_ = load_dotenv()
MONGODB_URI = os.getenv("MONGODB_URI")
def get_user_collection():
client: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(MONGODB_URI)
db = client["scrape"]
return db["users"]
def get_job_collection():
client: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(MONGODB_URI)
db = client["scrape"]
return db["jobs"]


@@ -0,0 +1,3 @@
from .common import insert, QUERIES, update
__all__ = ["insert", "QUERIES", "update"]


@@ -0,0 +1,92 @@
import sqlite3
from typing import Any, Optional
from api.backend.constants import DATABASE_PATH
from api.backend.utils import format_json, format_sql_row_to_python
from api.backend.database.schema import INIT_QUERY
from api.backend.database.queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
import logging
LOG = logging.getLogger(__name__)
def connect():
connection = sqlite3.connect(DATABASE_PATH)
connection.set_trace_callback(print)
cursor = connection.cursor()
return cursor
def insert(query: str, values: tuple[Any, ...]):
connection = sqlite3.connect(DATABASE_PATH)
cursor = connection.cursor()
copy = list(values)
format_json(copy)
try:
_ = cursor.execute(query, copy)
connection.commit()
except sqlite3.Error as e:
LOG.error(f"An error occurred: {e}")
finally:
cursor.close()
connection.close()
def query(query: str, values: Optional[tuple[Any, ...]] = None):
connection = sqlite3.connect(DATABASE_PATH)
connection.row_factory = sqlite3.Row
cursor = connection.cursor()
rows = []
try:
if values:
_ = cursor.execute(query, values)
else:
_ = cursor.execute(query)
rows = cursor.fetchall()
finally:
cursor.close()
connection.close()
formatted_rows: list[dict[str, Any]] = []
for row in rows:
row = dict(row)
formatted_row = format_sql_row_to_python(row)
formatted_rows.append(formatted_row)
return formatted_rows
def update(query: str, values: Optional[tuple[Any, ...]] = None):
connection = sqlite3.connect(DATABASE_PATH)
cursor = connection.cursor()
copy = None
if values:
copy = list(values)
format_json(copy)
try:
if copy:
res = cursor.execute(query, copy)
else:
res = cursor.execute(query)
connection.commit()
return res.rowcount
except sqlite3.Error as e:
LOG.error(f"An error occurred: {e}")
finally:
cursor.close()
connection.close()
return 0
QUERIES = {
"init": INIT_QUERY,
"insert_job": JOB_INSERT_QUERY,
"delete_job": DELETE_JOB_QUERY,
}
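
Taken together, these helpers replace the old Motor collection calls. A rough usage sketch (the `INSERT` mirrors the signup query in the auth router above; values are illustrative):

```python
# Hypothetical usage of the sqlite helpers above (not part of the diff).
from api.backend.database.common import insert, query, update

insert(
    "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)",
    ("test@test.com", "<bcrypt hash>", "John Doe"),
)

# query() returns a list of dicts, with JSON columns decoded by
# format_sql_row_to_python.
rows = query("SELECT * FROM users WHERE email = ?", ("test@test.com",))

# update() returns the affected rowcount.
count = update("UPDATE users SET disabled = ? WHERE email = ?", (0, "test@test.com"))
```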


@@ -0,0 +1,3 @@
from .queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
__all__ = ["JOB_INSERT_QUERY", "DELETE_JOB_QUERY"]


@@ -0,0 +1,9 @@
JOB_INSERT_QUERY = """
INSERT INTO jobs
(id, url, elements, user, time_created, result, status, chat, job_options)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
DELETE_JOB_QUERY = """
DELETE FROM jobs WHERE id IN ()
"""


@@ -0,0 +1,3 @@
from .schema import INIT_QUERY
__all__ = ["INIT_QUERY"]


@@ -0,0 +1,30 @@
INIT_QUERY = """
CREATE TABLE IF NOT EXISTS jobs (
id STRING PRIMARY KEY NOT NULL,
url STRING NOT NULL,
elements JSON NOT NULL,
user STRING,
time_created DATETIME NOT NULL,
result JSON NOT NULL,
status STRING NOT NULL,
chat JSON,
job_options JSON
);
CREATE TABLE IF NOT EXISTS users (
email STRING PRIMARY KEY NOT NULL,
hashed_password STRING NOT NULL,
full_name STRING,
disabled BOOLEAN
);
CREATE TABLE IF NOT EXISTS cron_jobs (
id STRING PRIMARY KEY NOT NULL,
user_email STRING NOT NULL,
job_id STRING NOT NULL,
cron_expression STRING NOT NULL,
time_created DATETIME NOT NULL,
time_updated DATETIME NOT NULL,
FOREIGN KEY (job_id) REFERENCES jobs(id)
);
"""


@@ -0,0 +1,15 @@
from api.backend.database.common import connect, QUERIES
import logging
LOG = logging.getLogger(__name__)
def init_database():
cursor = connect()
for query in QUERIES["init"].strip().split(";"):
if query.strip():
LOG.info(f"Executing query: {query}")
_ = cursor.execute(query)
cursor.close()


@@ -1,119 +0,0 @@
# STL
import logging
from typing import Any, Optional
# PDM
from pymongo import DESCENDING
# LOCAL
from api.backend.models import FetchOptions
from api.backend.database import get_job_collection
LOG = logging.getLogger(__name__)
async def insert(item: dict[str, Any]) -> None:
collection = get_job_collection()
i = await collection.insert_one(item)
LOG.info(f"Inserted item: {i}")
async def get_queued_job():
collection = get_job_collection()
return await collection.find_one(
{"status": "Queued"}, sort=[("created_at", DESCENDING)]
)
async def query(
filter: dict[str, Any], fetch_options: Optional[FetchOptions] = None
) -> list[dict[str, Any]]:
collection = get_job_collection()
cursor = collection.find(filter)
results: list[dict[str, Any]] = []
async for document in cursor:
del document["_id"]
if fetch_options and not fetch_options.chat and document.get("chat"):
del document["chat"]
results.append(document)
return results
async def update_job(ids: list[str], field: str, value: Any):
collection = get_job_collection()
for id in ids:
_ = await collection.update_one(
{"id": id},
{"$set": {field: value}},
)
async def delete_jobs(jobs: list[str]):
collection = get_job_collection()
result = await collection.delete_many({"id": {"$in": jobs}})
LOG.info(f"{result.deleted_count} documents deleted")
return True if result.deleted_count > 0 else False
async def average_elements_per_link(user: str):
collection = get_job_collection()
pipeline = [
{"$match": {"status": "Completed", "user": user}},
{
"$project": {
"date": {
"$dateToString": {"format": "%Y-%m-%d", "date": "$time_created"}
},
"num_elements": {"$size": "$elements"},
}
},
{
"$group": {
"_id": "$date",
"average_elements": {"$avg": "$num_elements"},
"count": {"$sum": 1},
}
},
{"$sort": {"_id": 1}},
]
cursor = collection.aggregate(pipeline)
results: list[dict[str, Any]] = []
async for document in cursor:
results.append(
{
"date": document["_id"],
"average_elements": document["average_elements"],
"count": document["count"],
}
)
return results
async def get_jobs_per_day(user: str):
collection = get_job_collection()
pipeline = [
{"$match": {"status": "Completed", "user": user}},
{
"$project": {
"date": {
"$dateToString": {"format": "%Y-%m-%d", "date": "$time_created"}
}
}
},
{"$group": {"_id": "$date", "job_count": {"$sum": 1}}},
{"$sort": {"_id": 1}},
]
cursor = collection.aggregate(pipeline)
results: list[dict[str, Any]] = []
async for document in cursor:
results.append({"date": document["_id"], "job_count": document["job_count"]})
return results


@@ -0,0 +1,17 @@
from .job import (
insert,
update_job,
delete_jobs,
get_jobs_per_day,
get_queued_job,
average_elements_per_link,
)
__all__ = [
"insert",
"update_job",
"delete_jobs",
"get_jobs_per_day",
"get_queued_job",
"average_elements_per_link",
]


@@ -0,0 +1,100 @@
import datetime
from typing import Any
import uuid
from api.backend.database.common import insert, query
from api.backend.models import CronJob
from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
from apscheduler.triggers.cron import CronTrigger # type: ignore
from api.backend.job import insert as insert_job
import logging
LOG = logging.getLogger("Cron Scheduler")
def insert_cron_job(cron_job: CronJob):
query = """
INSERT INTO cron_jobs (id, user_email, job_id, cron_expression, time_created, time_updated)
VALUES (?, ?, ?, ?, ?, ?)
"""
values = (
cron_job.id,
cron_job.user_email,
cron_job.job_id,
cron_job.cron_expression,
cron_job.time_created,
cron_job.time_updated,
)
insert(query, values)
return True
def delete_cron_job(id: str, user_email: str):
query = """
DELETE FROM cron_jobs
WHERE id = ? AND user_email = ?
"""
values = (id, user_email)
insert(query, values)
return True
def get_cron_jobs(user_email: str):
cron_jobs = query("SELECT * FROM cron_jobs WHERE user_email = ?", (user_email,))
return cron_jobs
def get_all_cron_jobs():
cron_jobs = query("SELECT * FROM cron_jobs")
return cron_jobs
def insert_job_from_cron_job(job: dict[str, Any]):
insert_job(
{
**job,
"id": uuid.uuid4().hex,
"status": "Queued",
"result": "",
"chat": None,
"time_created": datetime.datetime.now(),
"time_updated": datetime.datetime.now(),
}
)
def get_cron_job_trigger(cron_expression: str):
expression_parts = cron_expression.split()
if len(expression_parts) != 5:
print(f"Invalid cron expression: {cron_expression}")
return None
minute, hour, day, month, day_of_week = expression_parts
return CronTrigger(
minute=minute, hour=hour, day=day, month=month, day_of_week=day_of_week
)
def start_cron_scheduler(scheduler: BackgroundScheduler):
cron_jobs = get_all_cron_jobs()
LOG.info(f"Cron jobs: {cron_jobs}")
for job in cron_jobs:
queried_job = query("SELECT * FROM jobs WHERE id = ?", (job["job_id"],))
LOG.info(f"Adding job: {queried_job}")
scheduler.add_job(
insert_job_from_cron_job,
get_cron_job_trigger(job["cron_expression"]),
id=job["id"],
args=[queried_job[0]],
)
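
As a quick illustration of the five-field expression `get_cron_job_trigger` expects, a sketch (not part of the diff):

```python
# Sketch: a standard five-field cron expression mapped onto APScheduler's
# CronTrigger, mirroring get_cron_job_trigger above.
from apscheduler.triggers.cron import CronTrigger

minute, hour, day, month, day_of_week = "30 6 * * 1".split()  # 06:30 every Monday
trigger = CronTrigger(
    minute=minute, hour=hour, day=day, month=month, day_of_week=day_of_week
)
print(trigger)  # prints the parsed cron fields
```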

api/backend/job/job.py (new file)

@@ -0,0 +1,97 @@
# STL
import logging
from typing import Any
# LOCAL
from api.backend.utils import format_list_for_query
from api.backend.database.common import (
insert as common_insert,
query as common_query,
QUERIES,
update as common_update,
)
LOG = logging.getLogger(__name__)
def insert(item: dict[str, Any]) -> None:
common_insert(
QUERIES["insert_job"],
(
item["id"],
item["url"],
item["elements"],
item["user"],
item["time_created"],
item["result"],
item["status"],
item["chat"],
item["job_options"],
),
)
LOG.info(f"Inserted item: {item}")
async def get_queued_job():
query = (
"SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
)
res = common_query(query)
LOG.info(f"Got queued job: {res}")
return res[0] if res else None
async def update_job(ids: list[str], field: str, value: Any):
query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
res = common_update(query, tuple([value] + ids))
LOG.info(f"Updated job: {res}")
async def delete_jobs(jobs: list[str]):
if not jobs:
LOG.info("No jobs to delete.")
return False
query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}"
res = common_update(query, tuple(jobs))
return res > 0
async def average_elements_per_link(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
AVG(json_array_length(elements)) AS average_elements,
COUNT(*) AS count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = common_query(job_query, (user,))
return results
async def get_jobs_per_day(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
COUNT(*) AS job_count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = common_query(job_query, (user,))
return results


@@ -0,0 +1,3 @@
from .job_options import JobOptions
__all__ = ["JobOptions"]


@@ -0,0 +1,15 @@
from pydantic import BaseModel
from typing import Any, Optional
from api.backend.job.models.site_map import SiteMap
class FetchOptions(BaseModel):
chat: Optional[bool] = None
class JobOptions(BaseModel):
multi_page_scrape: bool = False
custom_headers: dict[str, Any] = {}
proxies: list[str] = []
site_map: Optional[SiteMap] = None
collect_media: bool = False


@@ -0,0 +1,14 @@
from pydantic import BaseModel
from typing import Literal
class Action(BaseModel):
type: Literal["click", "input"]
xpath: str
name: str
input: str = ""
do_once: bool = True
class SiteMap(BaseModel):
actions: list[Action]
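
A site map built from these models might look like the following sketch (the xpath and name values are made up for illustration):

```python
# Sketch: a SiteMap that clicks a "Load more" button once before scraping.
from api.backend.job.models.site_map import Action, SiteMap

site_map = SiteMap(
    actions=[
        Action(type="click", xpath="//button[text()='Load more']", name="load-more"),
    ]
)
print(site_map.model_dump())  # input defaults to "", do_once to True
```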


@@ -0,0 +1,91 @@
import os
import requests
from pathlib import Path
from selenium.webdriver.common.by import By
from seleniumwire import webdriver
from urllib.parse import urlparse
from api.backend.utils import LOG
def collect_media(driver: webdriver.Chrome):
media_types = {
"images": "img",
"videos": "video",
"audio": "audio",
"pdfs": 'a[href$=".pdf"]',
"documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]',
"presentations": 'a[href$=".ppt"], a[href$=".pptx"]',
"spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]',
}
base_dir = Path("media")
base_dir.mkdir(exist_ok=True)
media_urls = {}
for media_type, selector in media_types.items():
elements = driver.find_elements(By.CSS_SELECTOR, selector)
urls: list[dict[str, str]] = []
media_dir = base_dir / media_type
media_dir.mkdir(exist_ok=True)
for element in elements:
if media_type == "images":
url = element.get_attribute("src")
elif media_type == "videos":
url = element.get_attribute("src") or element.get_attribute("data-src")
else:
url = element.get_attribute("href")
if url and url.startswith(("http://", "https://")):
try:
filename = os.path.basename(urlparse(url).path)
if not filename:
filename = f"{media_type}_{len(urls)}"
if media_type == "images":
filename += ".jpg"
elif media_type == "videos":
filename += ".mp4"
elif media_type == "audio":
filename += ".mp3"
elif media_type == "pdfs":
filename += ".pdf"
elif media_type == "documents":
filename += ".doc"
elif media_type == "presentations":
filename += ".ppt"
elif media_type == "spreadsheets":
filename += ".xls"
response = requests.get(url, stream=True)
response.raise_for_status()
# Save the file
file_path = media_dir / filename
with open(file_path, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
urls.append({"url": url, "local_path": str(file_path)})
LOG.info(f"Downloaded {filename} to {file_path}")
except Exception as e:
LOG.error(f"Error downloading {url}: {str(e)}")
continue
media_urls[media_type] = urls
with open(base_dir / "download_summary.txt", "w") as f:
for media_type, downloads in media_urls.items():
if downloads:
f.write(f"\n=== {media_type.upper()} ===\n")
for download in downloads:
f.write(f"URL: {download['url']}\n")
f.write(f"Saved to: {download['local_path']}\n\n")
return media_urls


@@ -0,0 +1,41 @@
import time
from typing import cast
from seleniumwire import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from api.backend.utils import LOG
from api.backend.job.scraping.collect_media import collect_media as collect_media_utils
def scrape_content(
driver: webdriver.Chrome, pages: set[tuple[str, str]], collect_media: bool
):
_ = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "body"))
)
last_height = cast(str, driver.execute_script("return document.body.scrollHeight"))
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(3) # Wait for the page to load
new_height = cast(
str, driver.execute_script("return document.body.scrollHeight")
)
if new_height == last_height:
break
last_height = new_height
pages.add((driver.page_source, driver.current_url))
if collect_media:
LOG.info("Collecting media")
collect_media_utils(driver)
return driver.page_source


@@ -0,0 +1,93 @@
from api.backend.job.models.site_map import Action, SiteMap
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from typing import Any
import logging
import time
from copy import deepcopy
from api.backend.job.scraping.scraping_utils import scrape_content
from selenium.webdriver.support.ui import WebDriverWait
from seleniumwire.inspect import TimeoutException
from seleniumwire.webdriver import Chrome
from selenium.webdriver.support import expected_conditions as EC
LOG = logging.getLogger(__name__)
def clear_done_actions(site_map: dict[str, Any]):
"""Clear all actions that have been clicked."""
cleared_site_map = deepcopy(site_map)
cleared_site_map["actions"] = [
action for action in cleared_site_map["actions"] if not action["do_once"]
]
return cleared_site_map
def handle_input(action: Action, driver: webdriver.Chrome):
try:
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.XPATH, action.xpath))
)
LOG.info(f"Sending keys: {action.input} to element: {element}")
element.send_keys(action.input)
except NoSuchElementException:
LOG.info(f"Element not found: {action.xpath}")
return False
except TimeoutException:
LOG.info(f"Timeout waiting for element: {action.xpath}")
return False
except Exception as e:
LOG.info(f"Error handling input: {e}")
return False
return True
def handle_click(action: Action, driver: webdriver.Chrome):
try:
element = driver.find_element(By.XPATH, action.xpath)
LOG.info(f"Clicking element: {element}")
element.click()
except NoSuchElementException:
LOG.info(f"Element not found: {action.xpath}")
return False
return True
ACTION_MAP = {
"click": handle_click,
"input": handle_input,
}
async def handle_site_mapping(
site_map_dict: dict[str, Any],
driver: Chrome,
pages: set[tuple[str, str]],
):
site_map = SiteMap(**site_map_dict)
for action in site_map.actions:
action_handler = ACTION_MAP[action.type]
if not action_handler(action, driver):
return
time.sleep(2)
_ = scrape_content(driver, pages)
cleared_site_map_dict = clear_done_actions(site_map_dict)
if cleared_site_map_dict["actions"]:
await handle_site_mapping(cleared_site_map_dict, driver, pages)


@@ -2,14 +2,13 @@
from typing import Any, Optional, Union
from datetime import datetime
# LOCAL
from api.backend.job.models.job_options import JobOptions
# PDM
import pydantic
class FetchOptions(pydantic.BaseModel):
chat: Optional[bool] = None
class Element(pydantic.BaseModel):
name: str
xpath: str
@@ -22,12 +21,6 @@ class CapturedElement(pydantic.BaseModel):
name: str
class JobOptions(pydantic.BaseModel):
multi_page_scrape: bool = False
custom_headers: Optional[dict[str, Any]] = {}
proxies: Optional[list[str]] = []
class RetrieveScrapeJobs(pydantic.BaseModel):
user: str
@@ -64,3 +57,17 @@ class Job(pydantic.BaseModel):
job_options: JobOptions
status: str = "Queued"
chat: Optional[str] = None
class CronJob(pydantic.BaseModel):
id: Optional[str] = None
user_email: str
job_id: str
cron_expression: str
time_created: Optional[Union[datetime, str]] = None
time_updated: Optional[Union[datetime, str]] = None
class DeleteCronJob(pydantic.BaseModel):
id: str
user_email: str


@@ -1,4 +1,5 @@
# STL
import datetime
import uuid
import traceback
from io import StringIO
@@ -10,24 +11,33 @@ import random
from fastapi import Depends, APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse, StreamingResponse
from api.backend.scheduler import scheduler
from apscheduler.triggers.cron import CronTrigger # type: ignore
# LOCAL
from api.backend.job import (
query,
insert,
update_job,
delete_jobs,
)
from api.backend.job import insert, update_job, delete_jobs
from api.backend.models import (
DeleteCronJob,
UpdateJobs,
DownloadJob,
FetchOptions,
DeleteScrapeJobs,
Job,
CronJob,
)
from api.backend.schemas import User
from api.backend.auth.auth_utils import get_current_user
from api.backend.utils import clean_text
from api.backend.utils import clean_text, format_list_for_query
from api.backend.job.models.job_options import FetchOptions
from api.backend.database.common import query
from api.backend.job.cron_scheduling.cron_scheduling import (
delete_cron_job,
get_cron_job_trigger,
insert_cron_job,
get_cron_jobs,
insert_job_from_cron_job,
)
LOG = logging.getLogger(__name__)
@@ -47,10 +57,11 @@ async def submit_scrape_job(job: Job):
job.id = uuid.uuid4().hex
job_dict = job.model_dump()
await insert(job_dict)
insert(job_dict)
return JSONResponse(content={"id": job.id})
except Exception as e:
LOG.error(f"Exception occurred: {traceback.format_exc()}")
return JSONResponse(content={"error": str(e)}, status_code=500)
@@ -59,8 +70,11 @@ async def retrieve_scrape_jobs(
fetch_options: FetchOptions, user: User = Depends(get_current_user)
):
LOG.info(f"Retrieving jobs for account: {user.email}")
ATTRIBUTES = "chat" if fetch_options.chat else "*"
try:
results = await query({"user": user.email}, fetch_options=fetch_options)
job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?"
results = query(job_query, (user.email,))
return JSONResponse(content=jsonable_encoder(results[::-1]))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
@@ -72,8 +86,8 @@ async def job(id: str, user: User = Depends(get_current_user)):
LOG.info(f"Retrieving jobs for account: {user.email}")
try:
filter = {"user": user.email, "id": id}
results = await query(filter)
job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?"
results = query(job_query, (user.email, id))
return JSONResponse(content=jsonable_encoder(results))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
@@ -85,7 +99,10 @@ async def download(download_job: DownloadJob):
LOG.info(f"Downloading job with ids: {download_job.ids}")
try:
results = await query({"id": {"$in": download_job.ids}})
job_query = (
f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}"
)
results = query(job_query, tuple(download_job.ids))
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
@@ -136,3 +153,47 @@ async def delete(delete_scrape_jobs: DeleteScrapeJobs):
if result
else JSONResponse({"error": "Jobs not deleted."})
)
@job_router.post("/schedule-cron-job")
async def schedule_cron_job(cron_job: CronJob):
if not cron_job.id:
cron_job.id = uuid.uuid4().hex
if not cron_job.time_created:
cron_job.time_created = datetime.datetime.now()
if not cron_job.time_updated:
cron_job.time_updated = datetime.datetime.now()
insert_cron_job(cron_job)
queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,))
scheduler.add_job(
insert_job_from_cron_job,
get_cron_job_trigger(cron_job.cron_expression),
id=cron_job.id,
args=[queried_job[0]],
)
return JSONResponse(content={"message": "Cron job scheduled successfully."})
@job_router.post("/delete-cron-job")
async def delete_cron_job_request(request: DeleteCronJob):
if not request.id:
return JSONResponse(
content={"error": "Cron job id is required."}, status_code=400
)
delete_cron_job(request.id, request.user_email)
scheduler.remove_job(request.id)
return JSONResponse(content={"message": "Cron job deleted successfully."})
@job_router.get("/cron-jobs")
async def get_cron_jobs_request(user: User = Depends(get_current_user)):
cron_jobs = get_cron_jobs(user.email)
return JSONResponse(content=jsonable_encoder(cron_jobs))
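
The new cron endpoints can be exercised with a short script. A sketch, assuming the default `/api` prefix and an already-created job (the job id is a placeholder):

```python
# Sketch: schedule an existing job to re-run hourly via the new endpoint.
# Payload shape follows the CronJob model; the job id is a placeholder.
import requests

payload = {
    "user_email": "test@test.com",
    "job_id": "<existing job id>",
    "cron_expression": "0 * * * *",  # top of every hour
}

resp = requests.post("http://localhost/api/schedule-cron-job", json=payload, timeout=30)
print(resp.json())  # {"message": "Cron job scheduled successfully."}
```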

api/backend/scheduler.py (new file)

@@ -0,0 +1,3 @@
from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
scheduler = BackgroundScheduler()


@@ -1,19 +1,21 @@
import logging
from typing import Any, Optional
import time
import random
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from lxml import etree
from seleniumwire import webdriver
from lxml.etree import _Element # type: ignore [reportPrivateImport]
from lxml.etree import _Element
from fake_useragent import UserAgent
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options as ChromeOptions
from urllib.parse import urlparse, urljoin
from api.backend.models import Element, CapturedElement
from api.backend.job.site_mapping.site_mapping import (
handle_site_mapping,
)
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from api.backend.job.scraping.scraping_utils import scrape_content
LOG = logging.getLogger(__name__)
@@ -69,21 +71,27 @@ def create_driver(proxies: Optional[list[str]] = []):
chrome_options.add_argument(f"user-agent={ua.random}")
sw_options = {}
if proxies:
selected_proxy = proxies[random.randint(0, len(proxies) - 1)]
selected_proxy = random.choice(proxies)
LOG.info(f"Using proxy: {selected_proxy}")
sw_options = {
"proxy": {
"https": f"https://{selected_proxy}",
"http": f"http://{selected_proxy}",
"no_proxy": "localhost,127.0.0.1",
}
}
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(
service=service,
options=chrome_options,
seleniumwire_options=sw_options,
)
return driver
@@ -95,6 +103,8 @@ async def make_site_request(
pages: set[tuple[str, str]] = set(),
original_url: str = "",
proxies: Optional[list[str]] = [],
site_map: Optional[dict[str, Any]] = None,
collect_media: bool = False,
) -> None:
"""Make basic `GET` request to site using Selenium."""
# Check if URL has already been visited
@@ -114,27 +124,16 @@ async def make_site_request(
final_url = driver.current_url
visited_urls.add(url)
visited_urls.add(final_url)
_ = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "body"))
)
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
page_source = scrape_content(driver, pages, collect_media)
time.sleep(3) # Wait for the page to load
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
final_height = driver.execute_script("return document.body.scrollHeight")
page_source = driver.page_source
LOG.debug(f"Page source for url: {url}\n{page_source}")
pages.add((page_source, final_url))
if site_map:
LOG.info("Site map: %s", site_map)
_ = await handle_site_mapping(
site_map,
driver,
pages,
)
finally:
driver.quit()
@@ -144,7 +143,10 @@ async def make_site_request(
soup = BeautifulSoup(page_source, "html.parser")
for a_tag in soup.find_all("a"):
link = a_tag.get("href")
if not isinstance(a_tag, Tag):
continue
link = str(a_tag.get("href", ""))
if link:
if not urlparse(link).netloc:
@@ -172,7 +174,10 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
el = sxpath(root, elem.xpath)
for e in el:
text = "\t".join(str(t) for t in e.itertext())
if isinstance(e, etree._Element): # type: ignore
text = "\t".join(str(t) for t in e.itertext())
else:
text = str(e)
captured_element = CapturedElement(
xpath=elem.xpath, text=text, name=elem.name
)
@@ -192,6 +197,8 @@ async def scrape(
headers: Optional[dict[str, Any]],
multi_page_scrape: bool = False,
proxies: Optional[list[str]] = [],
site_map: Optional[dict[str, Any]] = None,
collect_media: bool = False,
):
visited_urls: set[str] = set()
pages: set[tuple[str, str]] = set()
@@ -204,6 +211,8 @@ async def scrape(
pages=pages,
original_url=url,
proxies=proxies,
site_map=site_map,
collect_media=collect_media,
)
elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()


@@ -1,15 +1,10 @@
import pytest
import logging
from unittest.mock import AsyncMock, patch, MagicMock
from api.backend.tests.factories.job_factory import create_job
from api.backend.models import JobOptions
from api.backend.scraping import create_driver
mocked_job = create_job(
job_options=JobOptions(
multi_page_scrape=False, custom_headers={}, proxies=["127.0.0.1:8080"]
)
).model_dump()
logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger(__name__)
@pytest.mark.asyncio
@@ -26,8 +21,7 @@ async def test_proxy(mock_get: AsyncMock):
driver.get("http://example.com")
response = driver.last_request
# Check if the proxy header is set correctly
if response:
assert response.headers["Proxy"] == "127.0.0.1:8080"
assert response.headers["Proxy-Connection"] == "keep-alive"
driver.quit()


@@ -1,5 +1,8 @@
from typing import Optional
from typing import Any, Optional
import logging
import json
LOG = logging.getLogger(__name__)
def clean_text(text: str):
@@ -17,3 +20,30 @@ def get_log_level(level_name: Optional[str]) -> int:
level = getattr(logging, level_name, logging.INFO)
return level
def format_list_for_query(ids: list[str]):
return (
f"({','.join(['?' for _ in ids])})" # Returns placeholders, e.g., "(?, ?, ?)"
)
def format_sql_row_to_python(row: dict[str, Any]):
new_row: dict[str, Any] = {}
for key, value in row.items():
if isinstance(value, str):
try:
new_row[key] = json.loads(value)
except json.JSONDecodeError:
new_row[key] = value
else:
new_row[key] = value
return new_row
def format_json(items: list[Any]):
for idx, item in enumerate(items):
if isinstance(item, (dict, list)):
formatted_item = json.dumps(item)
items[idx] = formatted_item
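
For clarity, `format_list_for_query` is what lets the sqlite layer build parameterized `IN` clauses; a sketch of it at work, mirroring `delete_jobs` in `job.py`:

```python
# Sketch: building a parameterized IN clause, as delete_jobs does above.
ids = ["a1", "b2", "c3"]
sql = f"DELETE FROM jobs WHERE id IN {format_list_for_query(ids)}"
# sql == "DELETE FROM jobs WHERE id IN (?,?,?)"
# executed as: update(sql, tuple(ids))
```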


@@ -8,6 +8,8 @@ import logging
import sys
import traceback
from api.backend.database.startup import init_database
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
LOG = logging.getLogger(__name__)
@@ -24,6 +26,8 @@ async def process_job():
job["job_options"]["custom_headers"],
job["job_options"]["multi_page_scrape"],
job["job_options"]["proxies"],
job["job_options"]["site_map"],
job["job_options"]["collect_media"],
)
LOG.info(
f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
@@ -40,6 +44,9 @@ async def process_job():
async def main():
LOG.info("Starting job worker...")
init_database()
while True:
await process_job()
await asyncio.sleep(5)


@@ -0,0 +1,60 @@
describe("Authentication", () => {
it("should register", () => {
cy.intercept("POST", "/api/signup").as("signup");
cy.visit("/").then(() => {
cy.get("button").contains("Login").click();
cy.url().should("include", "/login");
cy.get("form").should("be.visible");
cy.get("button")
.contains("No Account? Sign up")
.should("be.visible")
.click();
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("input[name='fullName']").type("John Doe");
cy.get("button[type='submit']").contains("Signup").click();
cy.wait("@signup").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("signup request did not return a response");
}
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
expect(interception.response.statusCode).to.eq(200);
});
});
});
it("should login", () => {
cy.intercept("POST", "/api/token").as("token");
cy.visit("/").then(() => {
cy.get("button")
.contains("Login")
.click()
.then(() => {
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("button[type='submit']").contains("Login").click();
cy.wait("@token").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("token request did not return a response");
}
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
expect(interception.response.statusCode).to.eq(200);
});
});
});
});
});


@@ -1,19 +1,34 @@
describe("Job", () => {
describe.only("Job", () => {
it("should create a job", () => {
cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
cy.visit("/");
const input = cy.get('[data-cy="url-input"]');
input.type("https://example.com");
cy.get('[data-cy="url-input"]').type("https://example.com");
cy.get('[data-cy="name-field"]').type("example");
cy.get('[data-cy="xpath-field"]').type("//body");
cy.get('[data-cy="add-button"]').click();
const nameField = cy.get('[data-cy="name-field"]');
const xPathField = cy.get('[data-cy="xpath-field"]');
const addButton = cy.get('[data-cy="add-button"]');
cy.contains("Submit").click();
nameField.type("example");
xPathField.type("//body");
addButton.click();
cy.wait("@submitScrapeJob").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
cy.log("Request body: " + JSON.stringify(interception.request?.body));
throw new Error("submitScrapeJob request did not return a response");
}
const submit = cy.contains("Submit");
submit.click();
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
expect(interception.response.statusCode).to.eq(200);
});
cy.get("li").contains("Previous Jobs").click();
cy.contains("div", "https://example.com", { timeout: 10000 }).should(
"exist"
);
cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
});
});


@@ -34,4 +34,4 @@
// visit(originalFn: CommandOriginalFn, url: string, options: Partial<VisitOptions>): Chainable<Element>
// }
// }
// }
// }


@@ -2,12 +2,6 @@ version: "3"
services:
scraperr:
command: ["npm", "run", "dev"]
labels:
- "traefik.enable=true"
- "traefik.http.routers.scraperr.rule=Host(`localhost`)"
- "traefik.http.routers.scraperr.entrypoints=web"
- "traefik.http.services.scraperr.loadbalancer.server.port=3000"
- "traefik.http.routers.scraperr.tls=false"
volumes:
- "$PWD/src:/app/src"
- "$PWD/public:/app/public"
@@ -16,7 +10,8 @@ services:
- "$PWD/package-lock.json:/app/package-lock.json"
- "$PWD/tsconfig.json:/app/tsconfig.json"
scraperr_api:
ports:
- "8000:8000"
environment:
- LOG_LEVEL=INFO
volumes:
- "$PWD/api:/project/api"
- "$PWD/scraping:/project/scraping"


@@ -1,16 +1,18 @@
services:
scraperr:
depends_on:
- scraperr_api
image: jpyles0524/scraperr:latest
build:
context: .
dockerfile: docker/frontend/Dockerfile
container_name: scraperr
command: ["npm", "run", "start"]
labels:
- "traefik.enable=true"
- "traefik.http.routers.scraperr.rule=Host(`localhost`)" # change this to your domain, if not running on localhost
- "traefik.http.routers.scraperr.entrypoints=web" # websecure if using https
- "traefik.http.services.scraperr.loadbalancer.server.port=3000"
environment:
- NEXT_PUBLIC_API_URL=http://scraperr_api:8000 # your API URL
- SERVER_URL=http://scraperr_api:8000 # your docker container API URL
ports:
- 80:3000
networks:
- web
scraperr_api:
@@ -21,45 +23,16 @@ services:
dockerfile: docker/api/Dockerfile
environment:
- LOG_LEVEL=INFO
- OLLAMA_URL=http://ollama:11434
- OLLAMA_MODEL=phi3
- MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB
- SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string)
- SECRET_KEY=MRo9PfasPibnqFeK4Oswb6Z+PhFmjzdvxZzwdAkbf/Y= # used to encode authentication tokens (can be a random string)
- ALGORITHM=HS256 # authentication encoding algorithm
- ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token expire minutes
container_name: scraperr_api
volumes:
- /var/run/docker.sock:/var/run/docker.sock
labels:
- "traefik.enable=true"
- "traefik.http.routers.scraperr_api.rule=Host(`localhost`) && PathPrefix(`/api`)" # change this to your domain, if not running on localhost
- "traefik.http.routers.scraperr_api.entrypoints=web" # websecure if using https
- "traefik.http.middlewares.api-stripprefix.stripprefix.prefixes=/api"
- "traefik.http.routers.scraperr_api.middlewares=api-stripprefix"
- "traefik.http.services.scraperr_api.loadbalancer.server.port=8000"
networks:
- web
traefik:
image: traefik:latest
container_name: traefik
command:
- "--providers.docker=true"
- "--entrypoints.web.address=:80"
- "--entrypoints.websecure.address=:443"
ports:
- 80:80
- 443:443
- 8000:8000
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
networks:
- web
mongo:
container_name: webscrape-mongo
image: mongo
restart: always
environment:
MONGO_INITDB_ROOT_USERNAME: root
MONGO_INITDB_ROOT_PASSWORD: example
- "$PWD/data:/project/data"
- "$PWD/media:/project/media"
- /var/run/docker.sock:/var/run/docker.sock
networks:
- web
networks:


@@ -1,5 +1,5 @@
# Build next dependencies
FROM node:latest
FROM node:23.1
WORKDIR /app
COPY package*.json ./
@@ -15,6 +15,4 @@ COPY src /app/src
RUN npm run build
EXPOSE 3000
# CMD [ "npm", "run" ]
EXPOSE 3000


@@ -1,4 +0,0 @@
tls:
certificates:
- certFile: /etc/certs/ssl-cert.pem
keyFile: /etc/certs/ssl-cert.key


@@ -1,37 +0,0 @@
# STL
import os
# PDM
import boto3
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
def test_insert_and_delete():
# Get environment variables
region_name = os.getenv("AWS_REGION")
# Initialize DynamoDB resource
dynamodb = boto3.resource("dynamodb", region_name=region_name)
table = dynamodb.Table("scrape")
# Item to insert
item = {
"id": "123", # Replace with the appropriate id value
"attribute1": "value1",
"attribute2": "value2",
# Add more attributes as needed
}
# Insert the item
table.put_item(Item=item)
print(f"Inserted item: {item}")
# Delete the item
table.delete_item(Key={"id": "123"}) # Replace with the appropriate id value
print(f"Deleted item with id: {item['id']}")
if __name__ == "__main__":
test_insert_and_delete()

package-lock.json (generated)

File diff suppressed because it is too large.


@@ -19,6 +19,7 @@
"bootstrap": "^5.3.0",
"chart.js": "^4.4.3",
"cookie": "^0.6.0",
"dotenv": "^16.5.0",
"framer-motion": "^4.1.17",
"js-cookie": "^3.0.5",
"next": "^14.2.4",
@@ -31,7 +32,6 @@
"react-modal-image": "^2.6.0",
"react-router": "^6.14.1",
"react-router-dom": "^6.14.1",
"react-scripts": "^5.0.1",
"react-spinners": "^0.14.1",
"typescript": "^4.9.5",
"web-vitals": "^2.1.4"
@@ -63,12 +63,18 @@
]
},
"devDependencies": {
"@types/cypress": "^0.1.6",
"@types/cypress": "^1.1.6",
"@types/js-cookie": "^3.0.6",
"cypress": "^13.15.0",
"autoprefixer": "^10.4.21",
"cypress": "^13.17.0",
"eslint": "^9.26.0",
"postcss": "^8.5.3",
"tailwindcss": "^3.3.5"
},
"overrides": {
"react-refresh": "0.11.0"
},
"resolutions": {
"postcss": "^8.4.31"
}
}

pdm.lock (generated)

File diff suppressed because it is too large.


@@ -2,9 +2,7 @@
name = "web-scrape"
version = "0.1.0"
description = ""
authors = [
{name = "Jayden Pyles", email = "jpylesbuisness@gmail.com"},
]
authors = [{ name = "Jayden Pyles", email = "jpylesbuisness@gmail.com" }]
dependencies = [
"uvicorn>=0.30.1",
"fastapi>=0.111.0",
@@ -39,20 +37,19 @@ dependencies = [
"exceptiongroup>=1.2.2",
"Faker>=30.6.0",
"pytest-asyncio>=0.24.0",
"python-multipart>=0.0.12",
"python-multipart>=0.0.1",
"bcrypt==4.0.1",
"apscheduler>=3.11.0",
]
requires-python = ">=3.10"
readme = "README.md"
license = {text = "MIT"}
license = { text = "MIT" }
[tool.pdm]
distribution = true
[tool.pdm.dev-dependencies]
dev = [
"ipython>=8.26.0",
"pytest>=8.3.3",
]
dev = ["ipython>=8.26.0", "pytest>=8.3.3"]
[tool.pyright]
include = ["./api/backend/"]
exclude = ["**/node_modules", "**/__pycache__"]
@@ -60,14 +57,42 @@ ignore = []
defineConstant = { DEBUG = true }
stubPath = ""
reportUnknownMemberType= false
reportMissingImports = true
reportMissingTypeStubs = false
reportAny = false
reportCallInDefaultInitializer = false
# Type checking strictness
typeCheckingMode = "strict" # Enables strict type checking mode
reportPrivateUsage = "none"
reportMissingTypeStubs = "none"
reportUntypedFunctionDecorator = "error"
reportUntypedClassDecorator = "error"
reportUntypedBaseClass = "error"
reportInvalidTypeVarUse = "error"
reportUnnecessaryTypeIgnoreComment = "information"
reportUnknownVariableType = "none"
reportUnknownMemberType = "none"
reportUnknownParameterType = "none"
pythonVersion = "3.9"
pythonPlatform = "Linux"
# Additional checks
reportImplicitStringConcatenation = "error"
reportInvalidStringEscapeSequence = "error"
reportMissingImports = "error"
reportMissingModuleSource = "error"
reportOptionalCall = "error"
reportOptionalIterable = "error"
reportOptionalMemberAccess = "error"
reportOptionalOperand = "error"
reportOptionalSubscript = "error"
reportTypedDictNotRequiredAccess = "error"
# Function return type checking
reportIncompleteStub = "error"
reportIncompatibleMethodOverride = "error"
reportInvalidStubStatement = "error"
reportInconsistentOverload = "error"
# Misc settings
pythonVersion = "3.10" # Matches your Python version from pyproject.toml
strictListInference = true
strictDictionaryInference = true
strictSetInference = true
[tool.isort]

View File

@@ -2,7 +2,7 @@
import React from "react";
import { useAuth } from "../../../contexts/AuthContext";
import { Box, Drawer, Divider } from "@mui/material";
import { Box, Drawer } from "@mui/material";
import { QuickSettings } from "../../nav/quick-settings";
import { NavItems } from "./nav-items/nav-items";

View File

@@ -7,6 +7,7 @@ import TerminalIcon from "@mui/icons-material/Terminal";
import BarChart from "@mui/icons-material/BarChart";
import AutoAwesomeIcon from "@mui/icons-material/AutoAwesome";
import { List } from "@mui/material";
import { Schedule } from "@mui/icons-material";
const items = [
{
@@ -34,6 +35,11 @@ const items = [
text: "View App Logs",
href: "/logs",
},
{
icon: <Schedule />,
text: "Cron Jobs",
href: "/cron-jobs",
},
];
export const NavItems = () => {

View File

@@ -15,6 +15,7 @@ import {
Button,
Tooltip,
IconButton,
TableContainer,
} from "@mui/material";
import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
import StarIcon from "@mui/icons-material/Star";
@@ -52,145 +53,155 @@ export const JobQueue = ({
const router = useRouter();
return (
<Table sx={{ tableLayout: "fixed", width: "100%" }}>
<TableHead>
<TableRow>
<TableCell>Select</TableCell>
<TableCell>Id</TableCell>
<TableCell>Url</TableCell>
<TableCell>Elements</TableCell>
<TableCell>Result</TableCell>
<TableCell>Time Created</TableCell>
<TableCell>Status</TableCell>
<TableCell>Actions</TableCell>
</TableRow>
</TableHead>
<TableBody>
{filteredJobs.map((row, index) => (
<TableRow key={index}>
<TableCell padding="checkbox">
<Checkbox
checked={selectedJobs.has(row.id)}
onChange={() => onSelectJob(row.id)}
/>
<Tooltip title="Chat with AI">
<span>
<IconButton
onClick={() => {
router.push({
pathname: "/chat",
query: {
job: row.id,
},
});
}}
>
<AutoAwesome />
</IconButton>
</span>
</Tooltip>
<Tooltip title="Favorite Job">
<span>
<IconButton
color={row.favorite ? "warning" : "default"}
onClick={() => {
onFavorite([row.id], "favorite", !row.favorite);
row.favorite = !row.favorite;
}}
>
<StarIcon />
</IconButton>
</span>
</Tooltip>
</TableCell>
<TableCell sx={{ maxWidth: 100, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.id}</Box>
</TableCell>
<TableCell sx={{ maxWidth: 200, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.url}</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
{JSON.stringify(row.elements)}
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto", padding: 0 }}>
<Accordion sx={{ margin: 0, padding: 0.5 }}>
<AccordionSummary
expandIcon={<ExpandMoreIcon />}
aria-controls="panel1a-content"
id="panel1a-header"
sx={{
minHeight: 0,
"&.Mui-expanded": { minHeight: 0 },
}}
>
<Box
sx={{
maxHeight: 150,
overflow: "auto",
width: "100%",
}}
>
<Typography sx={{ fontSize: "0.875rem" }}>
Show Result
</Typography>
</Box>
</AccordionSummary>
<AccordionDetails sx={{ padding: 1 }}>
<Box sx={{ maxHeight: 200, overflow: "auto" }}>
<Typography
sx={{
fontSize: "0.875rem",
whiteSpace: "pre-wrap",
<TableContainer component={Box} sx={{ maxHeight: "90dvh" }}>
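{/* Cap the table height so long job lists scroll inside the container rather than the page */}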
<Table sx={{ tableLayout: "fixed", width: "100%" }}>
<TableHead>
<TableRow>
<TableCell>Select</TableCell>
<TableCell>Id</TableCell>
<TableCell>Url</TableCell>
<TableCell>Elements</TableCell>
<TableCell>Result</TableCell>
<TableCell>Time Created</TableCell>
<TableCell>Status</TableCell>
<TableCell>Actions</TableCell>
</TableRow>
</TableHead>
<TableBody sx={{ overflow: "auto" }}>
{filteredJobs.map((row, index) => (
<TableRow key={index}>
<TableCell padding="checkbox">
<Checkbox
checked={selectedJobs.has(row.id)}
onChange={() => onSelectJob(row.id)}
/>
<Tooltip title="Chat with AI">
<span>
<IconButton
onClick={() => {
router.push({
pathname: "/chat",
query: {
job: row.id,
},
});
}}
>
{JSON.stringify(row.result, null, 2)}
</Typography>
</Box>
</AccordionDetails>
</Accordion>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
{new Date(row.time_created).toLocaleString()}
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 50, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
<Box
className="rounded-md p-2 text-center"
sx={{ bgcolor: colors[row.status] }}
>
{row.status}
<AutoAwesome />
</IconButton>
</span>
</Tooltip>
<Tooltip title="Favorite Job">
<span>
<IconButton
color={row.favorite ? "warning" : "default"}
onClick={() => {
onFavorite([row.id], "favorite", !row.favorite);
row.favorite = !row.favorite;
}}
>
<StarIcon />
</IconButton>
</span>
</Tooltip>
</TableCell>
<TableCell sx={{ maxWidth: 100, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.id}</Box>
</TableCell>
<TableCell sx={{ maxWidth: 200, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.url}</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
{JSON.stringify(row.elements)}
</Box>
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ display: "flex", gap: 1 }}>
<Button
onClick={() => {
onDownload([row.id]);
}}
size="small"
sx={{ minWidth: 0, padding: "4px 8px" }}
>
Download
</Button>
<Button
onClick={() =>
onNavigate(row.elements, row.url, row.job_options)
}
size="small"
sx={{ minWidth: 0, padding: "4px 8px" }}
>
Rerun
</Button>
</Box>
</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto", padding: 0 }}>
<Accordion sx={{ margin: 0, padding: 0.5 }}>
<AccordionSummary
expandIcon={<ExpandMoreIcon />}
aria-controls="panel1a-content"
id="panel1a-header"
sx={{
minHeight: 0,
"&.Mui-expanded": { minHeight: 0 },
}}
>
<Box
sx={{
maxHeight: 150,
overflow: "auto",
width: "100%",
}}
>
<Typography sx={{ fontSize: "0.875rem" }}>
Show Result
</Typography>
</Box>
</AccordionSummary>
<AccordionDetails sx={{ padding: 1 }}>
<Box sx={{ maxHeight: 200, overflow: "auto" }}>
<Typography
sx={{
fontSize: "0.875rem",
whiteSpace: "pre-wrap",
}}
>
{JSON.stringify(row.result, null, 2)}
</Typography>
</Box>
</AccordionDetails>
</Accordion>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
{new Date(row.time_created).toLocaleString()}
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 50, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
<Box
className="rounded-md p-2 text-center"
sx={{ bgcolor: colors[row.status] }}
>
{row.status}
</Box>
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ display: "flex", gap: 1 }}>
<Button
onClick={() => {
onDownload([row.id]);
}}
size="small"
sx={{
minWidth: 0,
padding: "4px 8px",
fontSize: "0.625rem",
}}
>
Download
</Button>
<Button
onClick={() =>
onNavigate(row.elements, row.url, row.job_options)
}
size="small"
sx={{
minWidth: 0,
padding: "4px 8px",
fontSize: "0.625rem",
}}
>
Rerun
</Button>
</Box>
</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</TableContainer>
);
};

View File

@@ -48,10 +48,10 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
const router = useRouter();
const handleDownload = async (ids: string[]) => {
const response = await fetch(`${Constants.DOMAIN}/api/download`, {
const response = await fetch("/api/download", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ ids: ids }),
body: JSON.stringify({ data: { ids: ids } }),
});
if (response.ok) {
@@ -104,10 +104,10 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
};
const handleDeleteSelected = async () => {
const response = await fetch(`${Constants.DOMAIN}/api/delete-scrape-jobs`, {
const response = await fetch("/api/delete", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ ids: Array.from(selectedJobs) }),
body: JSON.stringify({ data: { ids: Array.from(selectedJobs) } }),
});
if (response.ok) {
@@ -142,13 +142,13 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
value: value,
};
await fetch(`${Constants.DOMAIN}/api/update`, {
await fetch("/api/update", {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${token}`,
},
body: JSON.stringify(postBody),
body: JSON.stringify({ data: postBody }),
});
};

View File

@@ -14,19 +14,24 @@ export const LogContainer: React.FC<LogContainerProps> = ({ initialLogs }) => {
const logsContainerRef = useRef<HTMLDivElement | null>(null);
useEffect(() => {
const eventSource = new EventSource(`${Constants.DOMAIN}/api/logs`);
const eventSource = new EventSource(`/api/logs`);
setLogs("");
eventSource.onmessage = (event) => {
setLogs((prevLogs) => prevLogs + event.data + "\n");
if (logsContainerRef.current) {
logsContainerRef.current.scrollTop =
logsContainerRef.current.scrollHeight;
}
};
eventSource.onerror = () => {
eventSource.onopen = () => {};
eventSource.onerror = (error) => {
console.error("EventSource failed:", error);
eventSource.close();
};

View File

@@ -0,0 +1,182 @@
import { Job } from "@/types";
import {
Button,
Dialog,
DialogTitle,
DialogContent,
TextField,
Snackbar,
Alert,
} from "@mui/material";
import Cookies from "js-cookie";
import { useState } from "react";
export type CreateCronJobsProps = {
availableJobs: Job[];
user: any;
};
export const CreateCronJobs = ({
availableJobs,
user,
}: CreateCronJobsProps) => {
const [open, setOpen] = useState(false);
return (
<>
<Button
variant="contained"
color="primary"
onClick={() => setOpen(true)}
sx={{ borderRadius: 2 }}
>
Create Cron Job
</Button>
<CreateCronJobDialog
open={open}
onClose={() => setOpen(false)}
availableJobs={availableJobs}
user={user}
/>
</>
);
};
const CreateCronJobDialog = ({
open,
onClose,
availableJobs,
user,
}: {
open: boolean;
onClose: () => void;
availableJobs: Job[];
user: any;
}) => {
const [cronExpression, setCronExpression] = useState("");
const [jobId, setJobId] = useState("");
const [successOpen, setSuccessOpen] = useState(false);
const [isSubmitting, setIsSubmitting] = useState(false);
const [error, setError] = useState("");
const handleSubmit = async () => {
if (!cronExpression || !jobId) {
setError("Please fill in all fields");
return;
}
setIsSubmitting(true);
const token = Cookies.get("token");
try {
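// POST to the Next.js proxy route, which forwards the payload to the backend scheduler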
const response = await fetch("/api/schedule-cron-job", {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${token}`,
},
body: JSON.stringify({
data: {
cron_expression: cronExpression,
job_id: jobId,
user_email: user.email,
},
}),
});
if (!response.ok) {
throw new Error("Failed to schedule job");
}
setSuccessOpen(true);
setCronExpression("");
setJobId("");
setTimeout(() => {
onClose();
// Reload once the dialog has closed so the new cron job shows up in the table
window.location.reload();
}, 1500);
} catch (error) {
console.error(error);
setError("Failed to create cron job");
} finally {
setIsSubmitting(false);
}
};
const handleClose = () => {
setSuccessOpen(false);
};
return (
<>
<Dialog
open={open}
onClose={onClose}
PaperProps={{
sx: { borderRadius: 2, minWidth: "400px" },
}}
>
<DialogTitle sx={{ fontWeight: 500 }}>Create Cron Job</DialogTitle>
<DialogContent>
<div className="flex flex-col gap-1 mt0">
<TextField
label="Cron Expression"
fullWidth
value={cronExpression}
onChange={(e) => setCronExpression(e.target.value)}
variant="outlined"
placeholder="* * * * *"
margin="normal"
helperText="Format: minute hour day month day-of-week"
/>
<TextField
label="Job ID"
fullWidth
value={jobId}
onChange={(e) => setJobId(e.target.value)}
variant="outlined"
margin="normal"
/>
{error && (
<Alert severity="error" sx={{ mt: 2 }}>
{error}
</Alert>
)}
<div className="flex justify-end gap-2 mt-4">
<Button
variant="outlined"
onClick={onClose}
sx={{ borderRadius: 2 }}
>
Cancel
</Button>
<Button
variant="contained"
color="primary"
onClick={handleSubmit}
disabled={isSubmitting}
sx={{ borderRadius: 2 }}
>
{isSubmitting ? "Submitting..." : "Create Job"}
</Button>
</div>
</div>
</DialogContent>
</Dialog>
<Snackbar
open={successOpen}
autoHideDuration={4000}
onClose={handleClose}
anchorOrigin={{ vertical: "bottom", horizontal: "right" }}
>
<Alert onClose={handleClose} severity="success" sx={{ width: "100%" }}>
Cron job created successfully!
</Alert>
</Snackbar>
</>
);
};

View File

@@ -0,0 +1 @@
export * from "./create-cron-jobs";

View File

@@ -0,0 +1,104 @@
import { Job, CronJob } from "@/types/job";
import { useState, useEffect } from "react";
import { CreateCronJobs } from "./create-cron-jobs";
import {
Table,
TableHead,
TableRow,
TableCell,
TableBody,
Button,
Box,
Typography,
} from "@mui/material";
import Cookies from "js-cookie";
export type CronJobsProps = {
initialJobs: Job[];
initialCronJobs: CronJob[];
initialUser: any;
};
export const CronJobs = ({
initialJobs,
initialCronJobs,
initialUser,
}: CronJobsProps) => {
const [jobs, setJobs] = useState<Job[]>(initialJobs);
const [cronJobs, setCronJobs] = useState<CronJob[]>(initialCronJobs);
const [user, setUser] = useState<any>(initialUser);
useEffect(() => {
setJobs(initialJobs);
setCronJobs(initialCronJobs);
setUser(initialUser);
}, [initialJobs, initialCronJobs, initialUser]);
const handleDeleteCronJob = async (id: string) => {
const token = Cookies.get("token");
const response = await fetch("/api/delete-cron-job", {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${token}`,
},
body: JSON.stringify({ data: { id, user_email: user.email } }),
});
if (response.ok) {
console.log("Cron job deleted successfully");
setCronJobs(cronJobs.filter((cronJob) => cronJob.id !== id));
} else {
console.error("Failed to delete cron job");
}
};
if (!user) {
return (
<Box>
<Typography variant="h6">
Please login to view your cron jobs
</Typography>
</Box>
);
}
return (
<div>
<CreateCronJobs availableJobs={jobs} user={user} />
<Table>
<TableHead>
<TableRow>
<TableCell>Cron Expression</TableCell>
<TableCell>Job ID</TableCell>
<TableCell>User Email</TableCell>
<TableCell>Created At</TableCell>
<TableCell>Updated At</TableCell>
<TableCell>Actions</TableCell>
</TableRow>
</TableHead>
<TableBody>
{cronJobs.map((cronJob) => (
<TableRow key={cronJob.id}>
<TableCell>{cronJob.cron_expression}</TableCell>
<TableCell>{cronJob.job_id}</TableCell>
<TableCell>{cronJob.user_email}</TableCell>
<TableCell>
{new Date(cronJob.time_created).toLocaleString()}
</TableCell>
<TableCell>
{new Date(cronJob.time_updated).toLocaleString()}
</TableCell>
<TableCell>
<Button onClick={() => handleDeleteCronJob(cronJob.id)}>
Delete
</Button>
</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</div>
);
};

View File

@@ -0,0 +1,62 @@
import axios from "axios";
import { GetServerSideProps } from "next";
import { parseCookies } from "nookies";
import { CronJob, Job } from "../../../types";
export const getServerSideProps: GetServerSideProps = async (context) => {
const { req } = context;
const cookies = parseCookies({ req });
const token = cookies.token;
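// Server-side requests call the backend directly rather than going through the Next.js API proxy routes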
let user = null;
let initialJobs: Job[] = [];
let initialCronJobs: CronJob[] = [];
if (token) {
try {
const userResponse = await axios.get(
`${process.env.NEXT_PUBLIC_API_URL}/api/auth/users/me`,
{
headers: { Authorization: `Bearer ${token}` },
}
);
user = userResponse.data;
const jobsResponse = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/api/retrieve-scrape-jobs`,
{
method: "POST",
body: JSON.stringify({ user: user.email }),
headers: {
"content-type": "application/json",
Authorization: `Bearer ${token}`,
},
}
);
initialJobs = await jobsResponse.json();
const cronJobsResponse = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/api/cron-jobs`,
{
headers: {
"content-type": "application/json",
Authorization: `Bearer ${token}`,
},
}
);
initialCronJobs = await cronJobsResponse.json();
} catch (error) {
console.error("Error fetching user or jobs:", error);
}
}
return {
props: {
initialJobs,
initialUser: user,
initialCronJobs,
},
};
};

View File

@@ -0,0 +1 @@
export { CronJobs } from "./cron-jobs";

View File

@@ -0,0 +1,107 @@
"use client";
import React, { useState, useEffect, useRef } from "react";
import { Button, Container, Box, Snackbar, Alert } from "@mui/material";
import { useRouter } from "next/router";
import { Element, Result } from "@/types";
import { ElementTable, JobSubmitter } from "@/components/submit/job-submitter";
import { useJobSubmitterProvider } from "@/components/submit/job-submitter/provider";
export const Home = () => {
const {
submittedURL,
setSubmittedURL,
rows,
setRows,
results,
snackbarOpen,
setSnackbarOpen,
snackbarMessage,
snackbarSeverity,
} = useJobSubmitterProvider();
const router = useRouter();
const { elements, url } = router.query;
const resultsRef = useRef<HTMLTableElement | null>(null);
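// Pre-fill the submitter when a job is re-run via query params (elements, url)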
useEffect(() => {
if (elements) {
setRows(JSON.parse(elements as string));
}
if (url) {
setSubmittedURL(url as string);
}
}, [elements, url]);
useEffect(() => {
if (results && resultsRef.current) {
resultsRef.current.scrollIntoView({ behavior: "smooth" });
}
}, [results]);
const handleCloseSnackbar = () => {
setSnackbarOpen(false);
};
const ErrorSnackbar = () => {
return (
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={handleCloseSnackbar}
>
<Alert onClose={handleCloseSnackbar} severity="error">
{snackbarMessage}
</Alert>
</Snackbar>
);
};
const NotifySnackbar = () => {
const goTo = () => {
router.push("/jobs");
};
const action = (
<Button color="inherit" size="small" onClick={goTo}>
Go To Job
</Button>
);
return (
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={handleCloseSnackbar}
>
<Alert onClose={handleCloseSnackbar} severity="info" action={action}>
{snackbarMessage}
</Alert>
</Snackbar>
);
};
return (
<Box
bgcolor="background.default"
display="flex"
flexDirection="column"
justifyContent="center"
alignItems="center"
height="100%"
py={4}
>
<Container maxWidth="lg" className="overflow-y-auto max-h-full">
<JobSubmitter />
{submittedURL.length ? (
<ElementTable
rows={rows}
setRows={setRows}
submittedURL={submittedURL}
/>
) : null}
</Container>
{snackbarSeverity === "info" ? <NotifySnackbar /> : <ErrorSnackbar />}
</Box>
);
};

View File

@@ -0,0 +1 @@
export * from "./home";

View File

@@ -1,2 +1 @@
export * from "./ElementTable";
export * from "./job-submitter";

View File

@@ -15,9 +15,11 @@ import {
IconButton,
Tooltip,
useTheme,
Divider,
} from "@mui/material";
import AddIcon from "@mui/icons-material/Add";
import { Element } from "../../types";
import { Element } from "@/types";
import { SiteMap } from "../site-map";
interface Props {
rows: Element[];
@@ -169,6 +171,13 @@ export const ElementTable = ({ rows, setRows, submittedURL }: Props) => {
</div>
</TableContainer>
</Box>
<Divider
sx={{
borderColor: theme.palette.mode === "dark" ? "#ffffff" : "#000000",
marginBottom: 2,
}}
/>
<SiteMap />
</Box>
);
};

View File

@@ -0,0 +1 @@
export { ElementTable } from "./element-table";

View File

@@ -1 +1,2 @@
export { JobSubmitter } from "./job-submitter";
export { ElementTable } from "./element-table";

View File

@@ -1,26 +1,20 @@
import React, { Dispatch } from "react";
import React from "react";
import { TextField, Button, CircularProgress } from "@mui/material";
import { Element } from "@/types";
import { useJobSubmitterProvider } from "../provider";
export type JobSubmitterInputProps = {
submittedURL: string;
setSubmittedURL: Dispatch<React.SetStateAction<string>>;
isValidURL: boolean;
urlError: string | null;
handleSubmit: () => void;
loading: boolean;
rows: Element[];
};
export const JobSubmitterInput = ({
submittedURL,
setSubmittedURL,
isValidURL,
urlError,
handleSubmit,
loading,
rows,
urlError,
}: JobSubmitterInputProps) => {
const { submittedURL, setSubmittedURL, isValidURL, rows } =
useJobSubmitterProvider();
return (
<div className="flex flex-row space-x-4 items-center mb-2">
<TextField

View File

@@ -14,9 +14,9 @@ export type JobSubmitterOptionsProps = {
export const JobSubmitterOptions = ({
jobOptions,
setJobOptions,
handleSelectProxies,
customJSONSelected,
setCustomJSONSelected,
handleSelectProxies,
proxiesSelected,
}: JobSubmitterOptionsProps) => {
const handleMultiPageScrapeChange = () => {
@@ -42,6 +42,13 @@ export const JobSubmitterOptions = ({
}));
};
const handleCollectMediaChange = () => {
setJobOptions((prevJobOptions) => ({
...prevJobOptions,
collect_media: !prevJobOptions.collect_media,
}));
};
return (
<Box bgcolor="background.paper" className="flex flex-col mb-2 rounded-md">
<div id="options" className="p-2 flex flex-row space-x-2">
@@ -94,6 +101,15 @@ export const JobSubmitterOptions = ({
/>
}
></FormControlLabel>
<FormControlLabel
label="Collect Media"
control={
<Checkbox
checked={jobOptions.collect_media}
onChange={handleCollectMediaChange}
/>
}
/>
</div>
{customJSONSelected ? (
<div id="custom-json" className="pl-2 pr-2 pb-2">

View File

@@ -1,7 +1,6 @@
"use client";
import React, { useEffect, useState, Dispatch } from "react";
import { Element } from "@/types";
import React, { useEffect, useState } from "react";
import { useAuth } from "@/contexts/AuthContext";
import { useRouter } from "next/router";
import { RawJobOptions } from "@/types/job";
@@ -10,29 +9,16 @@ import { JobSubmitterHeader } from "./job-submitter-header";
import { JobSubmitterInput } from "./job-submitter-input";
import { JobSubmitterOptions } from "./job-submitter-options";
import { ApiService } from "@/services";
interface StateProps {
submittedURL: string;
setSubmittedURL: Dispatch<React.SetStateAction<string>>;
rows: Element[];
isValidURL: boolean;
setIsValidUrl: Dispatch<React.SetStateAction<boolean>>;
setSnackbarMessage: Dispatch<React.SetStateAction<string>>;
setSnackbarOpen: Dispatch<React.SetStateAction<boolean>>;
setSnackbarSeverity: Dispatch<React.SetStateAction<string>>;
}
interface Props {
stateProps: StateProps;
}
import { useJobSubmitterProvider } from "./provider";
const initialJobOptions: RawJobOptions = {
multi_page_scrape: false,
custom_headers: null,
proxies: null,
collect_media: false,
};
export const JobSubmitter = ({ stateProps }: Props) => {
export const JobSubmitter = () => {
const { user } = useAuth();
const router = useRouter();
const { job_options } = router.query;
@@ -40,11 +26,13 @@ export const JobSubmitter = ({ stateProps }: Props) => {
const {
submittedURL,
rows,
siteMap,
setIsValidUrl,
setSnackbarMessage,
setSnackbarOpen,
setSnackbarSeverity,
} = stateProps;
setSiteMap,
} = useJobSubmitterProvider();
const [urlError, setUrlError] = useState<string | null>(null);
const [loading, setLoading] = useState<boolean>(false);
@@ -87,7 +75,8 @@ export const JobSubmitter = ({ stateProps }: Props) => {
rows,
user,
jobOptions,
customHeaders
customHeaders,
siteMap
)
.then(async (response) => {
if (!response.ok) {
@@ -120,31 +109,28 @@ export const JobSubmitter = ({ stateProps }: Props) => {
job_options as string,
setCustomJSONSelected,
setProxiesSelected,
setJobOptions
setJobOptions,
setSiteMap
);
}
}, [job_options]);
return (
<>
<div>
<JobSubmitterHeader />
<JobSubmitterInput
{...stateProps}
urlError={urlError}
handleSubmit={handleSubmit}
loading={loading}
/>
<JobSubmitterOptions
{...stateProps}
jobOptions={jobOptions}
setJobOptions={setJobOptions}
customJSONSelected={customJSONSelected}
setCustomJSONSelected={setCustomJSONSelected}
handleSelectProxies={handleSelectProxies}
proxiesSelected={proxiesSelected}
/>
</div>
</>
<div>
<JobSubmitterHeader />
<JobSubmitterInput
urlError={urlError}
handleSubmit={handleSubmit}
loading={loading}
/>
<JobSubmitterOptions
jobOptions={jobOptions}
setJobOptions={setJobOptions}
customJSONSelected={customJSONSelected}
setCustomJSONSelected={setCustomJSONSelected}
handleSelectProxies={handleSelectProxies}
proxiesSelected={proxiesSelected}
/>
</div>
);
};

View File

@@ -0,0 +1,84 @@
import React, {
createContext,
PropsWithChildren,
useContext,
useState,
Dispatch,
useMemo,
} from "react";
import { Element, Result, SiteMap } from "@/types";
type JobSubmitterProviderType = {
submittedURL: string;
setSubmittedURL: Dispatch<React.SetStateAction<string>>;
rows: Element[];
setRows: Dispatch<React.SetStateAction<Element[]>>;
results: Result;
setResults: Dispatch<React.SetStateAction<Result>>;
snackbarOpen: boolean;
setSnackbarOpen: Dispatch<React.SetStateAction<boolean>>;
snackbarMessage: string;
setSnackbarMessage: Dispatch<React.SetStateAction<string>>;
snackbarSeverity: string;
setSnackbarSeverity: Dispatch<React.SetStateAction<string>>;
isValidURL: boolean;
setIsValidUrl: Dispatch<React.SetStateAction<boolean>>;
siteMap: SiteMap | null;
setSiteMap: Dispatch<React.SetStateAction<SiteMap | null>>;
};
const JobSubmitterProvider = createContext<JobSubmitterProviderType>(
{} as JobSubmitterProviderType
);
export const Provider = ({ children }: PropsWithChildren) => {
const [submittedURL, setSubmittedURL] = useState<string>("");
const [rows, setRows] = useState<Element[]>([]);
const [results, setResults] = useState<Result>({});
const [snackbarOpen, setSnackbarOpen] = useState<boolean>(false);
const [snackbarMessage, setSnackbarMessage] = useState<string>("");
const [snackbarSeverity, setSnackbarSeverity] = useState<string>("error");
const [isValidURL, setIsValidUrl] = useState<boolean>(true);
const [siteMap, setSiteMap] = useState<SiteMap | null>(null);
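// Memoize the context value so consumers only re-render when one of these pieces of state changes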
const value: JobSubmitterProviderType = useMemo(
() => ({
submittedURL,
setSubmittedURL,
rows,
setRows,
results,
setResults,
snackbarOpen,
setSnackbarOpen,
snackbarMessage,
setSnackbarMessage,
snackbarSeverity,
setSnackbarSeverity,
isValidURL,
setIsValidUrl,
siteMap,
setSiteMap,
}),
[
submittedURL,
rows,
results,
snackbarOpen,
snackbarMessage,
snackbarSeverity,
isValidURL,
siteMap,
]
);
return (
<JobSubmitterProvider.Provider value={value}>
{children}
</JobSubmitterProvider.Provider>
);
};
export const useJobSubmitterProvider = () => {
return useContext(JobSubmitterProvider);
};

View File

@@ -0,0 +1 @@
export * from "./site-map";

View File

@@ -0,0 +1 @@
export * from "./site-map-input";

View File

@@ -0,0 +1,22 @@
.button {
height: 3rem;
width: 2rem;
color: #ffffff;
font-weight: 600;
border-radius: 0.375rem;
transition: transform 0.2s ease-in-out;
transform: scale(1);
}
.button:hover {
transform: scale(1.05);
}
.remove {
background-color: var(--delete-red) !important;
}
.remove:hover {
background-color: var(--delete-red-hover) !important;
}

View File

@@ -0,0 +1,135 @@
import { useState } from "react";
import { useJobSubmitterProvider } from "../../provider";
import {
MenuItem,
Select,
TextField,
FormControl,
Button,
Checkbox,
FormControlLabel,
} from "@mui/material";
import { ActionOption } from "@/types/job";
import classes from "./site-map-input.module.css";
import { clsx } from "clsx";
export type SiteMapInputProps = {
disabled?: boolean;
xpath?: string;
option?: ActionOption;
clickOnce?: boolean;
input?: string;
};
export const SiteMapInput = ({
disabled,
xpath,
option,
clickOnce,
input,
}: SiteMapInputProps) => {
const [optionState, setOptionState] = useState<ActionOption>(
option || "click"
);
const [xpathState, setXpathState] = useState<string>(xpath || "");
const [clickOnceState, setClickOnceState] = useState<boolean>(
clickOnce || false
);
const [inputState, setInputState] = useState<string>(input || "");
const { siteMap, setSiteMap } = useJobSubmitterProvider();
const handleAdd = () => {
if (!siteMap) return;
// Prepend the new action; the list is reversed on render, so actions display in the order they were added
setSiteMap((prevSiteMap) => ({
...prevSiteMap,
actions: [
{
type: optionState,
xpath: xpathState,
name: "",
do_once: clickOnceState,
input: inputState,
},
...(prevSiteMap?.actions || []),
],
}));
setXpathState("");
};
const handleRemove = () => {
if (!siteMap) return;
setSiteMap((prevSiteMap) => ({
...prevSiteMap,
actions: (prevSiteMap?.actions || []).slice(0, -1),
}));
};
return (
<div className="flex flex-col gap-2 w-full">
<div className="flex gap-2 items-center">
<FormControl className="w-1/4">
<Select
disabled={disabled}
displayEmpty
value={optionState}
onChange={(e) => setOptionState(e.target.value as ActionOption)}
>
<MenuItem value="click">Click</MenuItem>
<MenuItem value="input">Input</MenuItem>
</Select>
</FormControl>
{optionState === "input" && (
<TextField
label="Input Text"
fullWidth
value={inputState}
onChange={(e) => setInputState(e.target.value)}
disabled={disabled}
/>
)}
<TextField
label="XPath Selector"
fullWidth
value={xpathState}
onChange={(e) => setXpathState(e.target.value)}
disabled={disabled}
/>
{disabled ? (
<Button
onClick={handleRemove}
className={clsx(classes.button, classes.remove)}
>
Delete
</Button>
) : (
<Button
onClick={handleAdd}
disabled={!xpathState}
className={clsx(classes.button, classes.add)}
>
Add
</Button>
)}
</div>
{!disabled && (
<FormControlLabel
label="Do Once"
control={
<Checkbox
checked={clickOnceState}
disabled={disabled}
onChange={() => setClickOnceState(!clickOnceState)}
/>
}
/>
)}
</div>
);
};

View File

@@ -0,0 +1,70 @@
import { useEffect, useState } from "react";
import { useJobSubmitterProvider } from "../provider";
import { Button, Divider, Typography, useTheme } from "@mui/material";
import { SiteMapInput } from "./site-map-input";
export const SiteMap = () => {
const { siteMap, setSiteMap } = useJobSubmitterProvider();
const [showSiteMap, setShowSiteMap] = useState<boolean>(false);
const theme = useTheme();
const handleCreateSiteMap = () => {
setSiteMap({ actions: [] });
setShowSiteMap(true);
};
const handleClearSiteMap = () => {
setSiteMap(null);
setShowSiteMap(false);
};
useEffect(() => {
if (siteMap) {
setShowSiteMap(true);
}
}, [siteMap]);
return (
<div className="flex flex-col gap-4">
{siteMap ? (
<Button onClick={handleClearSiteMap}>Clear Site Map</Button>
) : (
<Button onClick={handleCreateSiteMap}>Create Site Map</Button>
)}
{showSiteMap && (
<div className="flex flex-col gap-4">
<SiteMapInput />
{siteMap?.actions && siteMap?.actions.length > 0 && (
<>
<Divider
sx={{
borderColor:
theme.palette.mode === "dark" ? "#ffffff" : "#000000",
}}
/>
<Typography className="w-full text-center" variant="h5">
Site Map Actions
</Typography>
</>
)}
<ul className="flex flex-col gap-4">
{/* Copy before reversing so the provider state is not mutated during render */}
{[...(siteMap?.actions ?? [])].reverse().map((action, index) => (
<li key={action.xpath} className="flex w-full items-center">
<Typography variant="h6" className="w-[10%] mr-2">
Action {index + 1}:
</Typography>
<SiteMapInput
disabled={Boolean(siteMap)}
xpath={action.xpath}
option={action.type}
clickOnce={action.do_once}
input={action.input}
/>
</li>
))}
</ul>
</div>
)}
</div>
);
};

View File

@@ -1,6 +1,5 @@
import React, { createContext, useContext, useState, useEffect } from "react";
import axios from "axios";
import { Constants } from "../lib";
import Cookies from "js-cookie";
interface AuthContextProps {
@@ -25,7 +24,7 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
const token = Cookies.get("token");
if (token) {
axios
.get(`${Constants.DOMAIN}/api/auth/users/me`, {
.get(`/api/me`, {
headers: { Authorization: `Bearer ${token}` },
})
.then((response) => {
@@ -42,10 +41,8 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
const params = new URLSearchParams();
params.append("username", email);
params.append("password", password);
const response = await axios.post(
`${Constants.DOMAIN}/api/auth/token`,
params
);
const response = await axios.post(`/api/token`, params);
Cookies.set("token", response.data.access_token, {
expires: 7,
path: "/",
@@ -53,12 +50,11 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
secure: false,
sameSite: "Lax",
});
const userResponse = await axios.get(
`${Constants.DOMAIN}/api/auth/users/me`,
{
headers: { Authorization: `Bearer ${response.data.access_token}` },
}
);
const userResponse = await axios.get(`/api/me`, {
headers: { Authorization: `Bearer ${response.data.access_token}` },
});
setUser(userResponse.data);
setIsAuthenticated(true);
};

View File

@@ -1,12 +1,13 @@
import { Dispatch, SetStateAction } from "react";
import { RawJobOptions } from "@/types";
import { RawJobOptions, SiteMap } from "@/types";
export const parseJobOptions = (
job_options: string,
setCustomJSONSelected: Dispatch<SetStateAction<boolean>>,
setProxiesSelected: Dispatch<SetStateAction<boolean>>,
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>,
setSiteMap: Dispatch<SetStateAction<SiteMap | null>>
) => {
if (job_options) {
const jsonOptions = JSON.parse(job_options as string);
@@ -14,6 +15,7 @@ export const parseJobOptions = (
multi_page_scrape: false,
custom_headers: null,
proxies: null,
collect_media: false,
};
if (
@@ -31,6 +33,10 @@ export const parseJobOptions = (
newJobOptions.proxies = jsonOptions.proxies.join(",");
}
if (jsonOptions.site_map) {
setSiteMap(jsonOptions.site_map);
}
setJobOptions(newJobOptions);
}
};

View File

@@ -11,13 +11,13 @@ export const fetchJobs = async (
fetchOptions: fetchOptions = {}
) => {
const token = Cookies.get("token");
await fetch(`/api/retrieve-scrape-jobs`, {
await fetch("/api/retrieve", {
method: "POST",
headers: {
"content-type": "application/json",
Authorization: `Bearer ${token}`,
},
body: JSON.stringify(fetchOptions),
body: JSON.stringify({ data: fetchOptions }),
})
.then((response) => response.json())
.then((data) => setJobs(data))
@@ -48,7 +48,7 @@ export const checkAI = async (
) => {
const token = Cookies.get("token");
try {
const response = await fetch(`/api/ai/check`, {
const response = await fetch("/api/ai/check", {
headers: {
"content-type": "application/json",
Authorization: `Bearer ${token}`,
@@ -69,13 +69,13 @@ export const updateJob = async (ids: string[], field: string, value: any) => {
field: field,
value: value,
};
await fetch(`/api/update`, {
await fetch("/api/update", {
method: "POST",
headers: {
"content-type": "application/json",
Authorization: `Bearer ${token}`,
},
body: JSON.stringify(postBody),
body: JSON.stringify({ data: postBody }),
}).catch((error) => {
console.error("Error fetching jobs:", error);
});

src/pages/api/ai/check.ts
View File

@@ -0,0 +1,30 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
try {
const headers = new Headers(req.headers as Record<string, string>);
headers.set("content-type", "application/json");
headers.set("Authorization", `Bearer ${req.headers.authorization}`);
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/ai/check`,
{
method: "GET",
headers,
}
);
if (!response.ok) {
throw new Error(`Error: ${response.statusText}`);
}
const result = await response.json();
res.status(200).json(result);
} catch (error) {
console.error("Error submitting scrape job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
}

src/pages/api/ai/index.ts
View File

@@ -0,0 +1,56 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
const { data } = req.body;
try {
const response = await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/ai`, {
method: "POST",
headers: {
Accept: "text/event-stream",
"Content-Type": "application/json",
},
body: JSON.stringify(data),
});
if (!response.ok) {
const errorDetails = await response.text();
if (response.status === 422) {
console.error(`422 Error: ${errorDetails}`);
}
throw new Error(
`Error fetching AI response: ${response.statusText} - ${errorDetails}`
);
}
if (!response.body) {
throw new Error(`No response body from API`);
}
res.writeHead(200, {
"Content-Type": "text/event-stream",
"Cache-Control": "no-cache, no-transform",
Connection: "keep-alive",
"Transfer-Encoding": "chunked",
});
let responseStream = response.body;
const reader = responseStream.getReader();
const decoder = new TextDecoder();
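// Relay the upstream stream to the client one chunk at a time until it completes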
while (true) {
const { done, value } = await reader.read();
if (done) break;
const chunk = decoder.decode(value, { stream: true });
res.write(`${chunk}`);
}
res.end();
} catch (error) {
console.error("Error streaming logs:", error);
res.status(500).json({ error: "Internal Server Error" });
}
}

View File

@@ -0,0 +1,39 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
if (req.method === "POST") {
const { data } = req.body;
console.log("Data", data);
const headers = new Headers();
headers.set("content-type", "application/json");
try {
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/delete-cron-job`,
{
method: "POST",
headers,
body: JSON.stringify(data),
}
);
if (!response.ok) {
console.error(response);
throw new Error(`Error: ${response.statusText}`);
}
const result = await response.json();
res.status(200).json(result);
} catch (error) {
console.error("Error deleting cron job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
} else {
res.setHeader("Allow", ["POST"]);
res.status(405).end(`Method ${req.method} Not Allowed`);
}
}

src/pages/api/delete.ts
View File

@@ -0,0 +1,38 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
if (req.method === "POST") {
const { data } = req.body;
const headers = new Headers();
headers.set("content-type", "application/json");
headers.set("Authorization", `Bearer ${req.headers.authorization}`);
try {
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/delete-scrape-jobs`,
{
method: "POST",
headers,
body: JSON.stringify(data),
}
);
if (!response.ok) {
throw new Error(`Error: ${response.statusText}`);
}
const result = await response.json();
res.status(200).json(result);
} catch (error) {
console.error("Error submitting scrape job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
} else {
res.setHeader("Allow", ["POST"]);
res.status(405).end(`Method ${req.method} Not Allowed`);
}
}

src/pages/api/download.ts
View File

@@ -0,0 +1,37 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
if (req.method === "POST") {
const { data } = req.body;
const headers = new Headers();
headers.set("content-type", "application/json");
try {
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/download`,
{
method: "POST",
headers,
body: JSON.stringify(data),
}
);
if (!response.ok) {
throw new Error(`Error: ${response.statusText}`);
}
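// The backend responds with CSV, so forward it as plain text rather than JSON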
const csvText = await response.text();
res.status(200).send(csvText);
} catch (error) {
console.error("Error submitting scrape job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
} else {
res.setHeader("Allow", ["POST"]);
res.status(405).end(`Method ${req.method} Not Allowed`);
}
}

View File

@@ -0,0 +1,30 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
const headers = new Headers();
headers.set("content-type", "application/json");
headers.set("Authorization", `Bearer ${req.headers.authorization}`);
try {
const response = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/api/statistics/get-average-element-per-link`,
{
method: "GET",
headers,
}
);
if (!response.ok) {
throw new Error(`Error: ${response.statusText}`);
}
const csvText = await response.text();
res.status(200).send(csvText);
} catch (error) {
console.error("Error submitting scrape job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
}

View File

@@ -0,0 +1,30 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
const headers = new Headers();
headers.set("content-type", "application/json");
headers.set("Authorization", `Bearer ${req.headers.authorization}`);
try {
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/statistics/get-average-jobs-per-day`,
{
method: "GET",
headers,
}
);
if (!response.ok) {
throw new Error(`Error: ${response.statusText}`);
}
const csvText = await response.text();
res.status(200).send(csvText);
} catch (error) {
console.error("Error submitting scrape job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
}

src/pages/api/job/[id].ts
View File

@@ -0,0 +1,31 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
const { id } = req.query;
const headers = new Headers();
headers.set("content-type", "application/json");
headers.set("Authorization", `Bearer ${req.headers.authorization}`);
try {
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/job/${id}`,
{
headers,
}
);
if (!response.ok) {
throw new Error(`Error: ${response.statusText}`);
}
const result = await response.json();
res.status(200).json(result);
} catch (error) {
console.error("Error submitting scrape job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
}

src/pages/api/logs.ts
View File

@@ -0,0 +1,45 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
try {
const response = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/api/logs`,
{
method: "GET",
headers: {
Accept: "text/event-stream",
},
}
);
if (!response.ok || !response.body) {
throw new Error(`Error fetching logs: ${response.statusText}`);
}
res.writeHead(200, {
"Content-Type": "text/event-stream",
"Cache-Control": "no-cache, no-transform",
Connection: "keep-alive",
"Transfer-Encoding": "chunked",
});
let responseStream = response.body;
const reader = responseStream.getReader();
const decoder = new TextDecoder();
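// Wrap each upstream chunk in an SSE "data:" frame so the browser EventSource can parse it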
while (true) {
const { done, value } = await reader.read();
if (done) break;
const chunk = decoder.decode(value, { stream: true });
res.write(`data: ${chunk}\n\n`);
}
res.end();
} catch (error) {
console.error("Error streaming logs:", error);
res.status(500).json({ error: "Internal Server Error" });
}
}

src/pages/api/me.ts
View File

@@ -0,0 +1,30 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
try {
const headers = new Headers();
headers.set("Authorization", `Bearer ${req.headers.authorization}`);
headers.set("content-type", "application/json");
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/auth/users/me`,
{
method: "GET",
headers,
}
);
if (!response.ok) {
throw new Error(`Error: ${response.statusText}`);
}
const result = await response.json();
res.status(200).json(result);
} catch (error) {
console.error("Error submitting scrape job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
}

src/pages/api/retrieve.ts
View File

@@ -0,0 +1,38 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
if (req.method === "POST") {
const { data } = req.body;
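// Client requests wrap their payload in { data }; unwrap it before forwarding to the backend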
const headers = new Headers();
headers.set("content-type", "application/json");
headers.set("Authorization", `Bearer ${req.headers.authorization}`);
try {
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/retrieve-scrape-jobs`,
{
method: "POST",
headers,
body: JSON.stringify(data),
}
);
if (!response.ok) {
throw new Error(`Error: ${response.statusText}`);
}
const result = await response.json();
res.status(200).json(result);
} catch (error) {
console.error("Error submitting scrape job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
} else {
res.setHeader("Allow", ["POST"]);
res.status(405).end(`Method ${req.method} Not Allowed`);
}
}

View File

@@ -0,0 +1,39 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
if (req.method === "POST") {
const { data } = req.body;
console.log("Data", data);
const headers = new Headers();
headers.set("content-type", "application/json");
try {
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/schedule-cron-job`,
{
method: "POST",
headers,
body: JSON.stringify(data),
}
);
if (!response.ok) {
console.error(response);
throw new Error(`Error: ${response.statusText}`);
}
const result = await response.json();
res.status(200).json(result);
} catch (error) {
console.error("Error scheduling cron job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
} else {
res.setHeader("Allow", ["POST"]);
res.status(405).end(`Method ${req.method} Not Allowed`);
}
}

src/pages/api/signup.ts
View File

@@ -0,0 +1,37 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
if (req.method === "POST") {
const { data } = req.body;
const headers = new Headers();
headers.set("content-type", "application/json");
try {
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/auth/signup`,
{
method: "POST",
headers,
body: JSON.stringify(data),
}
);
if (!response.ok) {
throw new Error(`Error: ${response.statusText}`);
}
const result = await response.json();
res.status(200).json(result);
} catch (error) {
console.error("Error submitting scrape job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
} else {
res.setHeader("Allow", ["POST"]);
res.status(405).end(`Method ${req.method} Not Allowed`);
}
}

View File

@@ -0,0 +1,38 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
if (req.method === "POST") {
const { data } = req.body;
const headers = new Headers();
headers.set("Authorization", `Bearer ${req.headers.authorization}`);
headers.set("content-type", "application/json");
try {
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/submit-scrape-job`,
{
method: "POST",
headers,
body: JSON.stringify(data),
}
);
if (!response.ok) {
throw new Error(`Error: ${response.statusText}`);
}
const result = await response.json();
res.status(200).json(result);
} catch (error) {
console.error("Error submitting scrape job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
} else {
res.setHeader("Allow", ["POST"]);
res.status(405).end(`Method ${req.method} Not Allowed`);
}
}

src/pages/api/token.ts
View File

@@ -0,0 +1,39 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
if (req.method === "POST") {
const body = new URLSearchParams(req.body as string);
const username = body.get("username") || "";
const password = body.get("password") || "";
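// FastAPI's OAuth2 token endpoint expects form-encoded credentials, not JSON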
const headers = new Headers();
headers.set("content-type", "application/x-www-form-urlencoded");
try {
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/auth/token`,
{
method: "POST",
headers,
body: new URLSearchParams({ username, password }).toString(),
}
);
if (!response.ok) {
throw new Error(`Error: ${response.statusText}`);
}
const result = await response.json();
res.status(200).json(result);
} catch (error) {
console.error("Error submitting scrape job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
} else {
res.setHeader("Allow", ["POST"]);
res.status(405).end(`Method ${req.method} Not Allowed`);
}
}

src/pages/api/update.ts
View File

@@ -0,0 +1,48 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
if (req.method === "POST") {
const { data } = req.body;
const headers = new Headers();
headers.set("content-type", "application/json");
headers.set("Authorization", `Bearer ${req.headers.authorization}`);
try {
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/update`,
{
method: "POST",
headers,
body: JSON.stringify(data),
}
);
if (!response.ok) {
const errorDetails = await response.text();
if (response.status === 422) {
console.error(`422 Error: ${errorDetails}`);
}
throw new Error(
`Error updating job: ${response.statusText} - ${errorDetails}`
);
}
const result = await response.json();
res.status(200).json(result);
} catch (error) {
console.error("Error submitting scrape job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
} else {
res.setHeader("Allow", ["POST"]);
res.status(405).end(`Method ${req.method} Not Allowed`);
}
}

View File

@@ -86,7 +86,9 @@ const AI: React.FC = () => {
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({ messages: [jobMessage, ...messages, newMessage] }),
body: JSON.stringify({
data: { messages: [jobMessage, ...messages, newMessage] },
}),
});
const updatedMessages = [...messages, newMessage];

src/pages/cron-jobs.tsx
View File

@@ -0,0 +1,4 @@
import { CronJobs } from "../components/pages/cron-jobs";
import { getServerSideProps } from "../components/pages/cron-jobs/get-server-side-props";
export { getServerSideProps };
export default CronJobs;

Some files were not shown because too many files have changed in this diff.