35 Commits

Author SHA1 Message Date
Jayden Pyles
8703f706a1 feat: add in optional registration (#65)
* feat: add in optional registration

* fix: issue with registration var

* fix: issue with registration var

* fix: issue with registration var
2025-05-11 11:11:19 -05:00
Jayden Pyles
b40d378bbf fix: chat jobs not loading (#64)
2025-05-10 18:34:42 -05:00
Jayden Pyles
8123e1f149 docs: update README [skip ci] 2025-05-10 15:24:16 -05:00
Jayden Pyles
8cd30599fa feat: add in media downloading (#62)
* feat: add in media downloading

* fix: build issue
2025-05-10 15:14:54 -05:00
Jayden Pyles
a58212b214 feat: add authentication test 2025-05-10 14:22:06 -05:00
Jayden Pyles
a6ab6ec71d fix: vulns 2025-05-10 12:04:39 -05:00
Jayden Pyles
c5c9427af4 fix: vulns 2025-05-10 11:49:24 -05:00
Jayden Pyles
e8d80c1a77 fix: add cypress tests to CI [skip ci] 2025-05-10 11:29:20 -05:00
Jayden Pyles
ee8047ac78 fix: add cypress tests to CI [skip ci] 2025-05-10 10:46:05 -05:00
Jayden Pyles
e74c4f392c fix: add cypress tests to CI [skip ci] 2025-05-10 10:41:54 -05:00
Jayden Pyles
6b484952a3 fix: add cypress tests to CI [skip ci] 2025-05-10 10:35:31 -05:00
Jayden Pyles
2283808605 fix: add cypress tests to CI [skip ci] 2025-05-10 10:17:22 -05:00
Jayden Pyles
ee5ada70f7 fix: add cypress tests to CI [skip ci] 2025-05-10 10:04:55 -05:00
Jayden Pyles
56cc457e6e fix: add cypress tests to CI [skip ci] 2025-05-10 09:48:54 -05:00
Jayden Pyles
21a38181de fix: add cypress tests to CI [skip ci] 2025-05-10 09:44:43 -05:00
Jayden Pyles
3063bc0d53 fix: add cypress tests to CI [skip ci] 2025-05-10 09:41:43 -05:00
Jayden Pyles
f42e7ed531 fix: add cypress tests to CI [skip ci] 2025-05-10 09:39:44 -05:00
Jayden Pyles
c197f2becd fix: add cypress tests to CI [skip ci] 2025-05-10 09:38:11 -05:00
Jayden Pyles
a534129702 fix: swap to using chrome driver manager [skip ci] 2025-05-10 09:24:48 -05:00
Jayden Pyles
455ed049c9 fix: allow workflow dispatch [skip ci] 2025-05-10 09:16:41 -05:00
Jayden Pyles
de4ccfbf3a fix: only allow cron jobs for logged-in users
2025-04-24 22:14:00 -05:00
Jayden Pyles
3475d66995 Add cron jobs (#60)
* feat: finish up cron jobs

* feat: clean up
2025-04-24 22:03:28 -05:00
Jayden Pyles
186b4a0231 Merge branch 'master' of github.com:jaypyles/Scraperr 2025-04-24 22:02:06 -05:00
Jayden Pyles
0af0ebf5b5 feat: fix authentication 2025-04-24 18:24:19 -05:00
c3Nz
ef35db00d7 fix: Python handler Fixed (#51)
* Fix: Python handler Fixed

* fix: Python handler Fixed without comment
2024-11-26 10:05:43 -06:00
Jayden Pyles
d65e600ec3 Merge branch 'master' of github.com:jaypyles/Scraperr
2024-11-21 18:13:18 -06:00
Jayden Pyles
6fe145f649 chore: remove unneeded files [skip ci] 2024-11-21 18:12:46 -06:00
Jayden Pyles
563ca2245e Refactor: Drop MongoDB (#48)
* feat: replace mongodb with sqlite

* feat: update docker compose to drop mongo

* chore: drop logs

* chore: cleanup

* fix: unit tests

* fix: workflow

* fix: workflow run
2024-11-21 18:11:46 -06:00
Jayden Pyles
d54fdbd405 fix: workflow run [skip ci] 2024-11-21 18:11:31 -06:00
Jayden Pyles
7169755cd2 fix: workflow 2024-11-21 18:03:40 -06:00
Jayden Pyles
15b56b5704 fix: unit tests 2024-11-21 18:00:57 -06:00
Jayden Pyles
bf6b740005 chore: cleanup 2024-11-21 17:43:20 -06:00
Jayden Pyles
c339e75e06 chore: drop logs 2024-11-21 17:36:47 -06:00
Jayden Pyles
b6ed40e6cf feat: update docker compose to drop mongo 2024-11-21 17:36:22 -06:00
Jayden Pyles
3085f9d31a feat: replace mongodb with sqlite 2024-11-20 21:32:27 -06:00
76 changed files with 5523 additions and 14982 deletions

View File

@@ -0,0 +1,58 @@
name: Run Cypress Tests
description: Run Cypress tests
runs:
using: "composite"
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Setup Node
uses: actions/setup-node@v4
with:
node-version: 22
- name: Setup Docker project
shell: bash
run: make build up-dev
- name: Install dependencies
shell: bash
run: npm install
- name: Wait for frontend to be ready
shell: bash
run: |
for i in {1..10}; do
curl -s http://127.0.0.1:80 && echo "Frontend is ready" && exit 0
echo "Waiting for frontend to be ready... attempt $i"
sleep 1
done
echo "Frontend failed to be ready after 10 retries"
exit 1
- name: Wait for backend to be ready
shell: bash
run: |
for i in {1..10}; do
curl -s http://127.0.0.1:8000 && echo "Backend is ready" && exit 0
echo "Waiting for backend to be ready... attempt $i"
sleep 1
done
echo "Backend failed to be ready after 10 retries"
exit 1
- name: Show backend logs on failure
if: failure()
shell: bash
run: |
echo "== Docker Containers =="
docker ps -a
echo "== Backend Logs =="
docker logs $(docker ps -a --filter "name=scraperr_api" --format "{{.Names}}") || echo "Could not get backend logs"
- name: Run Cypress tests
shell: bash
run: npm run cy:run

View File

@@ -4,10 +4,11 @@ on:
workflows: ["Unit Tests"]
types:
- completed
workflow_dispatch:
jobs:
build:
if: ${{ github.event.workflow_run.conclusion == 'success' && github.ref == 'refs/heads/master' && github.event_name != 'pull_request' }}
if: ${{ github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.head_branch == 'master' }}
runs-on: ubuntu-latest
steps:
- name: Checkout
@@ -37,3 +38,20 @@ jobs:
file: ./docker/api/Dockerfile
push: true
tags: ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest
success-message:
runs-on: ubuntu-latest
needs:
- build
steps:
- name: Send Discord Message
uses: jaypyles/discord-webhook-action@v1.0.0
with:
webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }}
content: "Scraperr Successfully Built Docker Images"
username: "Scraperr CI"
embed-title: "✅ Deployment Status"
embed-description: "Scraperr successfully built docker images."
embed-color: 3066993 # Green
embed-footer-text: "Scraperr CI"
embed-timestamp: ${{ github.event.head_commit.timestamp }}

View File

@@ -4,9 +4,11 @@ on:
push:
branches:
- master
pull_request:
branches:
- master
types: [opened, synchronize, reopened]
workflow_dispatch:
jobs:
unit-tests:
@@ -15,6 +17,9 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
- name: Set env
run: echo "ENV=test" >> $GITHUB_ENV
- name: Install pdm
run: pip install pdm
@@ -23,3 +28,27 @@ jobs:
- name: Run tests
run: PYTHONPATH=. pdm run pytest api/backend/tests
cypress-tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/run-cypress-tests
success-message:
runs-on: ubuntu-latest
needs:
- unit-tests
- cypress-tests
steps:
- name: Send Discord Message
uses: jaypyles/discord-webhook-action@v1.0.0
with:
webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }}
content: "Scraperr Successfully Passed Tests"
username: "Scraperr CI"
embed-title: "✅ Deployment Status"
embed-description: "Scraperr successfully passed all tests."
embed-color: 3066993 # Green
embed-footer-text: "Scraperr CI"
embed-timestamp: ${{ github.event.head_commit.timestamp }}

.gitignore (vendored, 2 lines changed)
View File

@@ -187,3 +187,5 @@ cython_debug/
postgres_data
.vscode
ollama
data
media

.python-version (new file, 1 line)
View File

@@ -0,0 +1 @@
3.10.12

View File

@@ -24,6 +24,7 @@ View the [docs](https://scraperr-docs.pages.dev) for a quickstart guide and more
- Scrape all pages within same domain
- Add custom json headers to send in requests to URLs
- Display results of scraped data
- Download media found on the page (images, videos, etc.)
![main_page](https://github.com/jaypyles/www-scrape/blob/master/docs/main_page.png)

View File

@@ -1,3 +0,0 @@
github_repo: https://github.com/jaypyles/webapp-template.git
deploy_path: /home/admin/site-test6
deploy_command: make pull up-prd

View File

@@ -1,10 +0,0 @@
- name: Deploy site
hosts: all
become: true
vars_files:
- ./config.yaml
tasks:
- name: Deploy
command: "{{deploy_command}}"
args:
chdir: "{{deploy_path}}"

View File

@@ -1,6 +0,0 @@
all:
hosts:
host1:
ansible_host: 192.168.0.1
ansible_user: admin
ansible_ssh_private_key_file: private_key.pem

View File

@@ -1,54 +0,0 @@
- name: Install Docker and run make pull up
hosts: all
become: true
vars_files:
- ./config.yaml
tasks:
- name: Update apt cache
apt:
update_cache: yes
- name: Install required packages
apt:
name:
- apt-transport-https
- ca-certificates
- curl
- gnupg-agent
- software-properties-common
- rsync
- make
state: present
- name: Add Dockers official GPG key
apt_key:
url: https://download.docker.com/linux/ubuntu/gpg
state: present
- name: Add Docker APT repository
apt_repository:
repo: deb [arch=amd64] https://download.docker.com/linux/ubuntu focal stable
state: present
- name: Update apt cache again after adding Docker repo
apt:
update_cache: yes
- name: Install Docker
apt:
name: docker-ce
state: present
- name: Start and enable Docker service
systemd:
name: docker
enabled: yes
state: started
- name: Install Docker Compose
apt:
name: docker-compose-plugin
state: present
- name: Verify Docker is installed
command: docker --version
register: docker_version
- name: Display Docker version
debug:
msg: "Docker version: {{ docker_version.stdout }}"
- name: Clone repo
ansible.builtin.git:
repo: "{{github_repo}}"
dest: "{{deploy_path}}"

View File

@@ -67,4 +67,4 @@ async def ai(c: AI):
@ai_router.get("/ai/check")
async def check():
return JSONResponse(content=bool(open_ai_key or llama_model))
return JSONResponse(content={"ai_enabled": bool(open_ai_key or llama_model)})

View File

@@ -1,9 +1,13 @@
# STL
import os
import logging
import apscheduler # type: ignore
# PDM
from fastapi import FastAPI
import apscheduler.schedulers
import apscheduler.schedulers.background
from fastapi import FastAPI, Request, status
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
# LOCAL
@@ -13,6 +17,11 @@ from api.backend.utils import get_log_level
from api.backend.routers.job_router import job_router
from api.backend.routers.log_router import log_router
from api.backend.routers.stats_router import stats_router
from api.backend.database.startup import init_database
from fastapi.responses import JSONResponse
from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler
from api.backend.scheduler import scheduler
log_level = os.getenv("LOG_LEVEL")
LOG_LEVEL = get_log_level(log_level)
@@ -41,3 +50,28 @@ app.include_router(ai_router)
app.include_router(job_router)
app.include_router(log_router)
app.include_router(stats_router)
@app.on_event("startup")
async def startup_event():
start_cron_scheduler(scheduler)
scheduler.start()
if os.getenv("ENV") != "test":
init_database()
LOG.info("Starting up...")
@app.on_event("shutdown")
def shutdown_scheduler():
scheduler.shutdown(wait=False) # Set wait=False to not block shutdown
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
exc_str = f"{exc}".replace("\n", " ").replace("   ", " ")
logging.error(f"{request}: {exc_str}")
content = {"status_code": 10422, "message": exc_str, "data": None}
return JSONResponse(
content=content, status_code=status.HTTP_422_UNPROCESSABLE_ENTITY
)

View File

@@ -1,5 +1,6 @@
# STL
from datetime import timedelta
import os
# PDM
from fastapi import Depends, APIRouter, HTTPException, status
@@ -7,7 +8,6 @@ from fastapi.security import OAuth2PasswordRequestForm
# LOCAL
from api.backend.schemas import User, Token, UserCreate
from api.backend.database import get_user_collection
from api.backend.auth.auth_utils import (
ACCESS_TOKEN_EXPIRE_MINUTES,
get_current_user,
@@ -15,9 +15,14 @@ from api.backend.auth.auth_utils import (
get_password_hash,
create_access_token,
)
import logging
from api.backend.database.common import update
auth_router = APIRouter()
LOG = logging.getLogger("auth_router")
@auth_router.post("/auth/token", response_model=Token)
async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
@@ -43,15 +48,22 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(
@auth_router.post("/auth/signup", response_model=User)
async def create_user(user: UserCreate):
users_collection = get_user_collection()
hashed_password = get_password_hash(user.password)
user_dict = user.model_dump()
user_dict["hashed_password"] = hashed_password
del user_dict["password"]
_ = await users_collection.insert_one(user_dict)
query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)"
_ = update(query, (user_dict["email"], hashed_password, user_dict["full_name"]))
return user_dict
@auth_router.get("/auth/users/me", response_model=User)
async def read_users_me(current_user: User = Depends(get_current_user)):
return current_user
@auth_router.get("/auth/check")
async def check_auth():
return {"registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True"}

View File

@@ -1,7 +1,5 @@
# STL
import os
from gc import disable
from queue import Empty
from typing import Any, Optional
from datetime import datetime, timedelta
import logging
@@ -15,7 +13,8 @@ from fastapi.security import OAuth2PasswordBearer
# LOCAL
from api.backend.schemas import User, UserInDB, TokenData
from api.backend.database import get_user_collection
from api.backend.database.common import query
LOG = logging.getLogger(__name__)
@@ -40,8 +39,8 @@ def get_password_hash(password: str):
async def get_user(email: str):
user_collection = get_user_collection()
user = await user_collection.find_one({"email": email})
user_query = "SELECT * FROM users WHERE email = ?"
user = query(user_query, (email,))[0]
if not user:
return
@@ -77,27 +76,42 @@ def create_access_token(
async def get_current_user(token: str = Depends(oauth2_scheme)):
LOG.info(f"Getting current user with token: {token}")
LOG.debug(f"Getting current user with token: {token}")
if not token:
LOG.debug("No token provided")
return EMPTY_USER
if len(token.split(".")) != 3:
LOG.error(f"Malformed token: {token}")
return EMPTY_USER
try:
LOG.debug(
f"Decoding token: {token} with secret key: {SECRET_KEY} and algorithm: {ALGORITHM}"
)
if token.startswith("Bearer "):
token = token.split(" ")[1]
payload: Optional[dict[str, Any]] = jwt.decode(
token, SECRET_KEY, algorithms=[ALGORITHM]
)
if not payload:
LOG.error("No payload found in token")
return EMPTY_USER
email = payload.get("sub")
if email is None:
LOG.error("No email found in payload")
return EMPTY_USER
token_data = TokenData(email=email)
except JWTError:
except JWTError as e:
LOG.error(f"JWTError occurred: {e}")
return EMPTY_USER
except Exception as e:
@@ -105,7 +119,6 @@ async def get_current_user(token: str = Depends(oauth2_scheme)):
return EMPTY_USER
user = await get_user(email=token_data.email)
if user is None:
return EMPTY_USER

api/backend/constants.py (new file, 1 line)
View File

@@ -0,0 +1 @@
DATABASE_PATH = "data/database.db"

View File

@@ -1,23 +0,0 @@
# STL
import os
from typing import Any
# PDM
from dotenv import load_dotenv
from motor.motor_asyncio import AsyncIOMotorClient
_ = load_dotenv()
MONGODB_URI = os.getenv("MONGODB_URI")
def get_user_collection():
client: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(MONGODB_URI)
db = client["scrape"]
return db["users"]
def get_job_collection():
client: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(MONGODB_URI)
db = client["scrape"]
return db["jobs"]

View File

@@ -0,0 +1,3 @@
from .common import insert, QUERIES, update
__all__ = ["insert", "QUERIES", "update"]

View File

@@ -0,0 +1,92 @@
import sqlite3
from typing import Any, Optional
from api.backend.constants import DATABASE_PATH
from api.backend.utils import format_json, format_sql_row_to_python
from api.backend.database.schema import INIT_QUERY
from api.backend.database.queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
import logging
LOG = logging.getLogger(__name__)
def connect():
connection = sqlite3.connect(DATABASE_PATH)
connection.set_trace_callback(print)
cursor = connection.cursor()
return cursor
def insert(query: str, values: tuple[Any, ...]):
connection = sqlite3.connect(DATABASE_PATH)
cursor = connection.cursor()
copy = list(values)
format_json(copy)
try:
_ = cursor.execute(query, copy)
connection.commit()
except sqlite3.Error as e:
LOG.error(f"An error occurred: {e}")
finally:
cursor.close()
connection.close()
def query(query: str, values: Optional[tuple[Any, ...]] = None):
connection = sqlite3.connect(DATABASE_PATH)
connection.row_factory = sqlite3.Row
cursor = connection.cursor()
rows = []
try:
if values:
_ = cursor.execute(query, values)
else:
_ = cursor.execute(query)
rows = cursor.fetchall()
finally:
cursor.close()
connection.close()
formatted_rows: list[dict[str, Any]] = []
for row in rows:
row = dict(row)
formatted_row = format_sql_row_to_python(row)
formatted_rows.append(formatted_row)
return formatted_rows
def update(query: str, values: Optional[tuple[Any, ...]] = None):
connection = sqlite3.connect(DATABASE_PATH)
cursor = connection.cursor()
copy = None
if values:
copy = list(values)
format_json(copy)
try:
if copy:
res = cursor.execute(query, copy)
else:
res = cursor.execute(query)
connection.commit()
return res.rowcount
except sqlite3.Error as e:
LOG.error(f"An error occurred: {e}")
finally:
cursor.close()
connection.close()
return 0
QUERIES = {
"init": INIT_QUERY,
"insert_job": JOB_INSERT_QUERY,
"delete_job": DELETE_JOB_QUERY,
}
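
Together, insert, query, and update replace the Motor collection calls with parameterized sqlite3 statements: dict and list parameters are JSON-encoded on the way in (format_json) and JSON columns are decoded on the way out (format_sql_row_to_python). A minimal usage sketch, with illustrative table values:

from api.backend.database.common import insert, query, update

# insert() JSON-encodes dict/list values before executing the statement.
insert(
    "INSERT INTO jobs (id, url, elements, user, time_created, result, status, chat, job_options) "
    "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
    ("abc123", "https://example.com", [{"name": "body", "xpath": "//body"}],
     "user@example.com", "2025-05-10 12:00:00", [], "Queued", None, {}),
)

# query() returns rows as dicts, with JSON columns parsed back to Python objects.
rows = query("SELECT * FROM jobs WHERE id = ?", ("abc123",))

# update() commits and returns the affected row count (0 if the statement failed).
count = update("UPDATE jobs SET status = ? WHERE id = ?", ("Completed", "abc123"))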

View File

@@ -0,0 +1,3 @@
from .queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
__all__ = ["JOB_INSERT_QUERY", "DELETE_JOB_QUERY"]

View File

@@ -0,0 +1,9 @@
JOB_INSERT_QUERY = """
INSERT INTO jobs
(id, url, elements, user, time_created, result, status, chat, job_options)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
DELETE_JOB_QUERY = """
DELETE FROM jobs WHERE id IN ()
"""

View File

@@ -0,0 +1,3 @@
from .schema import INIT_QUERY
__all__ = ["INIT_QUERY"]

View File

@@ -0,0 +1,30 @@
INIT_QUERY = """
CREATE TABLE IF NOT EXISTS jobs (
id STRING PRIMARY KEY NOT NULL,
url STRING NOT NULL,
elements JSON NOT NULL,
user STRING,
time_created DATETIME NOT NULL,
result JSON NOT NULL,
status STRING NOT NULL,
chat JSON,
job_options JSON
);
CREATE TABLE IF NOT EXISTS users (
email STRING PRIMARY KEY NOT NULL,
hashed_password STRING NOT NULL,
full_name STRING,
disabled BOOLEAN
);
CREATE TABLE IF NOT EXISTS cron_jobs (
id STRING PRIMARY KEY NOT NULL,
user_email STRING NOT NULL,
job_id STRING NOT NULL,
cron_expression STRING NOT NULL,
time_created DATETIME NOT NULL,
time_updated DATETIME NOT NULL,
FOREIGN KEY (job_id) REFERENCES jobs(id)
);
"""

View File

@@ -0,0 +1,43 @@
import os
from api.backend.database.common import connect, QUERIES
import logging
from api.backend.auth.auth_utils import get_password_hash
LOG = logging.getLogger(__name__)
def init_database():
cursor = connect()
for query in QUERIES["init"].strip().split(";"):
if query.strip():
LOG.info(f"Executing query: {query}")
_ = cursor.execute(query)
if os.environ.get("REGISTRATION_ENABLED", "True") == "False":
default_user_email = os.environ.get("DEFAULT_USER_EMAIL")
default_user_password = os.environ.get("DEFAULT_USER_PASSWORD")
default_user_full_name = os.environ.get("DEFAULT_USER_FULL_NAME")
if (
not default_user_email
or not default_user_password
or not default_user_full_name
):
LOG.error(
"DEFAULT_USER_EMAIL, DEFAULT_USER_PASSWORD, or DEFAULT_USER_FULL_NAME is not set!"
)
exit(1)
query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)"
_ = cursor.execute(
query,
(
default_user_email,
get_password_hash(default_user_password),
default_user_full_name,
),
)
cursor.close()
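
init_database also hosts the optional-registration feature from #65: when REGISTRATION_ENABLED is "False", a default account must be supplied through environment variables, otherwise startup exits. A sketch of driving it directly (values are illustrative):

import os
from api.backend.database.startup import init_database

# Required only when registration is disabled; startup exits if any is missing.
os.environ["REGISTRATION_ENABLED"] = "False"
os.environ["DEFAULT_USER_EMAIL"] = "admin@example.com"
os.environ["DEFAULT_USER_PASSWORD"] = "change-me"
os.environ["DEFAULT_USER_FULL_NAME"] = "Admin"

init_database()  # creates the tables from INIT_QUERY, then seeds the default user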

View File

@@ -1,5 +1,4 @@
from .job import (
query,
insert,
update_job,
delete_jobs,
@@ -9,7 +8,6 @@ from .job import (
)
__all__ = [
"query",
"insert",
"update_job",
"delete_jobs",

View File

@@ -0,0 +1,100 @@
import datetime
from typing import Any
import uuid
from api.backend.database.common import insert, query
from api.backend.models import CronJob
from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
from apscheduler.triggers.cron import CronTrigger # type: ignore
from api.backend.job import insert as insert_job
import logging
LOG = logging.getLogger("Cron Scheduler")
def insert_cron_job(cron_job: CronJob):
query = """
INSERT INTO cron_jobs (id, user_email, job_id, cron_expression, time_created, time_updated)
VALUES (?, ?, ?, ?, ?, ?)
"""
values = (
cron_job.id,
cron_job.user_email,
cron_job.job_id,
cron_job.cron_expression,
cron_job.time_created,
cron_job.time_updated,
)
insert(query, values)
return True
def delete_cron_job(id: str, user_email: str):
query = """
DELETE FROM cron_jobs
WHERE id = ? AND user_email = ?
"""
values = (id, user_email)
insert(query, values)
return True
def get_cron_jobs(user_email: str):
cron_jobs = query("SELECT * FROM cron_jobs WHERE user_email = ?", (user_email,))
return cron_jobs
def get_all_cron_jobs():
cron_jobs = query("SELECT * FROM cron_jobs")
return cron_jobs
def insert_job_from_cron_job(job: dict[str, Any]):
insert_job(
{
**job,
"id": uuid.uuid4().hex,
"status": "Queued",
"result": "",
"chat": None,
"time_created": datetime.datetime.now(),
"time_updated": datetime.datetime.now(),
}
)
def get_cron_job_trigger(cron_expression: str):
expression_parts = cron_expression.split()
if len(expression_parts) != 5:
print(f"Invalid cron expression: {cron_expression}")
return None
minute, hour, day, month, day_of_week = expression_parts
return CronTrigger(
minute=minute, hour=hour, day=day, month=month, day_of_week=day_of_week
)
def start_cron_scheduler(scheduler: BackgroundScheduler):
cron_jobs = get_all_cron_jobs()
LOG.info(f"Cron jobs: {cron_jobs}")
for job in cron_jobs:
queried_job = query("SELECT * FROM jobs WHERE id = ?", (job["job_id"],))
LOG.info(f"Adding job: {queried_job}")
scheduler.add_job(
insert_job_from_cron_job,
get_cron_job_trigger(job["cron_expression"]),
id=job["id"],
args=[queried_job[0]],
)
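
start_cron_scheduler restores persisted schedules at boot: each cron_jobs row is turned into a CronTrigger and re-registered with the shared BackgroundScheduler. A hedged sketch of the same wiring in isolation (job_row is an illustrative stand-in for a row from the jobs table):

from apscheduler.schedulers.background import BackgroundScheduler
from api.backend.job.cron_scheduling.cron_scheduling import (
    get_cron_job_trigger,
    insert_job_from_cron_job,
)

scheduler = BackgroundScheduler()

# Five-field cron expression: every day at 02:30.
trigger = get_cron_job_trigger("30 2 * * *")

# Stand-in for the result of `SELECT * FROM jobs WHERE id = ?`; only the columns
# that insert_job_from_cron_job forwards to the job insert are shown.
job_row = {
    "url": "https://example.com",
    "elements": [{"name": "body", "xpath": "//body"}],
    "user": "user@example.com",
    "job_options": {},
}

scheduler.add_job(insert_job_from_cron_job, trigger, id="example-cron", args=[job_row])
scheduler.start()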

View File

@@ -1,161 +1,97 @@
# STL
import logging
from typing import Any, Optional
# PDM
from pymongo import DESCENDING
from typing import Any
# LOCAL
from api.backend.database import get_job_collection
from api.backend.job.models.job_options import FetchOptions
from api.backend.utils import format_list_for_query
from api.backend.database.common import (
insert as common_insert,
query as common_query,
QUERIES,
update as common_update,
)
LOG = logging.getLogger(__name__)
async def insert(item: dict[str, Any]) -> None:
collection = get_job_collection()
i = await collection.insert_one(item)
LOG.info(f"Inserted item: {i}")
def insert(item: dict[str, Any]) -> None:
common_insert(
QUERIES["insert_job"],
(
item["id"],
item["url"],
item["elements"],
item["user"],
item["time_created"],
item["result"],
item["status"],
item["chat"],
item["job_options"],
),
)
LOG.info(f"Inserted item: {item}")
async def get_queued_job():
collection = get_job_collection()
return await collection.find_one(
{"status": "Queued"}, sort=[("created_at", DESCENDING)]
query = (
"SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
)
async def query(
filter: dict[str, Any], fetch_options: Optional[FetchOptions] = None
) -> list[dict[str, Any]]:
collection = get_job_collection()
cursor = collection.find(filter)
results: list[dict[str, Any]] = []
async for document in cursor:
del document["_id"]
if fetch_options and not fetch_options.chat and document.get("chat"):
del document["chat"]
results.append(document)
return results
res = common_query(query)
LOG.info(f"Got queued job: {res}")
return res[0] if res else None
async def update_job(ids: list[str], field: str, value: Any):
collection = get_job_collection()
for id in ids:
_ = await collection.update_one(
{"id": id},
{"$set": {field: value}},
)
query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
res = common_update(query, tuple([value] + ids))
LOG.info(f"Updated job: {res}")
async def delete_jobs(jobs: list[str]):
collection = get_job_collection()
result = await collection.delete_many({"id": {"$in": jobs}})
LOG.info(f"{result.deleted_count} documents deleted")
if not jobs:
LOG.info("No jobs to delete.")
return False
return True if result.deleted_count > 0 else False
query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}"
res = common_update(query, tuple(jobs))
return res > 0
async def average_elements_per_link(user: str):
collection = get_job_collection()
pipeline = [
{"$match": {"status": "Completed", "user": user}},
{
"$addFields": {
"time_created_date": {
"$cond": {
"if": {"$eq": [{"$type": "$time_created"}, "date"]},
"then": "$time_created",
"else": {
"$convert": {
"input": "$time_created",
"to": "date",
"onError": None,
"onNull": None,
}
},
}
}
}
},
{
"$project": {
"date": {
"$dateToString": {
"format": "%Y-%m-%d",
"date": "$time_created_date",
}
},
"num_elements": {"$size": "$elements"},
}
},
{
"$group": {
"_id": "$date",
"average_elements": {"$avg": "$num_elements"},
"count": {"$sum": 1},
}
},
{"$sort": {"_id": 1}},
]
cursor = collection.aggregate(pipeline)
results: list[dict[str, Any]] = []
async for document in cursor:
results.append(
{
"date": document["_id"],
"average_elements": document["average_elements"],
"count": document["count"],
}
)
job_query = """
SELECT
DATE(time_created) AS date,
AVG(json_array_length(elements)) AS average_elements,
COUNT(*) AS count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = common_query(job_query, (user,))
return results
async def get_jobs_per_day(user: str):
collection = get_job_collection()
pipeline = [
{"$match": {"status": "Completed", "user": user}},
{
"$addFields": {
"time_created_date": {
"$cond": {
"if": {"$eq": [{"$type": "$time_created"}, "date"]},
"then": "$time_created",
"else": {
"$convert": {
"input": "$time_created",
"to": "date",
"onError": None,
"onNull": None,
}
},
}
}
}
},
{
"$project": {
"date": {
"$dateToString": {
"format": "%Y-%m-%d",
"date": "$time_created_date",
}
}
}
},
{"$group": {"_id": "$date", "job_count": {"$sum": 1}}},
{"$sort": {"_id": 1}},
]
cursor = collection.aggregate(pipeline)
results: list[dict[str, Any]] = []
async for document in cursor:
results.append({"date": document["_id"], "job_count": document["job_count"]})
job_query = """
SELECT
DATE(time_created) AS date,
COUNT(*) AS job_count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = common_query(job_query, (user,))
return results

View File

@@ -0,0 +1,3 @@
from .job_options import JobOptions
__all__ = ["JobOptions"]

View File

@@ -12,3 +12,4 @@ class JobOptions(BaseModel):
custom_headers: dict[str, Any] = {}
proxies: list[str] = []
site_map: Optional[SiteMap] = None
collect_media: bool = False

View File

@@ -0,0 +1,91 @@
import os
import requests
from pathlib import Path
from selenium.webdriver.common.by import By
from seleniumwire import webdriver
from urllib.parse import urlparse
from api.backend.utils import LOG
def collect_media(driver: webdriver.Chrome):
media_types = {
"images": "img",
"videos": "video",
"audio": "audio",
"pdfs": 'a[href$=".pdf"]',
"documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]',
"presentations": 'a[href$=".ppt"], a[href$=".pptx"]',
"spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]',
}
base_dir = Path("media")
base_dir.mkdir(exist_ok=True)
media_urls = {}
for media_type, selector in media_types.items():
elements = driver.find_elements(By.CSS_SELECTOR, selector)
urls: list[dict[str, str]] = []
media_dir = base_dir / media_type
media_dir.mkdir(exist_ok=True)
for element in elements:
if media_type == "images":
url = element.get_attribute("src")
elif media_type == "videos":
url = element.get_attribute("src") or element.get_attribute("data-src")
else:
url = element.get_attribute("href")
if url and url.startswith(("http://", "https://")):
try:
filename = os.path.basename(urlparse(url).path)
if not filename:
filename = f"{media_type}_{len(urls)}"
if media_type == "images":
filename += ".jpg"
elif media_type == "videos":
filename += ".mp4"
elif media_type == "audio":
filename += ".mp3"
elif media_type == "pdfs":
filename += ".pdf"
elif media_type == "documents":
filename += ".doc"
elif media_type == "presentations":
filename += ".ppt"
elif media_type == "spreadsheets":
filename += ".xls"
response = requests.get(url, stream=True)
response.raise_for_status()
# Save the file
file_path = media_dir / filename
with open(file_path, "wb") as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
urls.append({"url": url, "local_path": str(file_path)})
LOG.info(f"Downloaded {filename} to {file_path}")
except Exception as e:
LOG.error(f"Error downloading {url}: {str(e)}")
continue
media_urls[media_type] = urls
with open(base_dir / "download_summary.txt", "w") as f:
for media_type, downloads in media_urls.items():
if downloads:
f.write(f"\n=== {media_type.upper()} ===\n")
for download in downloads:
f.write(f"URL: {download['url']}\n")
f.write(f"Saved to: {download['local_path']}\n\n")
return media_urls
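
collect_media returns a map from media type to the entries it managed to download, and writes the same information to media/download_summary.txt. A sketch of consuming it (assumes a seleniumwire Chrome driver such as the one create_driver() builds in the scraper module):

media = collect_media(driver)  # driver: seleniumwire webdriver.Chrome

# Illustrative shape of the result:
# {"images": [{"url": "https://example.com/logo.png",
#              "local_path": "media/images/logo.png"}],
#  "videos": [], "audio": [], ...}
for media_type, downloads in media.items():
    for item in downloads:
        print(f"{media_type}: {item['url']} -> {item['local_path']}")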

View File

@@ -1,13 +1,19 @@
import time
from typing import cast
from selenium import webdriver
from seleniumwire import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from api.backend.utils import LOG
def scrape_content(driver: webdriver.Chrome, pages: set[tuple[str, str]]):
from api.backend.job.scraping.collect_media import collect_media as collect_media_utils
def scrape_content(
driver: webdriver.Chrome, pages: set[tuple[str, str]], collect_media: bool
):
_ = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "body"))
)
@@ -27,4 +33,9 @@ def scrape_content(driver: webdriver.Chrome, pages: set[tuple[str, str]]):
last_height = new_height
pages.add((driver.page_source, driver.current_url))
if collect_media:
LOG.info("Collecting media")
collect_media_utils(driver)
return driver.page_source

View File

@@ -77,7 +77,6 @@ async def handle_site_mapping(
pages: set[tuple[str, str]],
):
site_map = SiteMap(**site_map_dict)
LOG.info(f"Handling site map: {site_map}")
for action in site_map.actions:
action_handler = ACTION_MAP[action.type]

View File

@@ -9,9 +9,6 @@ from api.backend.job.models.job_options import JobOptions
import pydantic
class Element(pydantic.BaseModel):
name: str
xpath: str
@@ -60,3 +57,17 @@ class Job(pydantic.BaseModel):
job_options: JobOptions
status: str = "Queued"
chat: Optional[str] = None
class CronJob(pydantic.BaseModel):
id: Optional[str] = None
user_email: str
job_id: str
cron_expression: str
time_created: Optional[Union[datetime, str]] = None
time_updated: Optional[Union[datetime, str]] = None
class DeleteCronJob(pydantic.BaseModel):
id: str
user_email: str

View File

@@ -1,4 +1,5 @@
# STL
import datetime
import uuid
import traceback
from io import StringIO
@@ -10,20 +11,34 @@ import random
from fastapi import Depends, APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse, StreamingResponse
from api.backend.scheduler import scheduler
from apscheduler.triggers.cron import CronTrigger # type: ignore
# LOCAL
from api.backend.job import query, insert, update_job, delete_jobs
from api.backend.job import insert, update_job, delete_jobs
from api.backend.models import (
DeleteCronJob,
UpdateJobs,
DownloadJob,
DeleteScrapeJobs,
Job,
CronJob,
)
from api.backend.schemas import User
from api.backend.auth.auth_utils import get_current_user
from api.backend.utils import clean_text
from api.backend.utils import clean_text, format_list_for_query
from api.backend.job.models.job_options import FetchOptions
from api.backend.database.common import query
from api.backend.job.cron_scheduling.cron_scheduling import (
delete_cron_job,
get_cron_job_trigger,
insert_cron_job,
get_cron_jobs,
insert_job_from_cron_job,
)
LOG = logging.getLogger(__name__)
job_router = APIRouter()
@@ -42,10 +57,11 @@ async def submit_scrape_job(job: Job):
job.id = uuid.uuid4().hex
job_dict = job.model_dump()
await insert(job_dict)
insert(job_dict)
return JSONResponse(content={"id": job.id})
except Exception as e:
LOG.error(f"Exception occurred: {traceback.format_exc()}")
return JSONResponse(content={"error": str(e)}, status_code=500)
@@ -54,8 +70,11 @@ async def retrieve_scrape_jobs(
fetch_options: FetchOptions, user: User = Depends(get_current_user)
):
LOG.info(f"Retrieving jobs for account: {user.email}")
ATTRIBUTES = "chat" if fetch_options.chat else "*"
try:
results = await query({"user": user.email}, fetch_options=fetch_options)
job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?"
results = query(job_query, (user.email,))
return JSONResponse(content=jsonable_encoder(results[::-1]))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
@@ -67,8 +86,8 @@ async def job(id: str, user: User = Depends(get_current_user)):
LOG.info(f"Retrieving jobs for account: {user.email}")
try:
filter = {"user": user.email, "id": id}
results = await query(filter)
job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?"
results = query(job_query, (user.email, id))
return JSONResponse(content=jsonable_encoder(results))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
@@ -80,7 +99,10 @@ async def download(download_job: DownloadJob):
LOG.info(f"Downloading job with ids: {download_job.ids}")
try:
results = await query({"id": {"$in": download_job.ids}})
job_query = (
f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}"
)
results = query(job_query, tuple(download_job.ids))
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
@@ -131,3 +153,47 @@ async def delete(delete_scrape_jobs: DeleteScrapeJobs):
if result
else JSONResponse({"error": "Jobs not deleted."})
)
@job_router.post("/schedule-cron-job")
async def schedule_cron_job(cron_job: CronJob):
if not cron_job.id:
cron_job.id = uuid.uuid4().hex
if not cron_job.time_created:
cron_job.time_created = datetime.datetime.now()
if not cron_job.time_updated:
cron_job.time_updated = datetime.datetime.now()
insert_cron_job(cron_job)
queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,))
scheduler.add_job(
insert_job_from_cron_job,
get_cron_job_trigger(cron_job.cron_expression),
id=cron_job.id,
args=[queried_job[0]],
)
return JSONResponse(content={"message": "Cron job scheduled successfully."})
@job_router.post("/delete-cron-job")
async def delete_cron_job_request(request: DeleteCronJob):
if not request.id:
return JSONResponse(
content={"error": "Cron job id is required."}, status_code=400
)
delete_cron_job(request.id, request.user_email)
scheduler.remove_job(request.id)
return JSONResponse(content={"message": "Cron job deleted successfully."})
@job_router.get("/cron-jobs")
async def get_cron_jobs_request(user: User = Depends(get_current_user)):
cron_jobs = get_cron_jobs(user.email)
return JSONResponse(content=jsonable_encoder(cron_jobs))
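
The new router endpoints give cron jobs a create/list/delete lifecycle over HTTP. A hedged client sketch against a local backend (the UI reaches the same routes through its /api proxy; the token and ids are placeholders):

import requests

BASE = "http://localhost:8000"
headers = {"Authorization": "Bearer <jwt from /auth/token>"}

# Schedule an existing job to run hourly; id and timestamps are filled in server-side.
requests.post(
    f"{BASE}/schedule-cron-job",
    json={"cron_expression": "0 * * * *", "job_id": "<job id>",
          "user_email": "user@example.com"},
    headers=headers,
)

# List the caller's cron jobs.
print(requests.get(f"{BASE}/cron-jobs", headers=headers).json())

# Remove one by id.
requests.post(
    f"{BASE}/delete-cron-job",
    json={"id": "<cron job id>", "user_email": "user@example.com"},
    headers=headers,
)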

api/backend/scheduler.py (new file, 3 lines)
View File

@@ -0,0 +1,3 @@
from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
scheduler = BackgroundScheduler()

View File

@@ -2,10 +2,10 @@ import logging
from typing import Any, Optional
import random
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from lxml import etree
from seleniumwire import webdriver
from lxml.etree import _Element # pyright: ignore [reportPrivateUsage]
from lxml.etree import _Element
from fake_useragent import UserAgent
from selenium.webdriver.chrome.options import Options as ChromeOptions
from urllib.parse import urlparse, urljoin
@@ -13,8 +13,9 @@ from api.backend.models import Element, CapturedElement
from api.backend.job.site_mapping.site_mapping import (
handle_site_mapping,
)
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from api.backend.job.scraping.scraping_utils import scrape_content
from api.backend.job.models.site_map import SiteMap
LOG = logging.getLogger(__name__)
@@ -70,21 +71,27 @@ def create_driver(proxies: Optional[list[str]] = []):
chrome_options.add_argument(f"user-agent={ua.random}")
sw_options = {}
if proxies:
selected_proxy = proxies[random.randint(0, len(proxies) - 1)]
selected_proxy = random.choice(proxies)
LOG.info(f"Using proxy: {selected_proxy}")
sw_options = {
"proxy": {
"https": f"https://{selected_proxy}",
"http": f"http://{selected_proxy}",
"no_proxy": "localhost,127.0.0.1",
}
}
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(
service=service,
options=chrome_options,
seleniumwire_options=sw_options,
)
return driver
@@ -97,6 +104,7 @@ async def make_site_request(
original_url: str = "",
proxies: Optional[list[str]] = [],
site_map: Optional[dict[str, Any]] = None,
collect_media: bool = False,
) -> None:
"""Make basic `GET` request to site using Selenium."""
# Check if URL has already been visited
@@ -117,7 +125,7 @@ async def make_site_request(
visited_urls.add(url)
visited_urls.add(final_url)
page_source = scrape_content(driver, pages)
page_source = scrape_content(driver, pages, collect_media)
if site_map:
LOG.info("Site map: %s", site_map)
@@ -135,7 +143,10 @@ async def make_site_request(
soup = BeautifulSoup(page_source, "html.parser")
for a_tag in soup.find_all("a"):
link = a_tag.get("href")
if not isinstance(a_tag, Tag):
continue
link = str(a_tag.get("href", ""))
if link:
if not urlparse(link).netloc:
@@ -163,7 +174,10 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
el = sxpath(root, elem.xpath)
for e in el:
text = "\t".join(str(t) for t in e.itertext())
if isinstance(e, etree._Element): # type: ignore
text = "\t".join(str(t) for t in e.itertext())
else:
text = str(e)
captured_element = CapturedElement(
xpath=elem.xpath, text=text, name=elem.name
)
@@ -183,7 +197,8 @@ async def scrape(
headers: Optional[dict[str, Any]],
multi_page_scrape: bool = False,
proxies: Optional[list[str]] = [],
site_map: Optional[SiteMap] = None,
site_map: Optional[dict[str, Any]] = None,
collect_media: bool = False,
):
visited_urls: set[str] = set()
pages: set[tuple[str, str]] = set()
@@ -197,6 +212,7 @@ async def scrape(
original_url=url,
proxies=proxies,
site_map=site_map,
collect_media=collect_media,
)
elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()

View File

@@ -1,15 +1,10 @@
import pytest
import logging
from unittest.mock import AsyncMock, patch, MagicMock
from api.backend.tests.factories.job_factory import create_job
from api.backend.models import JobOptions
from api.backend.scraping import create_driver
mocked_job = create_job(
job_options=JobOptions(
multi_page_scrape=False, custom_headers={}, proxies=["127.0.0.1:8080"]
)
).model_dump()
logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger(__name__)
@pytest.mark.asyncio
@@ -26,8 +21,7 @@ async def test_proxy(mock_get: AsyncMock):
driver.get("http://example.com")
response = driver.last_request
# Check if the proxy header is set correctly
if response:
assert response.headers["Proxy"] == "127.0.0.1:8080"
assert response.headers["Proxy-Connection"] == "keep-alive"
driver.quit()

View File

@@ -1,5 +1,8 @@
from typing import Optional
from typing import Any, Optional
import logging
import json
LOG = logging.getLogger(__name__)
def clean_text(text: str):
@@ -17,3 +20,30 @@ def get_log_level(level_name: Optional[str]) -> int:
level = getattr(logging, level_name, logging.INFO)
return level
def format_list_for_query(ids: list[str]):
return (
f"({','.join(['?' for _ in ids])})" # Returns placeholders, e.g., "(?, ?, ?)"
)
def format_sql_row_to_python(row: dict[str, Any]):
new_row: dict[str, Any] = {}
for key, value in row.items():
if isinstance(value, str):
try:
new_row[key] = json.loads(value)
except json.JSONDecodeError:
new_row[key] = value
else:
new_row[key] = value
return new_row
def format_json(items: list[Any]):
for idx, item in enumerate(items):
if isinstance(item, (dict, list)):
formatted_item = json.dumps(item)
items[idx] = formatted_item
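
These three helpers are the glue between Python values and SQLite rows: format_list_for_query sizes a placeholder list to the ids, format_json serializes dict/list parameters in place before execution, and format_sql_row_to_python parses JSON-looking strings back on the way out. A quick illustration (values made up):

ids = ["a1", "b2"]
format_list_for_query(ids)  # -> "(?,?)", one placeholder per id

params = ["abc123", {"collect_media": True}]
format_json(params)         # in place: ["abc123", '{"collect_media": true}']

row = {"id": "abc123", "elements": '[{"name": "body"}]', "status": "Queued"}
format_sql_row_to_python(row)
# -> {"id": "abc123", "elements": [{"name": "body"}], "status": "Queued"}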

View File

@@ -8,6 +8,8 @@ import logging
import sys
import traceback
from api.backend.database.startup import init_database
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
LOG = logging.getLogger(__name__)
@@ -25,6 +27,7 @@ async def process_job():
job["job_options"]["multi_page_scrape"],
job["job_options"]["proxies"],
job["job_options"]["site_map"],
job["job_options"]["collect_media"],
)
LOG.info(
f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
@@ -41,6 +44,9 @@ async def process_job():
async def main():
LOG.info("Starting job worker...")
init_database()
while True:
await process_job()
await asyncio.sleep(5)

View File

@@ -0,0 +1,60 @@
describe("Authentication", () => {
it("should register", () => {
cy.intercept("POST", "/api/signup").as("signup");
cy.visit("/").then(() => {
cy.get("button").contains("Login").click();
cy.url().should("include", "/login");
cy.get("form").should("be.visible");
cy.get("button")
.contains("No Account? Sign up")
.should("be.visible")
.click();
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("input[name='fullName']").type("John Doe");
cy.get("button[type='submit']").contains("Signup").click();
cy.wait("@signup").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("signup request did not return a response");
}
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
expect(interception.response.statusCode).to.eq(200);
});
});
});
it("should login", () => {
cy.intercept("POST", "/api/token").as("token");
cy.visit("/").then(() => {
cy.get("button")
.contains("Login")
.click()
.then(() => {
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("button[type='submit']").contains("Login").click();
cy.wait("@token").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("token request did not return a response");
}
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
expect(interception.response.statusCode).to.eq(200);
});
});
});
});
});

View File

@@ -1,19 +1,34 @@
describe("Job", () => {
describe.only("Job", () => {
it("should create a job", () => {
cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
cy.visit("/");
const input = cy.get('[data-cy="url-input"]');
input.type("https://example.com");
cy.get('[data-cy="url-input"]').type("https://example.com");
cy.get('[data-cy="name-field"]').type("example");
cy.get('[data-cy="xpath-field"]').type("//body");
cy.get('[data-cy="add-button"]').click();
const nameField = cy.get('[data-cy="name-field"]');
const xPathField = cy.get('[data-cy="xpath-field"]');
const addButton = cy.get('[data-cy="add-button"]');
cy.contains("Submit").click();
nameField.type("example");
xPathField.type("//body");
addButton.click();
cy.wait("@submitScrapeJob").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
cy.log("Request body: " + JSON.stringify(interception.request?.body));
throw new Error("submitScrapeJob request did not return a response");
}
const submit = cy.contains("Submit");
submit.click();
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
expect(interception.response.statusCode).to.eq(200);
});
cy.get("li").contains("Previous Jobs").click();
cy.contains("div", "https://example.com", { timeout: 10000 }).should(
"exist"
);
cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
});
});

View File

@@ -34,4 +34,4 @@
// visit(originalFn: CommandOriginalFn, url: string, options: Partial<VisitOptions>): Chainable<Element>
// }
// }
// }
// }

View File

@@ -23,25 +23,17 @@ services:
dockerfile: docker/api/Dockerfile
environment:
- LOG_LEVEL=INFO
- MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB
- SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string)
- SECRET_KEY=MRo9PfasPibnqFeK4Oswb6Z+PhFmjzdvxZzwdAkbf/Y= # used to encode authentication tokens (can be a random string)
- ALGORITHM=HS256 # authentication encoding algorithm
- ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token expire minutes
container_name: scraperr_api
ports:
- 8000:8000
volumes:
- "$PWD/data:/project/data"
- "$PWD/media:/project/media"
- /var/run/docker.sock:/var/run/docker.sock
networks:
- web
mongo:
container_name: webscrape-mongo
image: mongo
restart: always
environment:
MONGO_INITDB_ROOT_USERNAME: root
MONGO_INITDB_ROOT_PASSWORD: example
networks:
- web
networks:
web:

View File

@@ -1,5 +1,5 @@
# Build next dependencies
FROM node:latest
FROM node:23.1
WORKDIR /app
COPY package*.json ./
@@ -15,6 +15,4 @@ COPY src /app/src
RUN npm run build
EXPOSE 3000
# CMD [ "npm", "run" ]
EXPOSE 3000

View File

@@ -1,4 +0,0 @@
tls:
certificates:
- certFile: /etc/certs/ssl-cert.pem
keyFile: /etc/certs/ssl-cert.key

View File

@@ -1,37 +0,0 @@
# STL
import os
# PDM
import boto3
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
def test_insert_and_delete():
# Get environment variables
region_name = os.getenv("AWS_REGION")
# Initialize DynamoDB resource
dynamodb = boto3.resource("dynamodb", region_name=region_name)
table = dynamodb.Table("scrape")
# Item to insert
item = {
"id": "123", # Replace with the appropriate id value
"attribute1": "value1",
"attribute2": "value2",
# Add more attributes as needed
}
# Insert the item
table.put_item(Item=item)
print(f"Inserted item: {item}")
# Delete the item
table.delete_item(Key={"id": "123"}) # Replace with the appropriate id value
print(f"Deleted item with id: {item['id']}")
if __name__ == "__main__":
test_insert_and_delete()

package-lock.json (generated, 15660 lines changed)

File diff suppressed because it is too large

View File

@@ -19,6 +19,7 @@
"bootstrap": "^5.3.0",
"chart.js": "^4.4.3",
"cookie": "^0.6.0",
"dotenv": "^16.5.0",
"framer-motion": "^4.1.17",
"js-cookie": "^3.0.5",
"next": "^14.2.4",
@@ -31,7 +32,6 @@
"react-modal-image": "^2.6.0",
"react-router": "^6.14.1",
"react-router-dom": "^6.14.1",
"react-scripts": "^5.0.1",
"react-spinners": "^0.14.1",
"typescript": "^4.9.5",
"web-vitals": "^2.1.4"
@@ -63,12 +63,18 @@
]
},
"devDependencies": {
"@types/cypress": "^0.1.6",
"@types/cypress": "^1.1.6",
"@types/js-cookie": "^3.0.6",
"cypress": "^13.15.0",
"autoprefixer": "^10.4.21",
"cypress": "^13.17.0",
"eslint": "^9.26.0",
"postcss": "^8.5.3",
"tailwindcss": "^3.3.5"
},
"overrides": {
"react-refresh": "0.11.0"
},
"resolutions": {
"postcss": "^8.4.31"
}
}

pdm.lock (generated, 2247 lines changed)

File diff suppressed because it is too large

View File

@@ -2,9 +2,7 @@
name = "web-scrape"
version = "0.1.0"
description = ""
authors = [
{name = "Jayden Pyles", email = "jpylesbuisness@gmail.com"},
]
authors = [{ name = "Jayden Pyles", email = "jpylesbuisness@gmail.com" }]
dependencies = [
"uvicorn>=0.30.1",
"fastapi>=0.111.0",
@@ -39,20 +37,19 @@ dependencies = [
"exceptiongroup>=1.2.2",
"Faker>=30.6.0",
"pytest-asyncio>=0.24.0",
"python-multipart>=0.0.12",
"python-multipart>=0.0.1",
"bcrypt==4.0.1",
"apscheduler>=3.11.0",
]
requires-python = ">=3.10"
readme = "README.md"
license = {text = "MIT"}
license = { text = "MIT" }
[tool.pdm]
distribution = true
[tool.pdm.dev-dependencies]
dev = [
"ipython>=8.26.0",
"pytest>=8.3.3",
]
dev = ["ipython>=8.26.0", "pytest>=8.3.3"]
[tool.pyright]
include = ["./api/backend/"]
exclude = ["**/node_modules", "**/__pycache__"]
@@ -60,14 +57,42 @@ ignore = []
defineConstant = { DEBUG = true }
stubPath = ""
reportUnknownMemberType= false
reportMissingImports = true
reportMissingTypeStubs = false
reportAny = false
reportCallInDefaultInitializer = false
# Type checking strictness
typeCheckingMode = "strict" # Enables strict type checking mode
reportPrivateUsage = "none"
reportMissingTypeStubs = "none"
reportUntypedFunctionDecorator = "error"
reportUntypedClassDecorator = "error"
reportUntypedBaseClass = "error"
reportInvalidTypeVarUse = "error"
reportUnnecessaryTypeIgnoreComment = "information"
reportUnknownVariableType = "none"
reportUnknownMemberType = "none"
reportUnknownParameterType = "none"
pythonVersion = "3.9"
pythonPlatform = "Linux"
# Additional checks
reportImplicitStringConcatenation = "error"
reportInvalidStringEscapeSequence = "error"
reportMissingImports = "error"
reportMissingModuleSource = "error"
reportOptionalCall = "error"
reportOptionalIterable = "error"
reportOptionalMemberAccess = "error"
reportOptionalOperand = "error"
reportOptionalSubscript = "error"
reportTypedDictNotRequiredAccess = "error"
# Function return type checking
reportIncompleteStub = "error"
reportIncompatibleMethodOverride = "error"
reportInvalidStubStatement = "error"
reportInconsistentOverload = "error"
# Misc settings
pythonVersion = "3.10" # Matches your Python version from pyproject.toml
strictListInference = true
strictDictionaryInference = true
strictSetInference = true
[tool.isort]

View File

@@ -28,10 +28,6 @@ export const JobSelector = ({
const [popoverJob, setPopoverJob] = useState<Job | null>(null);
const theme = useTheme();
useEffect(() => {
fetchJobs(setJobs, { chat: true });
}, []);
const handlePopoverOpen = (
event: React.MouseEvent<HTMLElement>,
job: Job
@@ -124,7 +120,9 @@ export const JobSelector = ({
fontStyle: "italic",
}}
>
{new Date(popoverJob.time_created).toLocaleString()}
{popoverJob.time_created
? new Date(popoverJob.time_created).toLocaleString()
: "Unknown"}
</Typography>
</div>
</Box>

View File

@@ -2,7 +2,7 @@
import React from "react";
import { useAuth } from "../../../contexts/AuthContext";
import { Box, Drawer, Divider } from "@mui/material";
import { Box, Drawer } from "@mui/material";
import { QuickSettings } from "../../nav/quick-settings";
import { NavItems } from "./nav-items/nav-items";

View File

@@ -7,6 +7,7 @@ import TerminalIcon from "@mui/icons-material/Terminal";
import BarChart from "@mui/icons-material/BarChart";
import AutoAwesomeIcon from "@mui/icons-material/AutoAwesome";
import { List } from "@mui/material";
import { Schedule } from "@mui/icons-material";
const items = [
{
@@ -34,6 +35,11 @@ const items = [
text: "View App Logs",
href: "/logs",
},
{
icon: <Schedule />,
text: "Cron Jobs",
href: "/cron-jobs",
},
];
export const NavItems = () => {

View File

@@ -175,7 +175,11 @@ export const JobQueue = ({
onDownload([row.id]);
}}
size="small"
sx={{ minWidth: 0, padding: "4px 8px" }}
sx={{
minWidth: 0,
padding: "4px 8px",
fontSize: "0.625rem",
}}
>
Download
</Button>
@@ -184,7 +188,11 @@ export const JobQueue = ({
onNavigate(row.elements, row.url, row.job_options)
}
size="small"
sx={{ minWidth: 0, padding: "4px 8px" }}
sx={{
minWidth: 0,
padding: "4px 8px",
fontSize: "0.625rem",
}}
>
Rerun
</Button>

View File

@@ -0,0 +1,351 @@
import React, { useEffect, useRef, useState } from "react";
import {
Box,
TextField,
Typography,
Paper,
useTheme,
IconButton,
Tooltip,
} from "@mui/material";
import { JobSelector } from "../../ai";
import { Job, Message } from "../../../types";
import { useSearchParams } from "next/navigation";
import { checkAI, fetchJob, fetchJobs, updateJob } from "../../../lib";
import SendIcon from "@mui/icons-material/Send";
import EditNoteIcon from "@mui/icons-material/EditNote";
export const AI: React.FC = () => {
const theme = useTheme();
const [currentMessage, setCurrentMessage] = useState<string>("");
const [selectedJob, setSelectedJob] = useState<Job | null>(null);
const [messages, setMessages] = useState<Message[]>([]);
const [aiEnabled, setAiEnabled] = useState<boolean>(false);
const [jobs, setJobs] = useState<Job[]>([]);
const [thinking, setThinking] = useState<boolean>(false);
const searchParams = useSearchParams();
const getJobFromParam = async () => {
const jobId = searchParams.get("job");
if (jobId) {
const job = await fetchJob(jobId);
if (job.length) {
setSelectedJob(job[0]);
if (job[0].chat) {
setMessages(job[0].chat);
}
}
}
};
useEffect(() => {
checkAI(setAiEnabled);
getJobFromParam();
}, []);
useEffect(() => {
if (selectedJob?.chat) {
setMessages(selectedJob?.chat);
return;
}
setMessages([]);
}, [selectedJob]);
const handleMessageSend = async (msg: string) => {
if (!selectedJob) {
throw Error("Job is not currently selected, but should be.");
}
const updatedMessages = await sendMessage(msg);
await updateJob([selectedJob?.id], "chat", updatedMessages);
};
const sendMessage = async (msg: string) => {
const newMessage = {
content: msg,
role: "user",
};
setMessages((prevMessages) => [...prevMessages, newMessage]);
setCurrentMessage("");
setThinking(true);
const jobMessage = {
role: "system",
content: `Here is the content return from a scraping job: ${JSON.stringify(
selectedJob?.result
)} for the url: ${
selectedJob?.url
}. The following messages will pertain to the content of the scraped job.`,
};
const response = await fetch("/api/ai", {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({
data: { messages: [jobMessage, ...messages, newMessage] },
}),
});
const updatedMessages = [...messages, newMessage];
const reader = response.body?.getReader();
const decoder = new TextDecoder("utf-8");
let aiResponse = "";
if (reader) {
setThinking(false);
while (true) {
const { done, value } = await reader.read();
if (done) break;
const chunk = decoder.decode(value, { stream: true });
aiResponse += chunk;
setMessages((prevMessages) => {
const lastMessage = prevMessages[prevMessages.length - 1];
if (lastMessage && lastMessage.role === "assistant") {
return [
...prevMessages.slice(0, -1),
{ ...lastMessage, content: aiResponse },
];
} else {
return [
...prevMessages,
{
content: aiResponse,
role: "assistant",
},
];
}
});
}
}
return [...updatedMessages, { role: "assistant", content: aiResponse }];
};
const handleNewChat = (selectedJob: Job) => {
updateJob([selectedJob.id], "chat", []);
setMessages([]);
};
useEffect(() => {
fetchJobs(setJobs);
}, []);
return (
<Box
sx={{
display: "flex",
flexDirection: "column",
height: "95vh",
maxWidth: "100%",
paddingLeft: 0,
paddingRight: 0,
borderRadius: "8px",
border:
theme.palette.mode === "light" ? "solid white" : "solid #4b5057",
boxShadow: "0 4px 8px rgba(0, 0, 0, 0.1)",
overflow: "hidden",
}}
>
{aiEnabled ? (
<>
<Paper
elevation={3}
sx={{
p: 2,
textAlign: "center",
fontSize: "1.2em",
position: "relative",
borderRadius: "8px 8px 0 0",
borderBottom: `2px solid ${theme.palette.divider}`,
}}
>
<Box
sx={{
display: "flex",
justifyContent: "center",
alignItems: "center",
position: "relative",
padding: theme.spacing(1),
}}
>
<Typography
sx={{
flex: 1,
textAlign: "center",
}}
>
Chat with AI
</Typography>
<JobSelector
selectedJob={selectedJob}
setSelectedJob={setSelectedJob}
setJobs={setJobs}
jobs={jobs}
sxProps={{
position: "absolute",
right: theme.spacing(2),
width: "25%",
}}
/>
</Box>
</Paper>
<Box
sx={{
position: "relative",
flex: 1,
p: 2,
overflowY: "auto",
maxHeight: "100%",
}}
>
{!selectedJob ? (
<Box
sx={{
position: "absolute",
top: 0,
left: "50%",
transform: "translateX(-50%)",
padding: 2,
bgcolor: "rgba(128,128,128,0.1)",
mt: 1,
borderRadius: "8px",
}}
className="rounded-md"
>
<Typography variant="body1">
Select a Job to Begin Chatting
</Typography>
</Box>
) : (
<>
{messages &&
messages.map((message, index) => (
<Box
key={index}
sx={{
my: 2,
p: 1,
borderRadius: "8px",
boxShadow: "0 2px 4px rgba(0, 0, 0, 0.1)",
bgcolor:
message.role === "user"
? theme.palette.UserMessage.main
: theme.palette.AIMessage.main,
marginLeft: message.role === "user" ? "auto" : "",
maxWidth: "40%",
}}
>
<Typography variant="body1" sx={{ color: "white" }}>
{message.content}
</Typography>
</Box>
))}
{thinking && (
<Box
sx={{
width: "full",
display: "flex",
flexDirection: "column",
justifyContent: "start",
}}
>
<Typography
sx={{
bgcolor: "rgba(128,128,128,0.1)",
maxWidth: "20%",
my: 2,
p: 1,
borderRadius: "8px",
boxShadow: "0 2px 4px rgba(0, 0, 0, 0.1)",
}}
variant="body1"
>
AI is thinking...
</Typography>
</Box>
)}
</>
)}
</Box>
<Box
sx={{
display: "flex",
p: 2,
borderTop: `1px solid ${theme.palette.divider}`,
}}
>
<Tooltip title="New Chat" placement="top">
<IconButton
disabled={!(messages.length > 0)}
sx={{ marginRight: 2 }}
size="medium"
onClick={() => {
if (!selectedJob) {
throw new Error("Selected job must be present but isn't.");
}
handleNewChat(selectedJob);
}}
>
<EditNoteIcon fontSize="medium" />
</IconButton>
</Tooltip>
<TextField
fullWidth
placeholder="Type your message here..."
disabled={!selectedJob}
value={currentMessage}
onChange={(e) => setCurrentMessage(e.target.value)}
onKeyDown={(e) => {
if (e.key === "Enter") {
handleMessageSend(currentMessage);
}
}}
sx={{ borderRadius: "8px" }}
/>
<Tooltip title="Send" placement="top">
<IconButton
color="primary"
sx={{ ml: 2 }}
disabled={!selectedJob}
onClick={() => {
handleMessageSend(currentMessage);
}}
>
<SendIcon />
</IconButton>
</Tooltip>
</Box>
</>
) : (
<Box
bgcolor="background.default"
minHeight="100vh"
display="flex"
justifyContent="center"
alignItems="center"
>
<h4
style={{
color: "#fff",
padding: "20px",
borderRadius: "8px",
background: "rgba(0, 0, 0, 0.6)",
boxShadow: "0 4px 8px rgba(0, 0, 0, 0.2)",
}}
>
Must set either OPENAI_KEY or OLLAMA_MODEL to use AI features.
</h4>
</Box>
)}
</Box>
);
};
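The sendMessage loop above incrementally decodes the streamed /api/ai response and rewrites the trailing assistant message on each chunk. A minimal standalone sketch of that accumulation pattern, assuming the endpoint streams plain UTF-8 text (streamText and onChunk are hypothetical names, not part of the codebase):

// Sketch only: the same reader/decoder loop as sendMessage, factored out.
// onChunk receives the full text accumulated so far, mirroring how
// setMessages replaces the last assistant message on every chunk.
async function streamText(
  response: Response,
  onChunk: (textSoFar: string) => void
): Promise<string> {
  const reader = response.body?.getReader();
  if (!reader) return "";
  const decoder = new TextDecoder("utf-8");
  let accumulated = "";
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    accumulated += decoder.decode(value, { stream: true });
    onChunk(accumulated);
  }
  return accumulated;
}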

View File

@@ -0,0 +1,182 @@
import { Job } from "@/types";
import {
Button,
Dialog,
DialogTitle,
DialogContent,
TextField,
Snackbar,
Alert,
} from "@mui/material";
import Cookies from "js-cookie";
import { useState } from "react";
export type CreateCronJobsProps = {
availableJobs: Job[];
user: any;
};
export const CreateCronJobs = ({
availableJobs,
user,
}: CreateCronJobsProps) => {
const [open, setOpen] = useState(false);
return (
<>
<Button
variant="contained"
color="primary"
onClick={() => setOpen(true)}
sx={{ borderRadius: 2 }}
>
Create Cron Job
</Button>
<CreateCronJobDialog
open={open}
onClose={() => setOpen(false)}
availableJobs={availableJobs}
user={user}
/>
</>
);
};
const CreateCronJobDialog = ({
open,
onClose,
availableJobs,
user,
}: {
open: boolean;
onClose: () => void;
availableJobs: Job[];
user: any;
}) => {
const [cronExpression, setCronExpression] = useState("");
const [jobId, setJobId] = useState("");
const [successOpen, setSuccessOpen] = useState(false);
const [isSubmitting, setIsSubmitting] = useState(false);
const [error, setError] = useState("");
const handleSubmit = async () => {
if (!cronExpression || !jobId) {
setError("Please fill in all fields");
return;
}
setIsSubmitting(true);
const token = Cookies.get("token");
try {
const response = await fetch("/api/schedule-cron-job", {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${token}`,
},
body: JSON.stringify({
data: {
cron_expression: cronExpression,
job_id: jobId,
user_email: user.email,
},
}),
});
if (!response.ok) {
throw new Error("Failed to schedule job");
}
setSuccessOpen(true);
setCronExpression("");
setJobId("");
setTimeout(() => {
onClose();
window.location.reload();
}, 1500);
} catch (error) {
console.error(error);
setError("Failed to create cron job");
} finally {
setIsSubmitting(false);
}
};
const handleClose = () => {
setSuccessOpen(false);
};
return (
<>
<Dialog
open={open}
onClose={onClose}
PaperProps={{
sx: { borderRadius: 2, minWidth: "400px" },
}}
>
<DialogTitle sx={{ fontWeight: 500 }}>Create Cron Job</DialogTitle>
<DialogContent>
<div className="flex flex-col gap-1 mt0">
<TextField
label="Cron Expression"
fullWidth
value={cronExpression}
onChange={(e) => setCronExpression(e.target.value)}
variant="outlined"
placeholder="* * * * *"
margin="normal"
helperText="Format: minute hour day month day-of-week"
/>
<TextField
label="Job ID"
fullWidth
value={jobId}
onChange={(e) => setJobId(e.target.value)}
variant="outlined"
margin="normal"
/>
{error && (
<Alert severity="error" sx={{ mt: 2 }}>
{error}
</Alert>
)}
<div className="flex justify-end gap-2 mt-4">
<Button
variant="outlined"
onClick={onClose}
sx={{ borderRadius: 2 }}
>
Cancel
</Button>
<Button
variant="contained"
color="primary"
onClick={handleSubmit}
disabled={isSubmitting}
sx={{ borderRadius: 2 }}
>
{isSubmitting ? "Submitting..." : "Create Job"}
</Button>
</div>
</div>
</DialogContent>
</Dialog>
<Snackbar
open={successOpen}
autoHideDuration={4000}
onClose={handleClose}
anchorOrigin={{ vertical: "bottom", horizontal: "right" }}
>
<Alert onClose={handleClose} severity="success" sx={{ width: "100%" }}>
Cron job created successfully!
</Alert>
</Snackbar>
</>
);
};
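For reference, handleSubmit above posts a body of the following shape to /api/schedule-cron-job; this literal is illustrative only (all values hypothetical), with field names taken from the code:

// Illustrative payload; the { data: ... } wrapper is what the
// /api/schedule-cron-job proxy route unwraps via req.body.
const examplePayload = {
  data: {
    cron_expression: "0 * * * *", // top of every hour
    job_id: "3fa85f64-job-id",    // hypothetical job ID
    user_email: "user@example.com",
  },
};

Note that availableJobs is passed into the dialog but not referenced in it yet; the Job ID is entered as free text.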

View File

@@ -0,0 +1 @@
export * from "./create-cron-jobs";

View File

@@ -0,0 +1,104 @@
import { Job, CronJob } from "@/types/job";
import { useState, useEffect } from "react";
import { CreateCronJobs } from "./create-cron-jobs";
import {
Table,
TableHead,
TableRow,
TableCell,
TableBody,
Button,
Box,
Typography,
} from "@mui/material";
import Cookies from "js-cookie";
export type CronJobsProps = {
initialJobs: Job[];
initialCronJobs: CronJob[];
initialUser: any;
};
export const CronJobs = ({
initialJobs,
initialCronJobs,
initialUser,
}: CronJobsProps) => {
const [jobs, setJobs] = useState<Job[]>(initialJobs);
const [cronJobs, setCronJobs] = useState<CronJob[]>(initialCronJobs);
const [user, setUser] = useState<any>(initialUser);
useEffect(() => {
setJobs(initialJobs);
setCronJobs(initialCronJobs);
setUser(initialUser);
}, [initialJobs, initialCronJobs, initialUser]);
const handleDeleteCronJob = async (id: string) => {
const token = Cookies.get("token");
const response = await fetch("/api/delete-cron-job", {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${token}`,
},
body: JSON.stringify({ data: { id, user_email: user.email } }),
});
if (response.ok) {
console.log("Cron job deleted successfully");
setCronJobs(cronJobs.filter((cronJob) => cronJob.id !== id));
} else {
console.error("Failed to delete cron job");
}
};
if (!user) {
return (
<Box>
<Typography variant="h6">
Please log in to view your cron jobs
</Typography>
</Box>
);
}
return (
<div>
<CreateCronJobs availableJobs={jobs} user={user} />
<Table>
<TableHead>
<TableRow>
<TableCell>Cron Expression</TableCell>
<TableCell>Job ID</TableCell>
<TableCell>User Email</TableCell>
<TableCell>Created At</TableCell>
<TableCell>Updated At</TableCell>
<TableCell>Actions</TableCell>
</TableRow>
</TableHead>
<TableBody>
{cronJobs.map((cronJob) => (
<TableRow key={cronJob.id}>
<TableCell>{cronJob.cron_expression}</TableCell>
<TableCell>{cronJob.job_id}</TableCell>
<TableCell>{cronJob.user_email}</TableCell>
<TableCell>
{new Date(cronJob.time_created).toLocaleString()}
</TableCell>
<TableCell>
{new Date(cronJob.time_updated).toLocaleString()}
</TableCell>
<TableCell>
<Button onClick={() => handleDeleteCronJob(cronJob.id)}>
Delete
</Button>
</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</div>
);
};

View File

@@ -0,0 +1,62 @@
import axios from "axios";
import { GetServerSideProps } from "next";
import { parseCookies } from "nookies";
import { CronJob, Job } from "../../../types";
export const getServerSideProps: GetServerSideProps = async (context) => {
const { req } = context;
const cookies = parseCookies({ req });
const token = cookies.token;
let user = null;
let initialJobs: Job[] = [];
let initialCronJobs: CronJob[] = [];
if (token) {
try {
const userResponse = await axios.get(
`${process.env.NEXT_PUBLIC_API_URL}/api/auth/users/me`,
{
headers: { Authorization: `Bearer ${token}` },
}
);
user = userResponse.data;
const jobsResponse = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/api/retrieve-scrape-jobs`,
{
method: "POST",
body: JSON.stringify({ user: user.email }),
headers: {
"content-type": "application/json",
Authorization: `Bearer ${token}`,
},
}
);
initialJobs = await jobsResponse.json();
console.log(initialJobs);
const cronJobsResponse = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/api/cron-jobs`,
{
headers: {
"content-type": "application/json",
Authorization: `Bearer ${token}`,
},
}
);
initialCronJobs = await cronJobsResponse.json();
} catch (error) {
console.error("Error fetching user or jobs:", error);
}
}
return {
props: {
initialJobs,
initialUser: user,
initialCronJobs,
},
};
};

View File

@@ -0,0 +1 @@
export { CronJobs } from "./cron-jobs";

View File

@@ -1,7 +1,6 @@
import { RawJobOptions } from "@/types/job";
import { Box, FormControlLabel, Checkbox, TextField } from "@mui/material";
import { Dispatch, SetStateAction } from "react";
import { useJobSubmitterProvider } from "../provider";
export type JobSubmitterOptionsProps = {
jobOptions: RawJobOptions;
@@ -43,6 +42,13 @@ export const JobSubmitterOptions = ({
}));
};
const handleCollectMediaChange = () => {
setJobOptions((prevJobOptions) => ({
...prevJobOptions,
collect_media: !prevJobOptions.collect_media,
}));
};
return (
<Box bgcolor="background.paper" className="flex flex-col mb-2 rounded-md">
<div id="options" className="p-2 flex flex-row space-x-2">
@@ -95,6 +101,15 @@ export const JobSubmitterOptions = ({
/>
}
></FormControlLabel>
<FormControlLabel
label="Collect Media"
control={
<Checkbox
checked={jobOptions.collect_media}
onChange={handleCollectMediaChange}
/>
}
/>
</div>
{customJSONSelected ? (
<div id="custom-json" className="pl-2 pr-2 pb-2">

View File

@@ -15,6 +15,7 @@ const initialJobOptions: RawJobOptions = {
multi_page_scrape: false,
custom_headers: null,
proxies: null,
collect_media: false,
};
export const JobSubmitter = () => {

View File

@@ -7,9 +7,10 @@
border-radius: 0.375rem;
transition: transform 0.2s ease-in-out;
transform: scale(1);
&:hover {
transform: scale(1.05);
}
}
.button:hover {
transform: scale(1.05);
}
.remove {

View File

@@ -1,6 +1,5 @@
import React, { createContext, useContext, useState, useEffect } from "react";
import axios from "axios";
import { Constants } from "../lib";
import Cookies from "js-cookie";
interface AuthContextProps {
@@ -55,6 +54,7 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
const userResponse = await axios.get(`/api/me`, {
headers: { Authorization: `Bearer ${response.data.access_token}` },
});
setUser(userResponse.data);
setIsAuthenticated(true);
};

View File

@@ -11,11 +11,11 @@ export const parseJobOptions = (
) => {
if (job_options) {
const jsonOptions = JSON.parse(job_options as string);
console.log(jsonOptions);
const newJobOptions: RawJobOptions = {
multi_page_scrape: false,
custom_headers: null,
proxies: null,
collect_media: false,
};
if (

View File

@@ -48,14 +48,14 @@ export const checkAI = async (
) => {
const token = Cookies.get("token");
try {
const response = await fetch("/api/ai/check", {
const response = await fetch("/api/check", {
headers: {
"content-type": "application/json",
Authorization: `Bearer ${token}`,
},
});
const data = await response.json();
setAiEnabled(data);
setAiEnabled(data.ai_enabled);
} catch (error) {
console.error("Error fetching jobs:", error);
throw error;

View File

@@ -17,12 +17,21 @@ export default async function handler(
}
);
const checksResponse = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/auth/check`,
{
method: "GET",
headers,
}
);
if (!response.ok) {
throw new Error(`Error: ${response.statusText}`);
}
const result = await response.json();
res.status(200).json(result);
if (!checksResponse.ok) {
throw new Error(`Error: ${checksResponse.statusText}`);
}
const checksResult = await checksResponse.json();
res.status(200).json({ ...result, ...checksResult });
} catch (error) {
console.error("Error submitting scrape job:", error);
res.status(500).json({ error: "Internal Server Error" });
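Based on the fields its consumers read (setAiEnabled(data.ai_enabled) in checkAI, response.data.registration in AuthForm), the merged /api/check response appears to take roughly this shape; a hedged sketch derived from usage, not from the backend definition:

// Assumed shape of the merged /api/check response; inferred from
// its consumers, not from the backend schema.
type CheckResponse = {
  ai_enabled: boolean;   // read by checkAI()
  registration: boolean; // read by AuthForm's checkRegistrationEnabled()
};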

View File

@@ -0,0 +1,39 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
if (req.method === "POST") {
const { data } = req.body;
console.log("Data", data);
const headers = new Headers();
headers.set("content-type", "application/json");
try {
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/delete-cron-job`,
{
method: "POST",
headers,
body: JSON.stringify(data),
}
);
if (!response.ok) {
console.error(response);
throw new Error(`Error: ${response.statusText}`);
}
const result = await response.json();
res.status(200).json(result);
} catch (error) {
console.error("Error deleting cron job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
} else {
res.setHeader("Allow", ["POST"]);
res.status(405).end(`Method ${req.method} Not Allowed`);
}
}

View File

@@ -0,0 +1,39 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
if (req.method === "POST") {
const { data } = req.body;
console.log("Data", data);
const headers = new Headers();
headers.set("content-type", "application/json");
try {
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/schedule-cron-job`,
{
method: "POST",
headers,
body: JSON.stringify(data),
}
);
if (!response.ok) {
console.error(response);
throw new Error(`Error: ${response.statusText}`);
}
const result = await response.json();
res.status(200).json(result);
} catch (error) {
console.error("Error scheduling cron job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
} else {
res.setHeader("Allow", ["POST"]);
res.status(405).end(`Method ${req.method} Not Allowed`);
}
}

View File

@@ -1,348 +1 @@
import React, { useEffect, useRef, useState } from "react";
import {
Box,
TextField,
Typography,
Paper,
useTheme,
IconButton,
Tooltip,
} from "@mui/material";
import { JobSelector } from "../components/ai";
import { Job, Message } from "../types";
import { useSearchParams } from "next/navigation";
import { checkAI, fetchJob, fetchJobs, updateJob } from "../lib";
import SendIcon from "@mui/icons-material/Send";
import EditNoteIcon from "@mui/icons-material/EditNote";
const AI: React.FC = () => {
const theme = useTheme();
const [currentMessage, setCurrentMessage] = useState<string>("");
const [selectedJob, setSelectedJob] = useState<Job | null>(null);
const [messages, setMessages] = useState<Message[]>([]);
const [aiEnabled, setAiEnabled] = useState<boolean>(false);
const [jobs, setJobs] = useState<Job[]>([]);
const [thinking, setThinking] = useState<boolean>(false);
const searchParams = useSearchParams();
const getJobFromParam = async () => {
const jobId = searchParams.get("job");
if (jobId) {
const job = await fetchJob(jobId);
if (job.length) {
setSelectedJob(job[0]);
if (job[0].chat) {
setMessages(job[0].chat);
}
}
}
};
useEffect(() => {
checkAI(setAiEnabled);
getJobFromParam();
}, []);
useEffect(() => {
if (selectedJob?.chat) {
setMessages(selectedJob?.chat);
return;
}
setMessages([]);
}, [selectedJob]);
const handleMessageSend = async (msg: string) => {
if (!selectedJob) {
throw Error("Job is not currently selected, but should be.");
}
const updatedMessages = await sendMessage(msg);
await updateJob([selectedJob?.id], "chat", updatedMessages);
};
const sendMessage = async (msg: string) => {
const newMessage = {
content: msg,
role: "user",
};
setMessages((prevMessages) => [...prevMessages, newMessage]);
setCurrentMessage("");
setThinking(true);
const jobMessage = {
role: "system",
content: `Here is the content return from a scraping job: ${JSON.stringify(
selectedJob?.result
)} for the url: ${
selectedJob?.url
}. The following messages will pertain to the content of the scraped job.`,
};
const response = await fetch("/api/ai", {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({
data: { messages: [jobMessage, ...messages, newMessage] },
}),
});
const updatedMessages = [...messages, newMessage];
const reader = response.body?.getReader();
const decoder = new TextDecoder("utf-8");
let aiResponse = "";
if (reader) {
setThinking(false);
while (true) {
const { done, value } = await reader.read();
if (done) break;
const chunk = decoder.decode(value, { stream: true });
aiResponse += chunk;
setMessages((prevMessages) => {
const lastMessage = prevMessages[prevMessages.length - 1];
if (lastMessage && lastMessage.role === "assistant") {
return [
...prevMessages.slice(0, -1),
{ ...lastMessage, content: aiResponse },
];
} else {
return [
...prevMessages,
{
content: aiResponse,
role: "assistant",
},
];
}
});
}
}
return [...updatedMessages, { role: "assistant", content: aiResponse }];
};
const handleNewChat = (selectedJob: Job) => {
updateJob([selectedJob.id], "chat", []);
setMessages([]);
fetchJobs(setJobs, { chat: true });
};
return (
<Box
sx={{
display: "flex",
flexDirection: "column",
height: "95vh",
maxWidth: "100%",
paddingLeft: 0,
paddingRight: 0,
borderRadius: "8px",
border:
theme.palette.mode === "light" ? "solid white" : "solid #4b5057",
boxShadow: "0 4px 8px rgba(0, 0, 0, 0.1)",
overflow: "hidden",
}}
>
{aiEnabled ? (
<>
<Paper
elevation={3}
sx={{
p: 2,
textAlign: "center",
fontSize: "1.2em",
position: "relative",
borderRadius: "8px 8px 0 0",
borderBottom: `2px solid ${theme.palette.divider}`,
}}
>
<Box
sx={{
display: "flex",
justifyContent: "center",
alignItems: "center",
position: "relative",
padding: theme.spacing(1),
}}
>
<Typography
sx={{
flex: 1,
textAlign: "center",
}}
>
Chat with AI
</Typography>
<JobSelector
selectedJob={selectedJob}
setSelectedJob={setSelectedJob}
setJobs={setJobs}
jobs={jobs}
sxProps={{
position: "absolute",
right: theme.spacing(2),
width: "25%",
}}
/>
</Box>
</Paper>
<Box
sx={{
position: "relative",
flex: 1,
p: 2,
overflowY: "auto",
maxHeight: "100%",
}}
>
{!selectedJob ? (
<Box
sx={{
position: "absolute",
top: 0,
left: "50%",
transform: "translateX(-50%)",
padding: 2,
bgcolor: "rgba(128,128,128,0.1)",
mt: 1,
borderRadius: "8px",
}}
className="rounded-md"
>
<Typography variant="body1">
Select a Job to Begin Chatting
</Typography>
</Box>
) : (
<>
{messages &&
messages.map((message, index) => (
<Box
key={index}
sx={{
my: 2,
p: 1,
borderRadius: "8px",
boxShadow: "0 2px 4px rgba(0, 0, 0, 0.1)",
bgcolor:
message.role === "user"
? theme.palette.UserMessage.main
: theme.palette.AIMessage.main,
marginLeft: message.role === "user" ? "auto" : "",
maxWidth: "40%",
}}
>
<Typography variant="body1" sx={{ color: "white" }}>
{message.content}
</Typography>
</Box>
))}
{thinking && (
<Box
sx={{
width: "full",
display: "flex",
flexDirection: "column",
justifyContent: "start",
}}
>
<Typography
sx={{
bgcolor: "rgba(128,128,128,0.1)",
maxWidth: "20%",
my: 2,
p: 1,
borderRadius: "8px",
boxShadow: "0 2px 4px rgba(0, 0, 0, 0.1)",
}}
variant="body1"
>
AI is thinking...
</Typography>
</Box>
)}
</>
)}
</Box>
<Box
sx={{
display: "flex",
p: 2,
borderTop: `1px solid ${theme.palette.divider}`,
}}
>
<Tooltip title="New Chat" placement="top">
<IconButton
disabled={!(messages.length > 0)}
sx={{ marginRight: 2 }}
size="medium"
onClick={() => {
if (!selectedJob) {
throw new Error("Selected job must be present but isn't.");
}
handleNewChat(selectedJob);
}}
>
<EditNoteIcon fontSize="medium" />
</IconButton>
</Tooltip>
<TextField
fullWidth
placeholder="Type your message here..."
disabled={!selectedJob}
value={currentMessage}
onChange={(e) => setCurrentMessage(e.target.value)}
onKeyDown={(e) => {
if (e.key === "Enter") {
handleMessageSend(currentMessage);
}
}}
sx={{ borderRadius: "8px" }}
/>
<Tooltip title="Send" placement="top">
<IconButton
color="primary"
sx={{ ml: 2 }}
disabled={!selectedJob}
onClick={() => {
handleMessageSend(currentMessage);
}}
>
<SendIcon />
</IconButton>
</Tooltip>
</Box>
</>
) : (
<Box
bgcolor="background.default"
minHeight="100vh"
display="flex"
justifyContent="center"
alignItems="center"
>
<h4
style={{
color: "#fff",
padding: "20px",
borderRadius: "8px",
background: "rgba(0, 0, 0, 0.6)",
boxShadow: "0 4px 8px rgba(0, 0, 0, 0.2)",
}}
>
Must set either OPENAI_KEY or OLLAMA_MODEL to use AI features.
</h4>
</Box>
)}
</Box>
);
};
export default AI;
export { AI as default } from "../components/pages/chat/chat";

src/pages/cron-jobs.tsx
View File

@@ -0,0 +1,4 @@
import { CronJobs } from "../components/pages/cron-jobs";
import { getServerSideProps } from "../components/pages/cron-jobs/get-server-side-props";
export { getServerSideProps };
export default CronJobs;

View File

@@ -1,6 +1,6 @@
"use client";
import React, { useState } from "react";
import React, { useEffect, useState } from "react";
import axios from "axios";
import { Button, TextField, Typography, Box } from "@mui/material";
import { useTheme } from "@mui/material/styles";
@@ -18,7 +18,16 @@ const AuthForm: React.FC = () => {
const theme = useTheme();
const router = useRouter();
const { login } = useAuth();
const [registrationEnabled, setRegistrationEnabled] = useState<boolean>(true);
const checkRegistrationEnabled = async () => {
const response = await axios.get(`/api/check`);
setRegistrationEnabled(response.data.registration);
};
useEffect(() => {
checkRegistrationEnabled();
}, []);
const handleSubmit = async (event: React.FormEvent) => {
event.preventDefault();
try {
@@ -124,9 +133,37 @@ const AuthForm: React.FC = () => {
>
{mode.charAt(0).toUpperCase() + mode.slice(1)}
</Button>
<Button onClick={toggleMode} fullWidth variant="text" color="primary">
{mode === "login" ? "No Account? Sign up" : "Login"}
</Button>
{registrationEnabled && (
<Button
onClick={toggleMode}
fullWidth
variant="text"
color="primary"
>
{mode === "login" ? "No Account? Sign up" : "Login"}
</Button>
)}
{!registrationEnabled && (
<div
style={{
marginTop: 10,
width: "100%",
textAlign: "center",
border: "1px solid #ccc",
backgroundColor: "#f8f8f8",
padding: 8,
borderRadius: 4,
display: "flex",
justifyContent: "center",
alignItems: "center",
}}
>
<Typography variant="body2" color="text">
Registration has been disabled
</Typography>
</div>
)}
</Box>
</Box>
</Box>

View File

@@ -19,6 +19,7 @@ export const submitJob = async (
time_created: new Date().toISOString(),
job_options: {
...jobOptions,
collect_media: jobOptions.collect_media || false,
custom_headers: customHeaders || {},
proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
site_map: siteMap,
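After this normalization, the submitted job_options would look roughly like the following for a comma-separated proxies string (all values hypothetical):

// Hypothetical example of job_options after normalization.
const exampleJobOptions = {
  multi_page_scrape: false,
  collect_media: false,
  custom_headers: {},
  proxies: ["http://p1:8080", "http://p2:8080"], // split from "http://p1:8080,http://p2:8080"
  site_map: { actions: [] },
};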

View File

@@ -23,6 +23,7 @@ export type RawJobOptions = {
multi_page_scrape: boolean;
custom_headers: string | null;
proxies: string | null;
collect_media: boolean;
};
export type ActionOption = "click" | "input";
@@ -38,3 +39,12 @@ export type Action = {
export type SiteMap = {
actions: Action[];
};
export type CronJob = {
id: string;
user_email: string;
job_id: string;
cron_expression: string;
time_created: Date;
time_updated: Date;
};
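A sample value matching the new CronJob type (illustrative data only). Note that time_created and time_updated are typed as Date but arrive as ISO strings over JSON, which is why the CronJobs table wraps them in new Date(...) before calling toLocaleString():

// Illustrative only; mirrors the CronJob type above.
const exampleCronJob: CronJob = {
  id: "1",
  user_email: "user@example.com",
  job_id: "abc123",
  cron_expression: "*/5 * * * *", // every five minutes
  time_created: new Date("2025-04-24T22:00:00Z"),
  time_updated: new Date("2025-04-24T22:00:00Z"),
};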

View File

@@ -1,30 +0,0 @@
# LOCAL
from freeaskinternet.models.Models import (
ModelCard,
ModelList,
SearchItem,
SearchResp,
ChatMessage,
DeltaMessage,
QueryRequest,
SearchItemList,
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseChoice,
ChatCompletionResponseStreamChoice,
)
__all__ = [
"ModelCard",
"ModelList",
"ChatMessage",
"DeltaMessage",
"QueryRequest",
"ChatCompletionRequest",
"ChatCompletionResponseChoice",
"ChatCompletionResponseStreamChoice",
"ChatCompletionResponse",
"SearchItem",
"SearchItemList",
"SearchResp",
]

View File

@@ -109,5 +109,5 @@
"isolatedModules": true
},
"include": ["src", "src/declaration.d.ts", "src/next-auth.d.ts"],
"exclude": ["node_modules"]
"exclude": ["node_modules", "src-tauri"]
}