29 Commits

Author SHA1 Message Date
Jayden Pyles d2bc9c53ff fix: cypress test 2025-05-31 23:11:18 -05:00
Jayden Pyles 734974df83 fix: cypress test 2025-05-31 23:01:39 -05:00
Jayden Pyles b6dbd0dc82 fix: cypress test 2025-05-31 22:17:57 -05:00
Jayden Pyles d57dd0af1a fix: cypress test 2025-05-31 21:55:47 -05:00
Jayden Pyles 23fccd7afb fix: cypress test 2025-05-31 21:39:38 -05:00
Jayden Pyles f89a460206 fix: cypress test 2025-05-31 21:13:52 -05:00
Jayden Pyles 1169d48992 fix: cypress test 2025-05-31 21:05:26 -05:00
Jayden Pyles ff809d7833 fix: cypress test 2025-05-31 19:29:48 -05:00
Jayden Pyles 41c7f6795c fix: build 2025-05-31 18:01:13 -05:00
Jayden Pyles 21c2155786 chore: refactor wip 2025-05-31 17:22:29 -05:00
Jayden Pyles 6d45bd129c chore: refactor wip 2025-05-31 14:26:50 -05:00
Jayden Pyles 3ab31bd186 chore: refactor wip 2025-05-31 14:21:51 -05:00
Jayden Pyles e00c187e68 chore: work in progress 2025-05-30 22:18:26 -05:00
Jayden Pyles 39c0d17e1e chore: refactor wip 2025-05-28 20:42:06 -05:00
Jayden Pyles b66963ed33 chore: work in progress 2025-05-25 19:00:16 -05:00
Jayden Pyles 813550e07c chore: work in progress 2025-05-25 18:52:17 -05:00
Jayden Pyles dcb4afe01b chore: work in progress 2025-05-25 18:50:58 -05:00
Jayden Pyles 344d9036c3 chore: work in progress 2025-05-25 18:39:27 -05:00
Jayden Pyles 96552cd1d1 chore: work in progress 2025-05-25 18:17:20 -05:00
Jayden Pyles d4ac85d206 chore: work in progress 2025-05-25 18:01:49 -05:00
Jayden Pyles a805b98ce3 chore: work in progress 2025-05-25 17:54:01 -05:00
Jayden Pyles eaf047ecd8 chore: work in progress 2025-05-25 17:36:42 -05:00
Jayden Pyles d43040fe08 chore: work in progress 2025-05-25 17:16:42 -05:00
Jayden Pyles f0813323f0 chore: refactor wip 2025-05-23 18:12:37 -05:00
Jayden Pyles fb7986bccf chore: refactor wip 2025-05-22 18:36:17 -05:00
Jayden Pyles a1664856a6 chore: refactor wip 2025-05-21 22:13:40 -05:00
Jayden Pyles 467244b7f8 chore: refactor wip 2025-05-20 19:49:38 -05:00
Jayden Pyles b91a133b4d chore: refactor wip 2025-05-20 19:34:30 -05:00
Jayden Pyles aeed81a6df chore: refactor wip 2025-05-20 19:05:48 -05:00
134 changed files with 2467 additions and 1702 deletions

View File

@@ -2,6 +2,13 @@ name: Run Cypress Tests
description: Run Cypress tests
inputs:
openai_key:
description: "OpenAI API key"
required: true
default: ""
runs:
using: "composite"
steps:
@@ -15,6 +22,8 @@ runs:
- name: Setup Docker project
shell: bash
env:
OPENAI_KEY: ${{ inputs.openai_key }}
run: make build-ci up-ci
- name: Install dependencies

View File

@@ -37,6 +37,8 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: ./.github/actions/run-cypress-tests
with:
openai_key: ${{ secrets.OPENAI_KEY }}
success-message:
runs-on: ubuntu-latest

View File

@@ -60,4 +60,7 @@ up-ci:
docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d --force-recreate
cypress-start:
DISPLAY=:0 npx cypress open
DISPLAY=:0 npx cypress open
cypress-run:
npx cypress run

View File

@@ -1,29 +1,26 @@
# STL
import random
from typing import Any
# PDM
from camoufox import AsyncCamoufox
from playwright.async_api import Page
# LOCAL
from api.backend.ai.clients import ask_ollama, ask_open_ai, open_ai_key
from api.backend.job.models import CapturedElement
from api.backend.worker.logger import LOG
from api.backend.ai.agent.utils import (
parse_response,
capture_elements,
convert_to_markdown,
parse_response,
)
from api.backend.ai.clients import ask_open_ai, ask_ollama, open_ai_key
from api.backend.ai.agent.prompts import (
ELEMENT_EXTRACTION_PROMPT,
EXTRACT_ELEMENTS_PROMPT,
ELEMENT_EXTRACTION_PROMPT,
)
from api.backend.job.scraping.collect_media import collect_media
from api.backend.worker.logger import LOG
from api.backend.job.scraping.add_custom import add_custom_items
from api.backend.models import CapturedElement
from api.backend.job.scraping.collect_media import collect_media
ask_ai = ask_open_ai if open_ai_key else ask_ollama

View File

@@ -1,10 +1,13 @@
from lxml import html, etree
# STL
import re
# PDM
from lxml import html, etree
from playwright.async_api import Page
from api.backend.models import CapturedElement
from api.backend.job.scraping.scraping_utils import clean_format_characters
# LOCAL
from api.backend.job.models import CapturedElement
from api.backend.job.utils.text_utils import clean_text
def convert_to_markdown(html_str: str):
@@ -231,7 +234,7 @@ async def capture_elements(
if link:
element_text += f" ({link})"
cleaned = clean_format_characters(element_text)
cleaned = clean_text(element_text)
if cleaned in seen_texts:
continue

View File

@@ -3,24 +3,23 @@ import logging
from collections.abc import Iterable, AsyncGenerator
# PDM
from ollama import Message
from fastapi import APIRouter
from fastapi.responses import JSONResponse, StreamingResponse
from openai.types.chat import ChatCompletionMessageParam
# LOCAL
from ollama import Message
from api.backend.models import AI
from api.backend.ai.clients import (
llama_client,
llama_model,
openai_client,
open_ai_model,
open_ai_key,
llama_client,
open_ai_model,
openai_client,
)
from api.backend.ai.schemas import AI
from api.backend.routers.handle_exceptions import handle_exceptions
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("AI")
ai_router = APIRouter()
@@ -64,6 +63,7 @@ chat_function = llama_chat if llama_client else openai_chat
@ai_router.post("/ai")
@handle_exceptions(logger=LOG)
async def ai(c: AI):
return StreamingResponse(
chat_function(chat_messages=c.messages), media_type="text/plain"
@@ -71,5 +71,6 @@ async def ai(c: AI):
@ai_router.get("/ai/check")
@handle_exceptions(logger=LOG)
async def check():
return JSONResponse(content={"ai_enabled": bool(open_ai_key or llama_model)})

View File

@@ -1,8 +1,9 @@
# STL
import os
from openai import OpenAI
# PDM
from ollama import AsyncClient
from openai import OpenAI
# Load environment variables
open_ai_key = os.getenv("OPENAI_KEY")

View File

@@ -0,0 +1,4 @@
# LOCAL
from .ai import AI
__all__ = ["AI"]

View File

@@ -0,0 +1,9 @@
# STL
from typing import Any
# PDM
import pydantic
class AI(pydantic.BaseModel):
messages: list[Any]

View File

@@ -1,34 +1,30 @@
# STL
import os
import logging
import apscheduler # type: ignore
from contextlib import asynccontextmanager
# PDM
import apscheduler.schedulers
import apscheduler.schedulers.background
from fastapi import FastAPI, Request, status
from fastapi.responses import JSONResponse
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
# LOCAL
from api.backend.ai.ai_router import ai_router
from api.backend.auth.auth_router import auth_router
from api.backend.utils import get_log_level
from api.backend.routers.job_router import job_router
from api.backend.routers.stats_router import stats_router
from api.backend.database.startup import init_database
from fastapi.responses import JSONResponse
from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler
from api.backend.scheduler import scheduler
from api.backend.ai.ai_router import ai_router
from api.backend.job.job_router import job_router
from api.backend.auth.auth_router import auth_router
from api.backend.database.startup import init_database
from api.backend.stats.stats_router import stats_router
from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler
log_level = os.getenv("LOG_LEVEL")
LOG_LEVEL = get_log_level(log_level)
logging.basicConfig(
level=LOG_LEVEL,
format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
format="%(levelname)s: %(asctime)s - [%(name)s] - %(message)s",
handlers=[logging.StreamHandler()],
)
@@ -36,7 +32,7 @@ LOG = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI):
async def lifespan(_: FastAPI):
# Startup
LOG.info("Starting application...")
@@ -45,6 +41,7 @@ async def lifespan(app: FastAPI):
LOG.info("Starting cron scheduler...")
start_cron_scheduler(scheduler)
scheduler.start()
LOG.info("Cron scheduler started successfully")
yield

View File

@@ -1,13 +1,14 @@
# STL
from datetime import timedelta
import os
import logging
from datetime import timedelta
# PDM
from fastapi import Depends, APIRouter, HTTPException, status
from fastapi.security import OAuth2PasswordRequestForm
# LOCAL
from api.backend.schemas import User, Token, UserCreate
from api.backend.auth.schemas import User, Token, UserCreate
from api.backend.auth.auth_utils import (
ACCESS_TOKEN_EXPIRE_MINUTES,
get_current_user,
@@ -15,18 +16,19 @@ from api.backend.auth.auth_utils import (
get_password_hash,
create_access_token,
)
import logging
from api.backend.database.common import update
from api.backend.routers.handle_exceptions import handle_exceptions
auth_router = APIRouter()
LOG = logging.getLogger("auth_router")
LOG = logging.getLogger("Auth")
@auth_router.post("/auth/token", response_model=Token)
@handle_exceptions(logger=LOG)
async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
user = await authenticate_user(form_data.username, form_data.password)
if not user:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
@@ -47,6 +49,7 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(
@auth_router.post("/auth/signup", response_model=User)
@handle_exceptions(logger=LOG)
async def create_user(user: UserCreate):
hashed_password = get_password_hash(user.password)
user_dict = user.model_dump()
@@ -60,11 +63,13 @@ async def create_user(user: UserCreate):
@auth_router.get("/auth/users/me", response_model=User)
@handle_exceptions(logger=LOG)
async def read_users_me(current_user: User = Depends(get_current_user)):
return current_user
@auth_router.get("/auth/check")
@handle_exceptions(logger=LOG)
async def check_auth():
return {
"registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True",

View File

@@ -1,8 +1,8 @@
# STL
import os
import logging
from typing import Any, Optional
from datetime import datetime, timedelta
import logging
# PDM
from jose import JWTError, jwt
@@ -12,11 +12,10 @@ from passlib.context import CryptContext
from fastapi.security import OAuth2PasswordBearer
# LOCAL
from api.backend.schemas import User, UserInDB, TokenData
from api.backend.auth.schemas import User, UserInDB, TokenData
from api.backend.database.common import query
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Auth")
_ = load_dotenv()
@@ -118,7 +117,8 @@ async def get_current_user(token: str = Depends(oauth2_scheme)):
LOG.error(f"Exception occurred: {e}")
return EMPTY_USER
user = await get_user(email=token_data.email)
user = await get_user(email=token_data.email or "")
if user is None:
return EMPTY_USER
@@ -136,6 +136,7 @@ async def require_user(token: str = Depends(oauth2_scheme)):
payload: Optional[dict[str, Any]] = jwt.decode(
token, SECRET_KEY, algorithms=[ALGORITHM]
)
if not payload:
raise credentials_exception
@@ -149,7 +150,7 @@ async def require_user(token: str = Depends(oauth2_scheme)):
except JWTError:
raise credentials_exception
user = await get_user(email=token_data.email)
user = await get_user(email=token_data.email or "")
if user is None:
raise credentials_exception

View File

@@ -0,0 +1,4 @@
# LOCAL
from .auth import User, Token, UserInDB, TokenData, UserCreate
__all__ = ["User", "Token", "UserInDB", "TokenData", "UserCreate"]

View File

@@ -1,5 +1,6 @@
from pathlib import Path
# STL
import os
from pathlib import Path
DATABASE_PATH = "data/database.db"
RECORDINGS_DIR = Path("media/recordings")
@@ -14,3 +15,10 @@ MEDIA_TYPES = [
"spreadsheets",
"videos",
]
REGISTRATION_ENABLED = os.getenv("REGISTRATION_ENABLED", "true").lower() == "true"
DEFAULT_USER_EMAIL = os.getenv("DEFAULT_USER_EMAIL")
DEFAULT_USER_PASSWORD = os.getenv("DEFAULT_USER_PASSWORD")
DEFAULT_USER_FULL_NAME = os.getenv("DEFAULT_USER_FULL_NAME")
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

View File

@@ -1,3 +1,5 @@
from .common import insert, QUERIES, update
# LOCAL
from .common import insert, update, connect
from .schema import INIT_QUERY
__all__ = ["insert", "QUERIES", "update"]
__all__ = ["insert", "update", "INIT_QUERY", "connect"]

View File

@@ -1,12 +1,13 @@
# STL
import logging
import sqlite3
from typing import Any, Optional
from api.backend.constants import DATABASE_PATH
from api.backend.utils import format_json, format_sql_row_to_python
from api.backend.database.schema import INIT_QUERY
from api.backend.database.queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
import logging
LOG = logging.getLogger(__name__)
# LOCAL
from api.backend.constants import DATABASE_PATH
from api.backend.database.utils import format_json, format_sql_row_to_python
LOG = logging.getLogger("Database")
def connect():
@@ -25,8 +26,10 @@ def insert(query: str, values: tuple[Any, ...]):
try:
_ = cursor.execute(query, copy)
connection.commit()
except sqlite3.Error as e:
LOG.error(f"An error occurred: {e}")
finally:
cursor.close()
connection.close()
@@ -78,15 +81,9 @@ def update(query: str, values: Optional[tuple[Any, ...]] = None):
return res.rowcount
except sqlite3.Error as e:
LOG.error(f"An error occurred: {e}")
finally:
cursor.close()
connection.close()
return 0
QUERIES = {
"init": INIT_QUERY,
"insert_job": JOB_INSERT_QUERY,
"delete_job": DELETE_JOB_QUERY,
}

View File

@@ -1,3 +1,4 @@
from .queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
# LOCAL
from .job.job_queries import DELETE_JOB_QUERY, JOB_INSERT_QUERY
__all__ = ["JOB_INSERT_QUERY", "DELETE_JOB_QUERY"]

View File

@@ -0,0 +1,68 @@
# STL
import logging
from typing import Any
# LOCAL
from api.backend.database.utils import format_list_for_query
from api.backend.database.common import query, insert, update
JOB_INSERT_QUERY = """
INSERT INTO jobs
(id, url, elements, user, time_created, result, status, chat, job_options, agent_mode, prompt)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
DELETE_JOB_QUERY = """
DELETE FROM jobs WHERE id IN ()
"""
LOG = logging.getLogger("Database")
def insert_job(item: dict[str, Any]) -> None:
insert(
JOB_INSERT_QUERY,
(
item["id"],
item["url"],
item["elements"],
item["user"],
item["time_created"],
item["result"],
item["status"],
item["chat"],
item["job_options"],
item["agent_mode"],
item["prompt"],
),
)
LOG.info(f"Inserted item: {item}")
async def get_queued_job():
queued_job_query = (
"SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
)
res = query(queued_job_query)
LOG.info(f"Got queued job: {res}")
return res[0] if res else None
async def update_job(ids: list[str], field: str, value: Any):
query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
res = update(query, tuple([value] + ids))
LOG.info(f"Updated job: {res}")
async def delete_jobs(jobs: list[str]):
if not jobs:
LOG.info("No jobs to delete.")
return False
query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}"
res = update(query, tuple(jobs))
LOG.info(f"Deleted jobs: {res}")
return res
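A minimal usage sketch (ids and value below are hypothetical, not part of the diff): update_job expands its id list into SQL placeholders via format_list_for_query, so the executed statement stays fully parameterized.

    await update_job(["a1", "b2"], "status", "Completed")
    # format_list_for_query(["a1", "b2"]) -> "(?,?)"
    # executed SQL:  UPDATE jobs SET status = ? WHERE id IN (?,?)
    # bound values:  ("Completed", "a1", "b2")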

View File

@@ -1,9 +0,0 @@
JOB_INSERT_QUERY = """
INSERT INTO jobs
(id, url, elements, user, time_created, result, status, chat, job_options, agent_mode, prompt)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
DELETE_JOB_QUERY = """
DELETE FROM jobs WHERE id IN ()
"""

View File

@@ -0,0 +1,41 @@
# LOCAL
from api.backend.database.common import query
async def average_elements_per_link(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
AVG(json_array_length(elements)) AS average_elements,
COUNT(*) AS count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = query(job_query, (user,))
return results
async def get_jobs_per_day(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
COUNT(*) AS job_count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = query(job_query, (user,))
return results
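For illustration, both helpers return one row per day, shaped by the SELECT aliases above (data below is hypothetical):

    rows = await get_jobs_per_day("user@example.com")
    # e.g. [{"date": "2025-05-30", "job_count": 2},
    #       {"date": "2025-05-31", "job_count": 4}]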

View File

@@ -30,4 +30,5 @@ CREATE TABLE IF NOT EXISTS cron_jobs (
ALTER TABLE jobs ADD COLUMN agent_mode BOOLEAN NOT NULL DEFAULT FALSE;
ALTER TABLE jobs ADD COLUMN prompt STRING;
ALTER TABLE jobs ADD COLUMN favorite BOOLEAN NOT NULL DEFAULT FALSE;
"""

View File

@@ -1,24 +1,34 @@
import os
from api.backend.database.common import connect, QUERIES, insert
# STL
import logging
import sqlite3
# LOCAL
from api.backend.constants import (
DEFAULT_USER_EMAIL,
REGISTRATION_ENABLED,
DEFAULT_USER_PASSWORD,
DEFAULT_USER_FULL_NAME,
)
from api.backend.auth.auth_utils import get_password_hash
from api.backend.database.common import insert, connect
from api.backend.database.schema import INIT_QUERY
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Database")
def init_database():
def execute_startup_query():
cursor = connect()
for query in QUERIES["init"].strip().split(";"):
for query in INIT_QUERY.strip().split(";"):
query = query.strip()
if not query:
continue
try:
LOG.info(f"Executing query: {query}")
_ = cursor.execute(query)
except sqlite3.OperationalError as e:
if "duplicate column name" in str(e).lower():
LOG.warning(f"Skipping duplicate column error: {e}")
@@ -27,10 +37,16 @@ def init_database():
LOG.error(f"Error executing query: {query}")
raise
if os.environ.get("REGISTRATION_ENABLED", "true").lower() == "false":
default_user_email = os.environ.get("DEFAULT_USER_EMAIL")
default_user_password = os.environ.get("DEFAULT_USER_PASSWORD")
default_user_full_name = os.environ.get("DEFAULT_USER_FULL_NAME")
cursor.close()
def init_database():
execute_startup_query()
if not REGISTRATION_ENABLED:
default_user_email = DEFAULT_USER_EMAIL
default_user_password = DEFAULT_USER_PASSWORD
default_user_full_name = DEFAULT_USER_FULL_NAME
if (
not default_user_email
@@ -51,5 +67,3 @@ def init_database():
default_user_full_name,
),
)
cursor.close()

View File

@@ -0,0 +1,30 @@
# STL
import json
from typing import Any
def format_list_for_query(ids: list[str]):
return (
f"({','.join(['?' for _ in ids])})" # Returns placeholders, e.g., "(?, ?, ?)"
)
def format_sql_row_to_python(row: dict[str, Any]):
new_row: dict[str, Any] = {}
for key, value in row.items():
if isinstance(value, str):
try:
new_row[key] = json.loads(value)
except json.JSONDecodeError:
new_row[key] = value
else:
new_row[key] = value
return new_row
def format_json(items: list[Any]):
for idx, item in enumerate(items):
if isinstance(item, (dict, list)):
formatted_item = json.dumps(item)
items[idx] = formatted_item
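A hedged sketch of the two formatters in use (inputs hypothetical):

    format_list_for_query(["a", "b", "c"])
    # -> "(?,?,?)"
    format_sql_row_to_python({"elements": '[{"name": "title"}]', "status": "Queued"})
    # -> {"elements": [{"name": "title"}], "status": "Queued"}
    # (valid JSON strings are parsed; anything that fails json.loads passes through unchanged)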

View File

@@ -1,17 +1,9 @@
from .job import (
insert,
update_job,
delete_jobs,
get_jobs_per_day,
get_queued_job,
average_elements_per_link,
)
# LOCAL
from .job import insert, update_job, delete_jobs, get_queued_job
__all__ = [
"insert",
"update_job",
"delete_jobs",
"get_jobs_per_day",
"get_queued_job",
"average_elements_per_link",
]

View File

@@ -1,15 +1,19 @@
# STL
import uuid
import logging
import datetime
from typing import Any
import uuid
from api.backend.database.common import insert, query
from api.backend.models import CronJob
from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
from apscheduler.triggers.cron import CronTrigger # type: ignore
# PDM
from apscheduler.triggers.cron import CronTrigger
from apscheduler.schedulers.background import BackgroundScheduler
# LOCAL
from api.backend.job import insert as insert_job
import logging
from api.backend.schemas.cron import CronJob
from api.backend.database.common import query, insert
LOG = logging.getLogger("Cron Scheduler")
LOG = logging.getLogger("Cron")
def insert_cron_job(cron_job: CronJob):
@@ -17,6 +21,7 @@ def insert_cron_job(cron_job: CronJob):
INSERT INTO cron_jobs (id, user_email, job_id, cron_expression, time_created, time_updated)
VALUES (?, ?, ?, ?, ?, ?)
"""
values = (
cron_job.id,
cron_job.user_email,
@@ -36,6 +41,7 @@ def delete_cron_job(id: str, user_email: str):
DELETE FROM cron_jobs
WHERE id = ? AND user_email = ?
"""
values = (id, user_email)
insert(query, values)

View File

@@ -3,20 +3,18 @@ import logging
from typing import Any
# LOCAL
from api.backend.utils import format_list_for_query
from api.backend.database.common import (
insert as common_insert,
query as common_query,
QUERIES,
update as common_update,
)
from api.backend.database.utils import format_list_for_query
from api.backend.database.common import query as common_query
from api.backend.database.common import insert as common_insert
from api.backend.database.common import update as common_update
from api.backend.database.queries.job.job_queries import JOB_INSERT_QUERY
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Job")
def insert(item: dict[str, Any]) -> None:
common_insert(
QUERIES["insert_job"],
JOB_INSERT_QUERY,
(
item["id"],
item["url"],
@@ -31,7 +29,8 @@ def insert(item: dict[str, Any]) -> None:
item["prompt"],
),
)
LOG.info(f"Inserted item: {item}")
LOG.debug(f"Inserted item: {item}")
async def get_queued_job():
@@ -39,61 +38,22 @@ async def get_queued_job():
"SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
)
res = common_query(query)
LOG.info(f"Got queued job: {res}")
LOG.debug(f"Got queued job: {res}")
return res[0] if res else None
async def update_job(ids: list[str], field: str, value: Any):
query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
res = common_update(query, tuple([value] + ids))
LOG.info(f"Updated job: {res}")
LOG.debug(f"Updated job: {res}")
async def delete_jobs(jobs: list[str]):
if not jobs:
LOG.info("No jobs to delete.")
LOG.debug("No jobs to delete.")
return False
query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}"
res = common_update(query, tuple(jobs))
return res > 0
async def average_elements_per_link(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
AVG(json_array_length(elements)) AS average_elements,
COUNT(*) AS count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = common_query(job_query, (user,))
return results
async def get_jobs_per_day(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
COUNT(*) AS job_count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = common_query(job_query, (user,))
return results

View File

@@ -0,0 +1,248 @@
# STL
import csv
import uuid
import random
import logging
import datetime
from io import StringIO
# PDM
from fastapi import Depends, APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from apscheduler.triggers.cron import CronTrigger # type: ignore
# LOCAL
from api.backend.job import insert, update_job, delete_jobs
from api.backend.constants import MEDIA_DIR, MEDIA_TYPES, RECORDINGS_DIR
from api.backend.scheduler import scheduler
from api.backend.schemas.job import Job, UpdateJobs, DownloadJob, DeleteScrapeJobs
from api.backend.auth.schemas import User
from api.backend.schemas.cron import CronJob, DeleteCronJob
from api.backend.database.utils import format_list_for_query
from api.backend.auth.auth_utils import get_current_user
from api.backend.database.common import query
from api.backend.job.utils.text_utils import clean_text
from api.backend.job.models.job_options import FetchOptions
from api.backend.routers.handle_exceptions import handle_exceptions
from api.backend.job.utils.clean_job_format import clean_job_format
from api.backend.job.cron_scheduling.cron_scheduling import (
get_cron_jobs,
delete_cron_job,
insert_cron_job,
get_cron_job_trigger,
insert_job_from_cron_job,
)
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results
LOG = logging.getLogger("Job")
job_router = APIRouter()
@job_router.post("/update")
@handle_exceptions(logger=LOG)
async def update(update_jobs: UpdateJobs, _: User = Depends(get_current_user)):
"""Used to update jobs"""
await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)
return JSONResponse(content={"message": "Jobs updated successfully."})
@job_router.post("/submit-scrape-job")
@handle_exceptions(logger=LOG)
async def submit_scrape_job(job: Job):
LOG.info(f"Recieved job: {job}")
job.id = uuid.uuid4().hex
job_dict = job.model_dump()
insert(job_dict)
return JSONResponse(
content={"id": job.id, "message": "Job submitted successfully."}
)
@job_router.post("/retrieve-scrape-jobs")
@handle_exceptions(logger=LOG)
async def retrieve_scrape_jobs(
fetch_options: FetchOptions, user: User = Depends(get_current_user)
):
LOG.info(f"Retrieving jobs for account: {user.email}")
ATTRIBUTES = "chat" if fetch_options.chat else "*"
job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?"
results = query(job_query, (user.email,))
return JSONResponse(content=jsonable_encoder(results[::-1]))
@job_router.get("/job/{id}")
@handle_exceptions(logger=LOG)
async def job(id: str, user: User = Depends(get_current_user)):
LOG.info(f"Retrieving jobs for account: {user.email}")
job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?"
results = query(job_query, (user.email, id))
return JSONResponse(content=jsonable_encoder(results))
@job_router.post("/download")
@handle_exceptions(logger=LOG)
async def download(download_job: DownloadJob):
LOG.info(f"Downloading job with ids: {download_job.ids}")
job_query = (
f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}"
)
results = query(job_query, tuple(download_job.ids))
if download_job.job_format == "csv":
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
headers = [
"id",
"url",
"element_name",
"xpath",
"text",
"user",
"time_created",
]
csv_writer.writerow(headers)
for result in results:
for res in result["result"]:
for url, elements in res.items():
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", "")).strip()
if text:
csv_writer.writerow(
[
result.get("id", "")
+ "-"
+ str(random.randint(0, 1000000)),
url,
element_name,
value.get("xpath", ""),
text,
result.get("user", ""),
result.get("time_created", ""),
]
)
_ = csv_buffer.seek(0)
response = StreamingResponse(
csv_buffer,
media_type="text/csv",
)
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response
elif download_job.job_format == "md":
response = StreamingResponse(
stream_md_from_job_results(results),
media_type="text/markdown",
)
response.headers["Content-Disposition"] = "attachment; filename=export.md"
return response
@job_router.get("/job/{id}/convert-to-csv")
@handle_exceptions(logger=LOG)
async def convert_to_csv(id: str):
job_query = f"SELECT * FROM jobs WHERE id = ?"
results = query(job_query, (id,))
return JSONResponse(content=clean_job_format(results))
@job_router.post("/delete-scrape-jobs")
@handle_exceptions(logger=LOG)
async def delete(delete_scrape_jobs: DeleteScrapeJobs):
result = await delete_jobs(delete_scrape_jobs.ids)
return (
JSONResponse(content={"message": "Jobs successfully deleted."})
if result
else JSONResponse(content={"error": "Jobs not deleted."})
)
@job_router.post("/schedule-cron-job")
@handle_exceptions(logger=LOG)
async def schedule_cron_job(cron_job: CronJob):
if not cron_job.id:
cron_job.id = uuid.uuid4().hex
if not cron_job.time_created:
cron_job.time_created = datetime.datetime.now()
if not cron_job.time_updated:
cron_job.time_updated = datetime.datetime.now()
insert_cron_job(cron_job)
queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,))
scheduler.add_job(
insert_job_from_cron_job,
get_cron_job_trigger(cron_job.cron_expression),
id=cron_job.id,
args=[queried_job[0]],
)
return JSONResponse(content={"message": "Cron job scheduled successfully."})
@job_router.post("/delete-cron-job")
@handle_exceptions(logger=LOG)
async def delete_cron_job_request(request: DeleteCronJob):
if not request.id:
return JSONResponse(
content={"error": "Cron job id is required."}, status_code=400
)
delete_cron_job(request.id, request.user_email)
scheduler.remove_job(request.id)
return JSONResponse(content={"message": "Cron job deleted successfully."})
@job_router.get("/cron-jobs")
@handle_exceptions(logger=LOG)
async def get_cron_jobs_request(user: User = Depends(get_current_user)):
cron_jobs = get_cron_jobs(user.email)
return JSONResponse(content=jsonable_encoder(cron_jobs))
@job_router.get("/recordings/{id}")
@handle_exceptions(logger=LOG)
async def get_recording(id: str):
path = RECORDINGS_DIR / f"{id}.mp4"
if not path.exists():
return JSONResponse(content={"error": "Recording not found."}, status_code=404)
return FileResponse(
path, headers={"Content-Type": "video/mp4", "Accept-Ranges": "bytes"}
)
@job_router.get("/get-media")
@handle_exceptions(logger=LOG)
async def get_media(id: str):
files: dict[str, list[str]] = {}
for media_type in MEDIA_TYPES:
path = MEDIA_DIR / media_type / f"{id}"
files[media_type] = [file.name for file in path.glob("*")]
return JSONResponse(content={"files": files})
@job_router.get("/media")
@handle_exceptions(logger=LOG)
async def get_media_file(id: str, type: str, file: str):
path = MEDIA_DIR / type / f"{id}" / file
if not path.exists():
return JSONResponse(content={"error": "Media file not found."}, status_code=404)
return FileResponse(path)

View File

@@ -1,3 +1,5 @@
from .job_options import JobOptions
# LOCAL
from .job import Element, CapturedElement
from .job_options import Proxy, JobOptions
__all__ = ["JobOptions"]
__all__ = ["JobOptions", "CapturedElement", "Element", "Proxy"]

View File

@@ -0,0 +1,15 @@
from typing import Optional
import pydantic
class Element(pydantic.BaseModel):
name: str
xpath: str
url: Optional[str] = None
class CapturedElement(pydantic.BaseModel):
xpath: str
text: str
name: str

View File

@@ -1,8 +1,19 @@
from pydantic import BaseModel
# STL
from typing import Any, Optional
# PDM
from pydantic import BaseModel
# LOCAL
from api.backend.job.models.site_map import SiteMap
class Proxy(BaseModel):
server: str
username: Optional[str] = None
password: Optional[str] = None
class FetchOptions(BaseModel):
chat: Optional[bool] = None
@@ -10,7 +21,7 @@ class FetchOptions(BaseModel):
class JobOptions(BaseModel):
multi_page_scrape: bool = False
custom_headers: dict[str, Any] = {}
proxies: list[str] = []
proxies: list[Proxy] = []
site_map: Optional[SiteMap] = None
collect_media: bool = False
custom_cookies: list[dict[str, Any]] = []

View File

@@ -1,11 +1,12 @@
# STL
import logging
from typing import Any, Optional
from urllib.parse import urlparse
# PDM
from playwright.async_api import Page, BrowserContext
import logging
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Job")
async def add_custom_cookies(
@@ -18,8 +19,8 @@ async def add_custom_cookies(
for cookie in custom_cookies:
cookie_dict = {
"name": cookie.get("name", "default_name"),
"value": cookie.get("value", "default_value"),
"name": cookie.get("name", ""),
"value": cookie.get("value", ""),
"domain": domain,
"path": "/",
}

View File

@@ -1,13 +1,16 @@
# STL
import os
from pathlib import Path
import re
from urllib.parse import urljoin, urlparse
import logging
from typing import Dict, List
from pathlib import Path
from urllib.parse import urljoin, urlparse
# PDM
import aiohttp
from playwright.async_api import Page
from api.backend.utils import LOG
LOG = logging.getLogger("Job")
async def collect_media(id: str, page: Page) -> dict[str, list[dict[str, str]]]:

View File

@@ -1,59 +1,45 @@
import logging
# STL
import random
from typing import Any, Optional, cast
import logging
from typing import Any, cast
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup, Tag
# PDM
from bs4 import Tag, BeautifulSoup
from lxml import etree
from camoufox import AsyncCamoufox
from playwright.async_api import Page
from urllib.parse import urlparse, urljoin
from api.backend.models import Element, CapturedElement
# LOCAL
from api.backend.constants import RECORDINGS_ENABLED
from api.backend.job.models import Element, CapturedElement
from api.backend.job.utils.text_utils import clean_text
from api.backend.job.scraping.add_custom import add_custom_items
from api.backend.job.scraping.scraping_utils import (
clean_format_characters,
sxpath,
is_same_domain,
scrape_content,
)
from api.backend.job.site_mapping.site_mapping import handle_site_mapping
from api.backend.job.scraping.add_custom import add_custom_items
from api.backend.constants import RECORDINGS_ENABLED
LOG = logging.getLogger(__name__)
def is_same_domain(url: str, original_url: str) -> bool:
parsed_url = urlparse(url)
parsed_original_url = urlparse(original_url)
return parsed_url.netloc == parsed_original_url.netloc or parsed_url.netloc == ""
def clean_xpath(xpath: str) -> str:
parts = xpath.split("/")
clean_parts = ["/" if part == "" else part for part in parts]
clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'")
LOG.info(f"Cleaned xpath: {clean_xpath}")
return clean_xpath
def sxpath(context: etree._Element, xpath: str):
return context.xpath(xpath)
LOG = logging.getLogger("Job")
async def make_site_request(
id: str,
url: str,
headers: Optional[dict[str, Any]],
multi_page_scrape: bool = False,
job_options: dict[str, Any],
visited_urls: set[str] = set(),
pages: set[tuple[str, str]] = set(),
original_url: str = "",
proxies: Optional[list[str]] = None,
site_map: Optional[dict[str, Any]] = None,
collect_media: bool = False,
custom_cookies: Optional[list[dict[str, Any]]] = None,
):
headers = job_options["custom_headers"]
multi_page_scrape = job_options["multi_page_scrape"]
proxies = job_options["proxies"]
site_map = job_options["site_map"]
collect_media = job_options["collect_media"]
custom_cookies = job_options["custom_cookies"]
if url in visited_urls:
return
@@ -117,15 +103,10 @@ async def make_site_request(
await make_site_request(
id,
link,
headers=headers,
multi_page_scrape=multi_page_scrape,
job_options=job_options,
visited_urls=visited_urls,
pages=pages,
original_url=original_url,
proxies=proxies,
site_map=site_map,
collect_media=collect_media,
custom_cookies=custom_cookies,
)
@@ -145,7 +126,7 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
else str(e) # type: ignore
)
text = clean_format_characters(text)
text = clean_text(text)
captured_element = CapturedElement(
xpath=elem.xpath, text=text, name=elem.name
@@ -163,12 +144,7 @@ async def scrape(
id: str,
url: str,
xpaths: list[Element],
headers: Optional[dict[str, Any]] = None,
multi_page_scrape: bool = False,
proxies: Optional[list[str]] = None,
site_map: Optional[dict[str, Any]] = None,
collect_media: bool = False,
custom_cookies: Optional[list[dict[str, Any]]] = None,
job_options: dict[str, Any],
):
visited_urls: set[str] = set()
pages: set[tuple[str, str]] = set()
@@ -176,15 +152,10 @@ async def scrape(
await make_site_request(
id,
url,
headers=headers,
multi_page_scrape=multi_page_scrape,
job_options=job_options,
visited_urls=visited_urls,
pages=pages,
original_url=url,
proxies=proxies,
site_map=site_map,
collect_media=collect_media,
custom_cookies=custom_cookies,
)
elements: list[dict[str, dict[str, list[CapturedElement]]]] = []

View File

@@ -1,11 +1,18 @@
# STL
import asyncio
import logging
from typing import Set, Tuple
from urllib.parse import urlparse
# PDM
from lxml import etree
from playwright.async_api import Page
from api.backend.utils import LOG
# LOCAL
from api.backend.job.scraping.collect_media import collect_media as collect_media_utils
LOG = logging.getLogger("Job")
async def scrape_content(
id: str, page: Page, pages: Set[Tuple[str, str]], collect_media: bool
@@ -32,14 +39,20 @@ async def scrape_content(
return html
def clean_format_characters(text: str) -> str:
text = text.strip()
text = text.replace("\n", " ")
text = text.replace("\t", " ")
text = text.replace("\r", " ")
text = text.replace("\f", " ")
text = text.replace("\v", " ")
text = text.replace("\b", " ")
text = text.replace("\a", " ")
def is_same_domain(url: str, original_url: str) -> bool:
parsed_url = urlparse(url)
parsed_original_url = urlparse(original_url)
return parsed_url.netloc == parsed_original_url.netloc or parsed_url.netloc == ""
return text
def clean_xpath(xpath: str) -> str:
parts = xpath.split("/")
clean_parts = ["/" if part == "" else part for part in parts]
clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'")
LOG.info(f"Cleaned xpath: {clean_xpath}")
return clean_xpath
def sxpath(context: etree._Element, xpath: str):
return context.xpath(xpath)

View File

@@ -1,14 +1,17 @@
import logging
# STL
import asyncio
import logging
from copy import deepcopy
from typing import Any
# PDM
from playwright.async_api import Page
# LOCAL
from api.backend.job.models.site_map import Action, SiteMap
from api.backend.job.scraping.scraping_utils import scrape_content
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Job")
def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]:

View File

@@ -1,6 +1,8 @@
# STL
from typing import Any
from api.backend.utils import clean_text
# LOCAL
from api.backend.job.utils.text_utils import clean_text
def clean_job_format(jobs: list[dict[str, Any]]) -> dict[str, Any]:

View File

@@ -1,6 +1,8 @@
# STL
from typing import Any
from api.backend.utils import clean_text
# LOCAL
from api.backend.job.utils.text_utils import clean_text
def stream_md_from_job_results(jobs: list[dict[str, Any]]):

View File

@@ -0,0 +1,10 @@
def clean_text(text: str):
text = text.strip()
text = text.replace("\n", " ")
text = text.replace("\t", " ")
text = text.replace("\r", " ")
text = text.replace("\f", " ")
text = text.replace("\v", " ")
text = text.replace("\b", " ")
text = text.replace("\a", " ")
return text
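A quick illustration of the normalization (input hypothetical): leading/trailing whitespace is stripped first, then each listed control character becomes a single space.

    clean_text("Title\n\tSubtitle\r")
    # -> "Title  Subtitle"  (one space per replaced character)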

View File

@@ -0,0 +1,31 @@
# STL
import logging
import traceback
from typing import Any, Union, Callable, Awaitable
from functools import wraps
# PDM
from fastapi.responses import JSONResponse
def handle_exceptions(
logger: logging.Logger,
) -> Callable[
[Callable[..., Awaitable[Any]]], Callable[..., Awaitable[Union[Any, JSONResponse]]]
]:
def decorator(
func: Callable[..., Awaitable[Any]],
) -> Callable[..., Awaitable[Union[Any, JSONResponse]]]:
@wraps(func)
async def wrapper(*args: Any, **kwargs: Any) -> Union[Any, JSONResponse]:
try:
return await func(*args, **kwargs)
except Exception as e:
logger.error(f"Exception occurred: {e}")
traceback.print_exc()
return JSONResponse(content={"error": str(e)}, status_code=500)
return wrapper
return decorator
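As a hedged illustration of what the decorator buys each route (the endpoint below is hypothetical, for illustration only):

    @job_router.get("/boom")  # hypothetical endpoint
    @handle_exceptions(logger=LOG)
    async def boom():
        raise RuntimeError("boom")
    # the wrapper logs the traceback and returns
    # JSONResponse({"error": "boom"}, status_code=500) instead of an unhandled exception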

View File

@@ -1,273 +0,0 @@
# STL
import datetime
import uuid
import traceback
from io import StringIO
import csv
import logging
import random
# PDM
from fastapi import Depends, APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from api.backend.scheduler import scheduler
from apscheduler.triggers.cron import CronTrigger # type: ignore
# LOCAL
from api.backend.job import insert, update_job, delete_jobs
from api.backend.models import (
DeleteCronJob,
UpdateJobs,
DownloadJob,
DeleteScrapeJobs,
Job,
CronJob,
)
from api.backend.schemas import User
from api.backend.auth.auth_utils import get_current_user
from api.backend.utils import clean_text, format_list_for_query
from api.backend.job.models.job_options import FetchOptions
from api.backend.database.common import query
from api.backend.job.cron_scheduling.cron_scheduling import (
delete_cron_job,
get_cron_job_trigger,
insert_cron_job,
get_cron_jobs,
insert_job_from_cron_job,
)
from api.backend.job.utils.clean_job_format import clean_job_format
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results
from api.backend.constants import MEDIA_DIR, MEDIA_TYPES, RECORDINGS_DIR
LOG = logging.getLogger(__name__)
job_router = APIRouter()
@job_router.post("/update")
async def update(update_jobs: UpdateJobs, _: User = Depends(get_current_user)):
"""Used to update jobs"""
await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)
@job_router.post("/submit-scrape-job")
async def submit_scrape_job(job: Job):
LOG.info(f"Recieved job: {job}")
try:
job.id = uuid.uuid4().hex
job_dict = job.model_dump()
insert(job_dict)
return JSONResponse(content={"id": job.id})
except Exception as e:
LOG.error(f"Exception occurred: {traceback.format_exc()}")
return JSONResponse(content={"error": str(e)}, status_code=500)
@job_router.post("/retrieve-scrape-jobs")
async def retrieve_scrape_jobs(
fetch_options: FetchOptions, user: User = Depends(get_current_user)
):
LOG.info(f"Retrieving jobs for account: {user.email}")
ATTRIBUTES = "chat" if fetch_options.chat else "*"
try:
job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?"
results = query(job_query, (user.email,))
return JSONResponse(content=jsonable_encoder(results[::-1]))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
return JSONResponse(content=[], status_code=500)
@job_router.get("/job/{id}")
async def job(id: str, user: User = Depends(get_current_user)):
LOG.info(f"Retrieving jobs for account: {user.email}")
try:
job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?"
results = query(job_query, (user.email, id))
return JSONResponse(content=jsonable_encoder(results))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
return JSONResponse(content={"error": str(e)}, status_code=500)
@job_router.post("/download")
async def download(download_job: DownloadJob):
LOG.info(f"Downloading job with ids: {download_job.ids}")
try:
job_query = (
f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}"
)
results = query(job_query, tuple(download_job.ids))
if download_job.job_format == "csv":
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
headers = [
"id",
"url",
"element_name",
"xpath",
"text",
"user",
"time_created",
]
csv_writer.writerow(headers)
for result in results:
for res in result["result"]:
for url, elements in res.items():
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", "")).strip()
if text:
csv_writer.writerow(
[
result.get("id", "")
+ "-"
+ str(random.randint(0, 1000000)),
url,
element_name,
value.get("xpath", ""),
text,
result.get("user", ""),
result.get("time_created", ""),
]
)
_ = csv_buffer.seek(0)
response = StreamingResponse(
csv_buffer,
media_type="text/csv",
)
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response
elif download_job.job_format == "md":
response = StreamingResponse(
stream_md_from_job_results(results),
media_type="text/markdown",
)
response.headers["Content-Disposition"] = "attachment; filename=export.md"
return response
except Exception as e:
LOG.error(f"Exception occurred: {e}")
traceback.print_exc()
return {"error": str(e)}
@job_router.get("/job/{id}/convert-to-csv")
async def convert_to_csv(id: str):
try:
job_query = f"SELECT * FROM jobs WHERE id = ?"
results = query(job_query, (id,))
return JSONResponse(content=clean_job_format(results))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
traceback.print_exc()
return {"error": str(e)}
@job_router.post("/delete-scrape-jobs")
async def delete(delete_scrape_jobs: DeleteScrapeJobs):
result = await delete_jobs(delete_scrape_jobs.ids)
return (
JSONResponse(content={"message": "Jobs successfully deleted."})
if result
else JSONResponse({"error": "Jobs not deleted."})
)
@job_router.post("/schedule-cron-job")
async def schedule_cron_job(cron_job: CronJob):
if not cron_job.id:
cron_job.id = uuid.uuid4().hex
if not cron_job.time_created:
cron_job.time_created = datetime.datetime.now()
if not cron_job.time_updated:
cron_job.time_updated = datetime.datetime.now()
insert_cron_job(cron_job)
queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,))
scheduler.add_job(
insert_job_from_cron_job,
get_cron_job_trigger(cron_job.cron_expression),
id=cron_job.id,
args=[queried_job[0]],
)
return JSONResponse(content={"message": "Cron job scheduled successfully."})
@job_router.post("/delete-cron-job")
async def delete_cron_job_request(request: DeleteCronJob):
if not request.id:
return JSONResponse(
content={"error": "Cron job id is required."}, status_code=400
)
delete_cron_job(request.id, request.user_email)
scheduler.remove_job(request.id)
return JSONResponse(content={"message": "Cron job deleted successfully."})
@job_router.get("/cron-jobs")
async def get_cron_jobs_request(user: User = Depends(get_current_user)):
cron_jobs = get_cron_jobs(user.email)
return JSONResponse(content=jsonable_encoder(cron_jobs))
@job_router.get("/recordings/{id}")
async def get_recording(id: str):
path = RECORDINGS_DIR / f"{id}.mp4"
if not path.exists():
return JSONResponse(content={"error": "Recording not found."}, status_code=404)
return FileResponse(
path, headers={"Content-Type": "video/mp4", "Accept-Ranges": "bytes"}
)
@job_router.get("/get-media")
async def get_media(id: str):
try:
files: dict[str, list[str]] = {}
for media_type in MEDIA_TYPES:
path = MEDIA_DIR / media_type / f"{id}"
files[media_type] = [file.name for file in path.glob("*")]
return JSONResponse(content={"files": files})
except Exception as e:
LOG.error(f"Exception occurred: {e}")
traceback.print_exc()
return JSONResponse(content={"error": str(e)}, status_code=500)
@job_router.get("/media")
async def get_media_file(id: str, type: str, file: str):
path = MEDIA_DIR / type / f"{id}" / file
if not path.exists():
return JSONResponse(content={"error": "Media file not found."}, status_code=404)
return FileResponse(path)

View File

@@ -1,3 +1,4 @@
from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
# PDM
from apscheduler.schedulers.background import BackgroundScheduler
scheduler = BackgroundScheduler()

View File

@@ -0,0 +1,17 @@
from typing import Optional, Union
from datetime import datetime
import pydantic
class CronJob(pydantic.BaseModel):
id: Optional[str] = None
user_email: str
job_id: str
cron_expression: str
time_created: Optional[Union[datetime, str]] = None
time_updated: Optional[Union[datetime, str]] = None
class DeleteCronJob(pydantic.BaseModel):
id: str
user_email: str

View File

@@ -1,24 +1,24 @@
# STL
from typing import Any, Literal, Optional, Union
from datetime import datetime
# LOCAL
from api.backend.job.models.job_options import JobOptions
# PDM
import pydantic
class Element(pydantic.BaseModel):
name: str
xpath: str
url: Optional[str] = None
from api.backend.job.models import Element, CapturedElement
class CapturedElement(pydantic.BaseModel):
xpath: str
text: str
name: str
class Job(pydantic.BaseModel):
id: Optional[str] = None
url: str
elements: list[Element]
user: str = ""
time_created: Optional[Union[datetime, str]] = None
result: list[dict[str, dict[str, list[CapturedElement]]]] = []
job_options: JobOptions
status: str = "Queued"
chat: Optional[str] = None
agent_mode: bool = False
prompt: Optional[str] = None
favorite: bool = False
class RetrieveScrapeJobs(pydantic.BaseModel):
@@ -34,43 +34,7 @@ class DeleteScrapeJobs(pydantic.BaseModel):
ids: list[str]
class GetStatistics(pydantic.BaseModel):
user: str
class UpdateJobs(pydantic.BaseModel):
ids: list[str]
field: str
value: Any
class AI(pydantic.BaseModel):
messages: list[Any]
class Job(pydantic.BaseModel):
id: Optional[str] = None
url: str
elements: list[Element]
user: str = ""
time_created: Optional[Union[datetime, str]] = None
result: list[dict[str, dict[str, list[CapturedElement]]]] = []
job_options: JobOptions
status: str = "Queued"
chat: Optional[str] = None
agent_mode: bool = False
prompt: Optional[str] = None
class CronJob(pydantic.BaseModel):
id: Optional[str] = None
user_email: str
job_id: str
cron_expression: str
time_created: Optional[Union[datetime, str]] = None
time_updated: Optional[Union[datetime, str]] = None
class DeleteCronJob(pydantic.BaseModel):
id: str
user_email: str

View File

@@ -2,28 +2,30 @@
import logging
# PDM
from fastapi import APIRouter, Depends
from fastapi import Depends, APIRouter
# LOCAL
from api.backend.job import (
from api.backend.auth.schemas import User
from api.backend.auth.auth_utils import get_current_user
from api.backend.routers.handle_exceptions import handle_exceptions
from api.backend.database.queries.statistics.statistic_queries import (
get_jobs_per_day,
average_elements_per_link,
)
from api.backend.auth.auth_utils import get_current_user
from api.backend.schemas import User
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Statistics")
stats_router = APIRouter()
@stats_router.get("/statistics/get-average-element-per-link")
@handle_exceptions(logger=LOG)
async def get_average_element_per_link(user: User = Depends(get_current_user)):
return await average_elements_per_link(user.email)
@stats_router.get("/statistics/get-average-jobs-per-day")
@handle_exceptions(logger=LOG)
async def average_jobs_per_day(user: User = Depends(get_current_user)):
data = await get_jobs_per_day(user.email)
return data

View File

@@ -0,0 +1,63 @@
# STL
import os
import sqlite3
from typing import Generator
from unittest.mock import patch
# PDM
import pytest
from proxy import Proxy
# LOCAL
from api.backend.database.schema import INIT_QUERY
from api.backend.tests.constants import TEST_DB_PATH
@pytest.fixture(scope="session", autouse=True)
def running_proxy():
proxy = Proxy(["--hostname", "127.0.0.1", "--port", "8080"])
proxy.setup()
yield proxy
proxy.shutdown()
@pytest.fixture(scope="session", autouse=True)
def patch_database_path():
with patch("api.backend.database.common.DATABASE_PATH", TEST_DB_PATH):
yield
@pytest.fixture(scope="session", autouse=True)
def patch_recordings_enabled():
with patch("api.backend.job.scraping.scraping.RECORDINGS_ENABLED", False):
yield
@pytest.fixture(scope="session")
def test_db_path() -> str:
return TEST_DB_PATH
@pytest.fixture(scope="session", autouse=True)
def test_db(test_db_path: str) -> Generator[str, None, None]:
"""Create a fresh test database for each test function."""
os.makedirs(os.path.dirname(test_db_path), exist_ok=True)
if os.path.exists(test_db_path):
os.remove(test_db_path)
conn = sqlite3.connect(test_db_path)
cursor = conn.cursor()
for query in INIT_QUERY.strip().split(";"):
query = query.strip()
if query:
cursor.execute(query)
conn.commit()
conn.close()
yield test_db_path
if os.path.exists(test_db_path):
os.remove(test_db_path)

View File

@@ -0,0 +1 @@
TEST_DB_PATH = "tests/test_db.sqlite"

View File

@@ -1,7 +1,13 @@
from api.backend.models import Element, Job, JobOptions, CapturedElement
# STL
import uuid
# PDM
from faker import Faker
# LOCAL
from api.backend.job.models import Element, JobOptions, CapturedElement
from api.backend.schemas.job import Job
fake = Faker()

View File

@@ -1,8 +1,13 @@
# STL
from unittest.mock import AsyncMock, patch
# PDM
import pytest
from fastapi.testclient import TestClient
from unittest.mock import AsyncMock, patch
# LOCAL
from api.backend.app import app
from api.backend.models import DownloadJob
from api.backend.schemas.job import DownloadJob
from api.backend.tests.factories.job_factory import create_completed_job
client = TestClient(app)
@@ -13,8 +18,8 @@ mocked_random_int = 123456
@pytest.mark.asyncio
@patch("api.backend.routers.job_router.query")
@patch("api.backend.routers.job_router.random.randint")
@patch("api.backend.job.job_router.query")
@patch("api.backend.job.job_router.random.randint")
async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
# Ensure the mock returns immediately
mock_query.return_value = mock_results

View File

@@ -1,12 +1,26 @@
import pytest
# STL
import logging
from typing import Dict
from playwright.async_api import async_playwright, Cookie, Route
from datetime import datetime
# PDM
import pytest
from fastapi.testclient import TestClient
from playwright.async_api import Route, Cookie, async_playwright
# LOCAL
from api.backend.app import app
from api.backend.job.models import Proxy, Element, JobOptions
from api.backend.schemas.job import Job
from api.backend.database.common import query
from api.backend.job.scraping.scraping import scrape
from api.backend.job.scraping.add_custom import add_custom_items
logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger(__name__)
client = TestClient(app)
@pytest.mark.asyncio
async def test_add_custom_items():
@@ -51,3 +65,46 @@ async def test_add_custom_items():
assert captured_headers.get("user-agent") == "test-agent"
await browser.close()
@pytest.mark.asyncio
async def test_proxies():
job = Job(
url="https://example.com",
elements=[Element(xpath="//div", name="test")],
job_options=JobOptions(
proxies=[
Proxy(
server="127.0.0.1:8080",
username="user",
password="pass",
)
],
),
time_created=datetime.now().isoformat(),
)
response = client.post("/submit-scrape-job", json=job.model_dump())
assert response.status_code == 200
jobs = query("SELECT * FROM jobs")
job = jobs[0]
assert job is not None
assert job["job_options"]["proxies"] == [
{
"server": "127.0.0.1:8080",
"username": "user",
"password": "pass",
}
]
response = await scrape(
id=job["id"],
url=job["url"],
xpaths=[Element(**e) for e in job["elements"]],
job_options=job["job_options"],
)
example_response = response[0]["https://example.com/"]
assert example_response != {}

View File

@@ -0,0 +1,17 @@
# STL
import sqlite3
# LOCAL
from api.backend.database.schema import INIT_QUERY
from api.backend.tests.constants import TEST_DB_PATH
def connect_to_db():
conn = sqlite3.connect(TEST_DB_PATH)
cur = conn.cursor()
for query in INIT_QUERY.split(";"):
cur.execute(query)
conn.commit()
return conn, cur

View File

@@ -1,17 +1,10 @@
from typing import Any, Optional
# STL
import logging
import json
from typing import Optional
LOG = logging.getLogger(__name__)
def clean_text(text: str):
text = text.replace("\r\n", "\n") # Normalize newlines
text = text.replace("\n", "\\n") # Escape newlines
text = text.replace('"', '\\"') # Escape double quotes
return text
def get_log_level(level_name: Optional[str]) -> int:
level = logging.INFO
@@ -20,30 +13,3 @@ def get_log_level(level_name: Optional[str]) -> int:
level = getattr(logging, level_name, logging.INFO)
return level
def format_list_for_query(ids: list[str]):
return (
f"({','.join(['?' for _ in ids])})" # Returns placeholders, e.g., "(?, ?, ?)"
)
def format_sql_row_to_python(row: dict[str, Any]):
new_row: dict[str, Any] = {}
for key, value in row.items():
if isinstance(value, str):
try:
new_row[key] = json.loads(value)
except json.JSONDecodeError:
new_row[key] = value
else:
new_row[key] = value
return new_row
def format_json(items: list[Any]):
for idx, item in enumerate(items):
if isinstance(item, (dict, list)):
formatted_item = json.dumps(item)
items[idx] = formatted_item

View File

@@ -0,0 +1,17 @@
# STL
import os
from pathlib import Path
NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "")
NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "")
SCRAPERR_FRONTEND_URL = os.getenv("SCRAPERR_FRONTEND_URL", "")
EMAIL = os.getenv("EMAIL", "")
TO = os.getenv("TO", "")
SMTP_HOST = os.getenv("SMTP_HOST", "")
SMTP_PORT = int(os.getenv("SMTP_PORT", 587))
SMTP_USER = os.getenv("SMTP_USER", "")
SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
USE_TLS = os.getenv("USE_TLS", "false").lower() == "true"
RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
RECORDINGS_DIR = Path("/project/app/media/recordings")

View File

@@ -1,37 +1,34 @@
import os
# STL
import json
from pathlib import Path
from api.backend.job import get_queued_job, update_job
from api.backend.scraping import scrape
from api.backend.models import Element
from fastapi.encoders import jsonable_encoder
import subprocess
import asyncio
import traceback
import subprocess
from api.backend.database.startup import init_database
# PDM
from fastapi.encoders import jsonable_encoder
from api.backend.worker.post_job_complete.post_job_complete import post_job_complete
# LOCAL
from api.backend.job import update_job, get_queued_job
from api.backend.job.models import Element
from api.backend.worker.logger import LOG
from api.backend.ai.agent.agent import scrape_with_agent
NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "")
NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "")
SCRAPERR_FRONTEND_URL = os.getenv("SCRAPERR_FRONTEND_URL", "")
EMAIL = os.getenv("EMAIL", "")
TO = os.getenv("TO", "")
SMTP_HOST = os.getenv("SMTP_HOST", "")
SMTP_PORT = int(os.getenv("SMTP_PORT", 587))
SMTP_USER = os.getenv("SMTP_USER", "")
SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "")
USE_TLS = os.getenv("USE_TLS", "false").lower() == "true"
RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
RECORDINGS_DIR = Path("/project/app/media/recordings")
from api.backend.database.startup import init_database
from api.backend.worker.constants import (
TO,
EMAIL,
USE_TLS,
SMTP_HOST,
SMTP_PORT,
SMTP_USER,
SMTP_PASSWORD,
RECORDINGS_DIR,
RECORDINGS_ENABLED,
NOTIFICATION_CHANNEL,
SCRAPERR_FRONTEND_URL,
NOTIFICATION_WEBHOOK_URL,
)
from api.backend.job.scraping.scraping import scrape
from api.backend.worker.post_job_complete.post_job_complete import post_job_complete
async def process_job():
@@ -84,12 +81,7 @@ async def process_job():
job["id"],
job["url"],
[Element(**j) for j in job["elements"]],
job["job_options"]["custom_headers"],
job["job_options"]["multi_page_scrape"],
proxies,
job["job_options"]["site_map"],
job["job_options"]["collect_media"],
job["job_options"]["custom_cookies"],
{**job["job_options"], "proxies": proxies},
)
LOG.info(


@@ -1,12 +1,13 @@
# STL
import logging
import os
from api.backend.utils import get_log_level
# LOCAL
from api.backend.app import LOG_LEVEL
logging.basicConfig(
level=get_log_level(os.getenv("LOG_LEVEL")),
format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
level=LOG_LEVEL,
format="%(levelname)s: %(asctime)s - [%(name)s] - %(message)s",
handlers=[logging.StreamHandler()],
)
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("Job Worker")


@@ -1,5 +1,7 @@
# STL
from typing import Any
# LOCAL
from api.backend.worker.post_job_complete.models import PostJobCompleteOptions
from api.backend.worker.post_job_complete.email_notifcation import (
send_job_complete_email,
@@ -16,9 +18,10 @@ async def post_job_complete(job: dict[str, Any], options: PostJobCompleteOptions
if not options.values():
return
if options["channel"] == "discord":
discord_notification(job, options)
elif options["channel"] == "email":
send_job_complete_email(job, options)
else:
raise ValueError(f"Invalid channel: {options['channel']}")
match options["channel"]:
case "discord":
discord_notification(job, options)
case "email":
send_job_complete_email(job, options)
case _:
raise ValueError(f"Invalid channel: {options['channel']}")


@@ -0,0 +1,23 @@
describe("Global setup", () => {
it("signs up user once", () => {
cy.request({
method: "POST",
url: "/api/signup",
body: JSON.stringify({
data: {
email: "test@test.com",
password: "password",
full_name: "John Doe",
},
}),
headers: {
"Content-Type": "application/json",
},
failOnStatusCode: false,
}).then((response) => {
if (response.status !== 200 && response.status !== 201) {
console.warn("Signup failed:", response.status, response.body);
}
});
});
});
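A possible follow-up (a sketch, not part of this diff): the same idempotent signup could live in a reusable Cypress custom command so any spec can guarantee the test user exists before it runs. The command name ensureTestUser is hypothetical.

// cypress/support/commands.ts (hypothetical): idempotent signup as a command.
// A matching type declaration would be needed for TypeScript to know about it.
Cypress.Commands.add("ensureTestUser", () => {
  cy.request({
    method: "POST",
    url: "/api/signup",
    // Passing an object lets cy.request set the JSON Content-Type itself.
    body: {
      data: { email: "test@test.com", password: "password", full_name: "John Doe" },
    },
    // Tolerate "user already exists" responses so re-runs stay green.
    failOnStatusCode: false,
  });
});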


@@ -0,0 +1,101 @@
import { login } from "../utilities/authentication.utils";
import {
addCustomHeaders,
addElement,
addMedia,
addSiteMapAction,
checkForMedia,
cleanUpJobs,
enterJobUrl,
openAdvancedJobOptions,
submitBasicJob,
submitJob,
waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockSubmitJob } from "../utilities/mocks";
describe.only("Advanced Job Options", () => {
beforeEach(() => {
mockSubmitJob();
login();
cy.visit("/");
});
afterEach(() => {
cleanUpJobs();
});
it.only("should handle custom headers", () => {
const customHeaders = {
"User-Agent": "Test Agent",
"Accept-Language": "en-US",
};
addCustomHeaders(customHeaders);
submitBasicJob("https://httpbin.org/headers", "headers", "//pre");
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
expect(
interception.request?.body.data.job_options.custom_headers
).to.deep.equal(customHeaders);
});
waitForJobCompletion("https://httpbin.org/headers");
});
it("should handle site map actions", () => {
addSiteMapAction("click", "//button[contains(text(), 'Load More')]");
addSiteMapAction("input", "//input[@type='search']", "test search");
submitBasicJob("https://example.com", "content", "//div[@class='content']");
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
const siteMap = interception.request?.body.data.job_options.site_map;
expect(siteMap.actions).to.have.length(2);
expect(siteMap.actions[0].type).to.equal("click");
expect(siteMap.actions[1].type).to.equal("input");
});
waitForJobCompletion("https://example.com");
});
it("should handle multiple elements", () => {
enterJobUrl("https://books.toscrape.com");
addElement("titles", "//h3");
addElement("prices", "//p[@class='price_color']");
submitJob();
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
expect(interception.request?.body.data.elements).to.have.length(2);
});
waitForJobCompletion("https://books.toscrape.com");
});
it.only("should handle collecting media", () => {
enterJobUrl("https://books.toscrape.com");
openAdvancedJobOptions();
addMedia();
cy.get("body").type("{esc}");
addElement("images", "//img");
submitJob();
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
expect(interception.request?.body.data.job_options.collect_media).to.be
.true;
});
waitForJobCompletion("https://books.toscrape.com");
checkForMedia();
});
});

cypress/e2e/agent.cy.ts Normal file

@@ -0,0 +1,35 @@
import { login } from "../utilities/authentication.utils";
import {
buildAgentJob,
cleanUpJobs,
submitJob,
waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockSubmitJob } from "../utilities/mocks";
describe.only("Agent", () => {
beforeEach(() => {
mockSubmitJob();
login();
cy.visit("/agent");
});
afterEach(() => {
cleanUpJobs();
});
it("should be able to scrape some data", () => {
const url = "https://books.toscrape.com";
const prompt = "Collect all the links on the page";
buildAgentJob(url, prompt);
submitJob();
cy.wait("@submitScrapeJob").then((interception) => {
expect(interception.response?.statusCode).to.eq(200);
expect(interception.request?.body.data.url).to.eq(url);
expect(interception.request?.body.data.prompt).to.eq(prompt);
});
waitForJobCompletion("https://books.toscrape.com");
});
});


@@ -1,60 +1,61 @@
describe("Authentication", () => {
it("should register", () => {
cy.intercept("POST", "/api/signup").as("signup");
import { faker } from "@faker-js/faker";
import { mockLogin, mockSignup } from "../utilities/mocks";
cy.visit("/").then(() => {
cy.get("button").contains("Login").click();
cy.url().should("include", "/login");
const mockEmail = faker.internet.email();
const mockPassword = faker.internet.password();
cy.get("form").should("be.visible");
cy.get("button")
.contains("No Account? Sign up")
.should("be.visible")
.click();
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("input[name='fullName']").type("John Doe");
cy.get("button[type='submit']").contains("Signup").click();
cy.wait("@signup").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("signup request did not return a response");
}
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
expect(interception.response.statusCode).to.eq(200);
});
});
describe.only("Authentication", () => {
beforeEach(() => {
cy.visit("/");
mockSignup();
mockLogin();
});
it("should login", () => {
cy.intercept("POST", "/api/token").as("token");
it("should register", () => {
cy.get("button").contains("Login").click();
cy.url().should("include", "/login");
cy.visit("/").then(() => {
cy.get("button")
.contains("Login")
.click()
.then(() => {
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("button[type='submit']").contains("Login").click();
cy.get("form").should("be.visible");
cy.wait("@token").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("token request did not return a response");
}
cy.get("button")
.contains("No Account? Sign up")
.should("be.visible")
.click();
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
cy.get("input[name='email']").type(mockEmail);
cy.get("input[name='password']").type(mockPassword);
cy.get("input[name='fullName']").type(faker.person.fullName());
cy.get("button[type='submit']").contains("Signup").click();
expect(interception.response.statusCode).to.eq(200);
});
});
cy.wait("@signup").then((interception) => {
if (!interception.response) {
throw new Error("signup request did not return a response");
}
expect(interception.response.statusCode).to.eq(200);
});
});
});
it("should login", () => {
cy.intercept("POST", "/api/token").as("token");
cy.visit("/").then(() => {
cy.get("button")
.contains("Login")
.click()
.then(() => {
cy.get("input[name='email']").type(mockEmail);
cy.get("input[name='password']").type(mockPassword);
cy.get("button[type='submit']").contains("Login").click();
cy.wait("@token").then((interception) => {
if (!interception.response) {
throw new Error("token request did not return a response");
}
expect(interception.response.statusCode).to.eq(200);
});
});
});
});

cypress/e2e/chat.cy.ts Normal file

@@ -0,0 +1,34 @@
import { login } from "../utilities/authentication.utils";
import {
cleanUpJobs,
selectJobFromSelector,
submitBasicJob,
waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockLogin } from "../utilities/mocks";
describe.only("Chat", () => {
beforeEach(() => {
mockLogin();
login();
cy.visit("/");
});
afterEach(() => {
cleanUpJobs();
});
it.only("should be able to chat", () => {
const url = "https://books.toscrape.com";
submitBasicJob(url, "test", "//body");
waitForJobCompletion(url);
cy.visit("/chat");
selectJobFromSelector();
cy.get("[data-cy='message-input']").type("Hello");
cy.get("[data-cy='send-message']").click();
cy.get("[data-cy='ai-message']").should("exist");
});
});


@@ -1,88 +1,37 @@
import { login } from "../utilities/authentication.utils";
import {
addElement,
cleanUpJobs,
enterJobUrl,
submitJob,
waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockSubmitJob } from "../utilities/mocks";
describe.only("Job", () => {
it("should create a job", () => {
cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
beforeEach(() => {
mockSubmitJob();
login();
cy.visit("/");
});
cy.get('[data-cy="url-input"]').type("https://example.com");
cy.get('[data-cy="name-field"]').type("example");
cy.get('[data-cy="xpath-field"]').type("//body");
cy.get('[data-cy="add-button"]').click();
afterEach(() => {
cleanUpJobs();
});
cy.contains("Submit").click();
it("should create a job", () => {
enterJobUrl("https://books.toscrape.com");
addElement("body", "//body");
submitJob();
cy.wait("@submitScrapeJob").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
cy.log("Request body: " + JSON.stringify(interception.request?.body));
throw new Error("submitScrapeJob request did not return a response");
}
cy.log("Response status: " + interception.response.statusCode);
cy.log("Response body: " + JSON.stringify(interception.response.body));
expect(interception.response.statusCode).to.eq(200);
});
cy.get("li").contains("Jobs").click();
cy.contains("div", "https://example.com", { timeout: 10000 }).should(
"exist"
);
cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
cy.get("tbody tr")
.first()
.within(() => {
cy.get('input[type="checkbox"]').click();
});
cy.get("[data-testid='DeleteIcon']").click();
cy.contains("div", "https://example.com", { timeout: 10000 }).should(
"not.exist"
);
});
it("should create a job with advanced options (media)", () => {
cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
cy.visit("/");
cy.get("button").contains("Advanced Job Options").click();
cy.get('[data-cy="collect-media-checkbox"]').click();
cy.get("body").type("{esc}");
cy.get('[data-cy="url-input"]').type("https://books.toscrape.com");
cy.get('[data-cy="name-field"]').type("example");
cy.get('[data-cy="xpath-field"]').type("//body");
cy.get('[data-cy="add-button"]').click();
cy.get("button").contains("Submit").click();
cy.get("li").contains("Jobs").click();
cy.contains("div", "https://books.toscrape.com", { timeout: 10000 }).should(
"exist"
);
cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
cy.get("li").contains("Media").click();
cy.get("div[id='select-job']").click();
cy.get("li[role='option']").click();
cy.get("[data-testid='media-grid']", { timeout: 10000 }).should("exist");
cy.get("li").contains("Jobs").click();
cy.get("tbody tr")
.first()
.within(() => {
cy.get('input[type="checkbox"]').click();
});
cy.get("[data-testid='DeleteIcon']").click();
waitForJobCompletion("https://books.toscrape.com");
});
});


@@ -14,7 +14,7 @@
// ***********************************************************
// Import commands.js using ES2015 syntax:
import './commands'
import "./commands";
// Alternatively you can use CommonJS syntax:
// require('./commands')
// require('./commands')


@@ -0,0 +1,68 @@
export const signup = () => {
cy.intercept("POST", "/api/token").as("token");
cy.visit("/").then(() => {
cy.get("button").contains("Login").click();
cy.url().should("include", "/login");
cy.get("form").should("be.visible");
cy.get("button")
.contains("No Account? Sign up")
.should("be.visible")
.click();
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("input[name='fullName']").type("John Doe");
cy.get("button[type='submit']").contains("Signup").click();
cy.wait("@token").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("token request did not return a response");
}
});
});
};
export const login = () => {
cy.intercept("POST", "/api/token").as("token");
cy.intercept("GET", "/api/me").as("me");
cy.intercept("GET", "/api/check").as("check");
cy.visit("/").then(() => {
cy.get("body").then(() => {
cy.get("button")
.contains("Login")
.click()
.then(() => {
cy.get("input[name='email']").type("test@test.com");
cy.get("input[name='password']").type("password");
cy.get("button[type='submit']").contains("Login").click();
cy.wait("@token").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("token request did not return a response");
}
});
cy.wait("@me").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("me request did not return a response");
}
});
cy.wait("@check").then((interception) => {
if (!interception.response) {
cy.log("No response received!");
throw new Error("check request did not return a response");
}
});
cy.url().should("not.include", "/login");
});
});
});
};
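Since login() registers and awaits its own intercepts for /api/token, /api/me, and /api/check, callers only need to invoke it before exercising a protected page. A minimal usage sketch (the spec name and assertion are illustrative, not from this diff):

import { login } from "../utilities/authentication.utils";

describe("Authenticated area", () => {
  beforeEach(() => {
    // login() visits "/", fills the form, and waits on @token, @me, and @check.
    login();
  });

  it("lands outside the login page", () => {
    cy.url().should("not.include", "/login");
  });
});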


@@ -0,0 +1,151 @@
export const cleanUpJobs = () => {
cy.intercept("POST", "/api/retrieve").as("retrieve");
cy.visit("/jobs");
cy.wait("@retrieve", { timeout: 15000 });
cy.get("tbody tr", { timeout: 10000 }).should("have.length.at.least", 1);
const tryClickSelectAll = (attempt = 1, maxAttempts = 5) => {
cy.log(`Attempt ${attempt} to click Select All`);
cy.get('[data-testid="select-all"]')
.closest("button")
.then(($btn) => {
// Retry if button is disabled
if ($btn.is(":disabled") || $btn.css("pointer-events") === "none") {
if (attempt < maxAttempts) {
cy.wait(1000).then(() =>
tryClickSelectAll(attempt + 1, maxAttempts)
);
} else {
throw new Error(
"Select All button is still disabled after max retries"
);
}
} else {
// Click and then verify if checkbox is checked
cy.wrap($btn)
.click({ force: true })
.then(() => {
cy.get("tbody tr")
.first()
.find("td")
.first()
.find("input[type='checkbox']")
.should("be.checked")
.then(() => {
cy.log("Select All successful");
});
});
// Handle failure case
cy.on("fail", () => {
cy.log("Error clicking Select All");
if (attempt < maxAttempts) {
cy.wait(1000).then(() =>
tryClickSelectAll(attempt + 1, maxAttempts)
);
} else {
throw new Error(
"Checkbox was never checked after clicking Select All"
);
}
return false; // Prevent Cypress from failing the test
});
}
});
};
tryClickSelectAll();
cy.get('[data-testid="DeleteIcon"]', { timeout: 10000 })
.closest("button")
.should("not.be.disabled")
.click();
};
export const submitBasicJob = (url: string, name: string, xpath: string) => {
cy.get('[data-cy="url-input"]').type(url);
cy.get('[data-cy="name-field"]').type(name);
cy.get('[data-cy="xpath-field"]').type(xpath);
cy.get('[data-cy="add-button"]').click();
cy.contains("Submit").click();
};
export const waitForJobCompletion = (url: string) => {
cy.visit("/jobs");
cy.contains("div", url, { timeout: 10000 }).should("exist");
cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
};
export const enableMultiPageScraping = () => {
cy.get("button").contains("Advanced Job Options").click();
cy.get('[data-cy="multi-page-toggle"]').click();
cy.get("body").type("{esc}");
};
export const addCustomHeaders = (headers: Record<string, string>) => {
cy.get("button").contains("Advanced Job Options").click();
cy.get('[name="custom_headers"]').type(JSON.stringify(headers), {
parseSpecialCharSequences: false,
});
cy.get("body").type("{esc}");
};
export const addCustomCookies = (cookies: Record<string, string>) => {
cy.get("button").contains("Advanced Job Options").click();
cy.get('[name="custom_cookies"]').type(JSON.stringify(cookies));
cy.get("body").type("{esc}");
};
export const openAdvancedJobOptions = () => {
cy.get("button").contains("Advanced Job Options").click();
};
export const selectJobFromSelector = () => {
cy.get("div[id='select-job']").click();
cy.get("li[role='option']").click().first();
};
export const addMedia = () => {
cy.get('[data-cy="collect-media-checkbox"]').click();
};
export const checkForMedia = () => {
cy.visit("/media");
selectJobFromSelector();
cy.get("[data-testid='media-grid']", { timeout: 10000 }).should("exist");
};
export const addSiteMapAction = (
type: "click" | "input",
xpath: string,
input?: string
) => {
cy.get("button").contains("Create Site Map").click();
cy.get('[data-cy="site-map-select"]').select(type);
cy.get('[data-cy="site-map-xpath"]').type(xpath);
if (type === "input" && input) {
cy.get('[data-cy="site-map-input"]').type(input);
}
cy.get('[data-cy="add-site-map-action"]').click();
};
export const addElement = (name: string, xpath: string) => {
cy.get('[data-cy="name-field"]').type(name);
cy.get('[data-cy="xpath-field"]').type(xpath);
cy.get('[data-cy="add-button"]').click();
};
export const buildAgentJob = (url: string, prompt: string) => {
enterJobUrl(url);
cy.get("[data-cy='prompt-input']").type(prompt);
};
export const submitJob = () => {
cy.get("button").contains("Submit").click();
};
export const enterJobUrl = (url: string) => {
cy.get('[data-cy="url-input"]').type(url);
};
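These helpers are meant to compose into short specs: mock, log in, submit, wait, clean up. A minimal sketch of that shape (the spec itself is illustrative; the helpers and target site come from this diff):

import { login } from "../utilities/authentication.utils";
import {
  cleanUpJobs,
  submitBasicJob,
  waitForJobCompletion,
} from "../utilities/job.utilities";
import { mockSubmitJob } from "../utilities/mocks";

describe("Basic scrape", () => {
  beforeEach(() => {
    mockSubmitJob(); // alias the submit endpoint so the spec can wait on it
    login();
    cy.visit("/");
  });

  afterEach(() => {
    cleanUpJobs(); // select-all + delete keeps later specs independent
  });

  it("scrapes the page body", () => {
    submitBasicJob("https://books.toscrape.com", "body", "//body");
    waitForJobCompletion("https://books.toscrape.com");
  });
});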


@@ -0,0 +1,15 @@
export const mockSubmitJob = () => {
cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
};
export const mockToken = () => {
cy.intercept("POST", "/api/token").as("token");
};
export const mockSignup = () => {
cy.intercept("POST", "/api/signup").as("signup");
};
export const mockLogin = () => {
cy.intercept("POST", "/api/token").as("token");
};
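Each helper just registers a cy.intercept alias; the payoff is in the specs, which wait on the alias to assert against the real request and response. A minimal sketch of the pattern:

import { submitJob } from "../utilities/job.utilities";
import { mockSubmitJob } from "../utilities/mocks";

// Inside a spec:
mockSubmitJob(); // registers the @submitScrapeJob alias before the action
submitJob();
cy.wait("@submitScrapeJob")
  .its("response.statusCode")
  .should("eq", 200);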


@@ -0,0 +1 @@
export * from "./authentication.utils";


@@ -15,6 +15,7 @@ services:
image: jpyles0524/scraperr_api:latest
environment:
- LOG_LEVEL=INFO
- OPENAI_KEY=${OPENAI_KEY}
container_name: scraperr_api
ports:
- 8000:8000


@@ -36,6 +36,7 @@
"react-router": "^6.14.1",
"react-router-dom": "^6.14.1",
"react-spinners": "^0.14.1",
"react-toastify": "^11.0.5",
"redux-persist": "^6.0.0",
"typescript": "^4.9.5",
"web-vitals": "^2.1.4"
@@ -67,6 +68,7 @@
]
},
"devDependencies": {
"@faker-js/faker": "^9.8.0",
"@types/cypress": "^1.1.6",
"@types/js-cookie": "^3.0.6",
"autoprefixer": "^10.4.21",

pdm.lock generated

@@ -5,7 +5,7 @@
groups = ["default", "dev"]
strategy = ["inherit_metadata"]
lock_version = "4.5.0"
content_hash = "sha256:5f4c90b42c3b35194a7c2af8b46b7c28127e25e836a779e85aae0df2bd0e69eb"
content_hash = "sha256:1a65c1e288d2c6827fc6866d3bfe6a9b8707b2ca895d488f4a9b11cd579c4359"
[[metadata.targets]]
requires_python = ">=3.10"
@@ -2314,6 +2314,17 @@ files = [
{file = "propcache-0.3.1.tar.gz", hash = "sha256:40d980c33765359098837527e18eddefc9a24cea5b45e078a7f3bb5b032c6ecf"},
]
[[package]]
name = "proxy-py"
version = "2.4.10"
requires_python = ">=3.6"
summary = "\\u26a1 Fast \\u2022 \\U0001fab6 Lightweight \\u2022 \\U0001f51f Dependency \\u2022 \\U0001f50c Pluggable \\u2022 \\U0001f608 TLS interception \\u2022 \\U0001f512 DNS-over-HTTPS \\u2022 \\U0001f525 Poor Mans VPN \\u2022 \\u23ea Reverse & \\u23e9 Forward \\u2022 \\U0001f46e\\U0001f3ff Proxy Server framework \\u2022 \\U0001f310 Web Server framework \\u2022 \\u27b5 \\u27b6 \\u27b7 \\u27a0 PubSub framework \\u2022 \\U0001f477 Work acceptor & executor framework."
groups = ["default"]
files = [
{file = "proxy.py-2.4.10-py3-none-any.whl", hash = "sha256:ef3a31f6ef3be6ff78559c0e68198523bfe2fb1e820bb16686750c1bb5baf9e8"},
{file = "proxy_py-2.4.10.tar.gz", hash = "sha256:41b9e9d3aae6f80e2304d3726e8e9c583a510d8de224eada53d115f48a63a9ce"},
]
[[package]]
name = "ptyprocess"
version = "0.7.0"


@@ -42,6 +42,7 @@ dependencies = [
"playwright>=1.52.0",
"camoufox>=0.4.11",
"html2text>=2025.4.15",
"proxy-py>=2.4.10",
]
requires-python = ">=3.10"
readme = "README.md"
@@ -98,9 +99,9 @@ strictSetInference = true
[tool.isort]
length_sort = "1"
length_sort = true
profile = "black"
sections = "STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER"
sections = ["STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"]
import_heading_stdlib = "STL"
import_heading_thirdparty = "PDM"
import_heading_firstparty = "LOCAL"


@@ -1,5 +0,0 @@
import React from "react";
export const Chat = () => {
return <h1>Chat</h1>;
};


@@ -1,2 +0,0 @@
export * from "./Chat";
export * from "./JobSelector";


@@ -15,6 +15,7 @@ export const AdvancedJobOptions = ({
multiPageScrapeEnabled = true,
}: AdvancedJobOptionsProps) => {
const [open, setOpen] = useState(false);
return (
<Box sx={{ mb: 2 }}>
<Link
@@ -36,6 +37,7 @@ export const AdvancedJobOptions = ({
>
<Typography variant="body2">Advanced Job Options</Typography>
</Link>
<AdvancedJobOptionsDialog
open={open}
onClose={() => setOpen(false)}


@@ -1,3 +1,11 @@
import { ExpandedTableInput } from "@/components/common/expanded-table-input";
import { RawJobOptions } from "@/types";
import {
Code as CodeIcon,
ExpandMore as ExpandMoreIcon,
InfoOutlined,
Settings,
} from "@mui/icons-material";
import {
Accordion,
AccordionDetails,
@@ -17,15 +25,7 @@ import {
Typography,
useTheme,
} from "@mui/material";
import {
ExpandMore as ExpandMoreIcon,
InfoOutlined,
Code as CodeIcon,
Settings,
} from "@mui/icons-material";
import { Dispatch, SetStateAction } from "react";
import { RawJobOptions } from "@/types";
import { ExpandedTableInput } from "../../expanded-table-input";
import { Dispatch, SetStateAction, useEffect, useState } from "react";
export type AdvancedJobOptionsDialogProps = {
open: boolean;
@@ -43,31 +43,45 @@ export const AdvancedJobOptionsDialog = ({
multiPageScrapeEnabled = true,
}: AdvancedJobOptionsDialogProps) => {
const theme = useTheme();
const [localJobOptions, setLocalJobOptions] =
useState<RawJobOptions>(jobOptions);
// Update local state when prop changes
useEffect(() => {
setLocalJobOptions(jobOptions);
}, [jobOptions]);
const handleMultiPageScrapeChange = () => {
setJobOptions((prevJobOptions) => ({
setLocalJobOptions((prevJobOptions) => ({
...prevJobOptions,
multi_page_scrape: !prevJobOptions.multi_page_scrape,
}));
};
const handleProxiesChange = (e: React.ChangeEvent<HTMLInputElement>) => {
setJobOptions((prevJobOptions) => ({
setLocalJobOptions((prevJobOptions) => ({
...prevJobOptions,
proxies: e.target.value,
}));
};
const handleCollectMediaChange = () => {
setJobOptions((prevJobOptions) => ({
setLocalJobOptions((prevJobOptions) => ({
...prevJobOptions,
collect_media: !prevJobOptions.collect_media,
}));
};
const handleClose = () => {
// Save the local state back to the parent before closing
setJobOptions(localJobOptions);
onClose();
};
return (
<Dialog
open={open}
onClose={onClose}
onClose={handleClose}
maxWidth="md"
fullWidth
PaperProps={{
@@ -122,7 +136,7 @@ export const AdvancedJobOptionsDialog = ({
<FormControlLabel
control={
<Checkbox
checked={jobOptions.multi_page_scrape}
checked={localJobOptions.multi_page_scrape}
onChange={handleMultiPageScrapeChange}
disabled={!multiPageScrapeEnabled}
/>
@@ -147,7 +161,7 @@ export const AdvancedJobOptionsDialog = ({
<FormControlLabel
control={
<Checkbox
checked={jobOptions.collect_media}
checked={localJobOptions.collect_media}
onChange={handleCollectMediaChange}
data-cy="collect-media-checkbox"
/>
@@ -233,7 +247,7 @@ export const AdvancedJobOptionsDialog = ({
fullWidth
variant="outlined"
size="small"
value={jobOptions.proxies}
value={localJobOptions.proxies}
onChange={handleProxiesChange}
InputProps={{
startAdornment: (
@@ -251,8 +265,9 @@ export const AdvancedJobOptionsDialog = ({
label="Custom Headers"
placeholder='{"User-Agent": "CustomAgent", "Accept": "*/*"}'
urlParam="custom_headers"
name="custom_headers"
onChange={(value) => {
setJobOptions((prevJobOptions) => ({
setLocalJobOptions((prevJobOptions) => ({
...prevJobOptions,
custom_headers: value,
}));
@@ -264,8 +279,9 @@ export const AdvancedJobOptionsDialog = ({
label="Custom Cookies"
placeholder='[{"name": "value", "name2": "value2"}]'
urlParam="custom_cookies"
name="custom_cookies"
onChange={(value) => {
setJobOptions((prevJobOptions) => ({
setLocalJobOptions((prevJobOptions) => ({
...prevJobOptions,
custom_cookies: value,
}));
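The dialog's local-state pattern in miniature (a generic sketch, not the dialog itself): edits accumulate in a draft copy, the draft resyncs when the prop changes, and changes are flushed to the parent only on close. The useDraft name and shape are hypothetical.

import { useEffect, useState } from "react";

type DraftProps<T> = {
  value: T;
  onCommit: (next: T) => void;
};

// Hold a draft copy of a prop, resync when it changes, commit on demand.
export function useDraft<T>({ value, onCommit }: DraftProps<T>) {
  const [draft, setDraft] = useState<T>(value);

  useEffect(() => {
    setDraft(value); // mirror the parent's value whenever it changes
  }, [value]);

  const commit = () => onCommit(draft); // flush the draft back to the parent

  return { draft, setDraft, commit };
}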


@@ -1,17 +1,17 @@
import React, { useState } from "react";
import {
alpha,
Box,
Paper,
Table,
TableBody,
TableCell,
TableContainer,
TableHead,
TableRow,
Paper,
Box,
Typography,
useTheme,
alpha,
} from "@mui/material";
import React, { useState } from "react";
export type CsvRow = {
[key: string]: string;


@@ -1,28 +1,29 @@
import { parseJsonToEntries } from "@/lib/helpers/parse-json-to-entries";
import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
import {
Accordion,
AccordionSummary,
TableCell,
TableRow,
Paper,
TableBody,
useTheme,
TextField,
Box,
Typography,
AccordionDetails,
TableHead,
TableContainer,
AccordionSummary,
Box,
Paper,
Table,
TableBody,
TableCell,
TableContainer,
TableHead,
TableRow,
TextField,
Typography,
useTheme,
} from "@mui/material";
import { useEffect, useState } from "react";
import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
import { parseJsonToEntries } from "@/lib/helpers/parse-json-to-entries";
export type ExpandedTableInputProps = {
label: string;
onChange: (value: any) => void;
placeholder: string;
urlParam: string;
name: string;
};
export const ExpandedTableInput = ({
@@ -30,6 +31,7 @@ export const ExpandedTableInput = ({
onChange,
placeholder,
urlParam,
name,
}: ExpandedTableInputProps) => {
const theme = useTheme();
const [value, setValue] = useState("");
@@ -150,6 +152,7 @@ export const ExpandedTableInput = ({
size="small"
error={jsonError !== null}
helperText={jsonError ?? ""}
name={name}
/>
{parsedHeaders && parsedHeaders.length > 0 && (


@@ -1,16 +1,16 @@
import { useDownloadJob } from "@/hooks/use-download-job";
import {
Dialog,
DialogTitle,
DialogContent,
DialogActions,
Button,
FormControl,
RadioGroup,
FormControlLabel,
Radio,
FormLabel,
Typography,
Box,
Button,
Dialog,
DialogContent,
DialogTitle,
FormControl,
FormControlLabel,
FormLabel,
Radio,
RadioGroup,
Typography,
} from "@mui/material";
import { useState } from "react";
@@ -26,27 +26,10 @@ export const JobDownloadDialog = ({
ids,
}: JobDownloadDialogProps) => {
const [jobFormat, setJobFormat] = useState<string>("csv");
const handleDownload = async () => {
const response = await fetch("/api/download", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ data: { ids: ids, job_format: jobFormat } }),
});
const { downloadJob } = useDownloadJob();
if (response.ok) {
const blob = await response.blob();
const url = window.URL.createObjectURL(blob);
const a = document.createElement("a");
a.style.display = "none";
a.href = url;
a.download = `job_${ids[0]}.${jobFormat}`;
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
} else {
console.error("Failed to download the file.");
}
const handleDownload = () => {
downloadJob(ids, jobFormat);
};
return (


@@ -0,0 +1 @@
export * from "./job-selector";


@@ -1,17 +1,17 @@
import React, { useState, Dispatch, useEffect } from "react";
import { Job } from "../../types";
import Box from "@mui/material/Box";
import InputLabel from "@mui/material/InputLabel";
import FormControl from "@mui/material/FormControl";
import Select from "@mui/material/Select";
import Popover from "@mui/material/Popover";
import { Job } from "@/types";
import {
Typography,
MenuItem,
useTheme,
ClickAwayListener,
MenuItem,
SxProps,
Typography,
useTheme,
} from "@mui/material";
import { SxProps } from "@mui/material";
import Box from "@mui/material/Box";
import FormControl from "@mui/material/FormControl";
import InputLabel from "@mui/material/InputLabel";
import Popover from "@mui/material/Popover";
import Select from "@mui/material/Select";
import React, { Dispatch, useEffect, useState } from "react";
interface Props {
sxProps?: SxProps;
@@ -19,7 +19,6 @@ interface Props {
| Dispatch<React.SetStateAction<Job | null>>
| ((job: Job) => void);
selectedJob: Job | null;
setJobs: Dispatch<React.SetStateAction<Job[]>>;
jobs: Job[];
}
@@ -27,7 +26,6 @@ export const JobSelector = ({
sxProps,
selectedJob,
setSelectedJob,
setJobs,
jobs,
}: Props) => {
const [anchorEl, setAnchorEl] = useState<HTMLElement | null>(null);


@@ -1,7 +1,6 @@
"use client";
import React from "react";
import { useAuth } from "../../../contexts/AuthContext";
import { Box, Drawer } from "@mui/material";
import { QuickSettings } from "../../nav/quick-settings";
@@ -21,8 +20,6 @@ export const NavDrawer: React.FC<NavDrawerProps> = ({
toggleTheme,
isDarkMode,
}) => {
const { logout, user, isAuthenticated } = useAuth();
return (
<Drawer
variant="permanent"
@@ -48,12 +45,7 @@ export const NavDrawer: React.FC<NavDrawerProps> = ({
<NavItems />
</div>
<div>
<UserControl
isAuthenticated={isAuthenticated}
user={user}
logout={logout}
className={classes.userControl}
/>
<UserControl className={classes.userControl} />
<QuickSettings toggleTheme={toggleTheme} isDarkMode={isDarkMode} />
</div>
</Box>


@@ -1,7 +1,7 @@
.welcome {
margin: 0.25rem;
margin: 0.5rem !important;
}
.userControlButton {
width: 100%;
}
}


@@ -1,18 +1,23 @@
import React from "react";
import { Typography, Button } from "@mui/material";
import classes from "./logged-in-control.module.css";
import { useUser } from "@/store/hooks";
import router from "next/router";
import { useAuth } from "@/hooks/use-auth";
type LoggedInControlProps = {
user: any;
logout: () => void;
children?: React.ReactNode;
};
export const LoggedInControl = ({
user,
logout,
children,
}: LoggedInControlProps) => {
export const LoggedInControl = ({ children }: LoggedInControlProps) => {
const { user } = useUser();
const { logout } = useAuth();
const handleLogout = () => {
logout();
router.push("/");
};
if (children) {
return <>{children}</>;
}
@@ -24,7 +29,7 @@ export const LoggedInControl = ({
</Typography>
<Button
variant="contained"
onClick={logout}
onClick={handleLogout}
className={classes.userControlButton}
>
Logout


@@ -5,30 +5,25 @@ import clsx from "clsx";
import classes from "./user-control.module.css";
import { LoggedInControl } from "./logged-in-control";
import { LoggedOutControl } from "./logged-out-control";
import { useUser } from "@/store/hooks";
export type UserControlProps = {
isAuthenticated: boolean;
user: any;
logout: () => void;
loggedInChildren?: React.ReactNode;
loggedOutChildren?: React.ReactNode;
className?: string;
};
export const UserControl = ({
isAuthenticated,
user,
logout,
loggedInChildren,
loggedOutChildren,
className,
}: UserControlProps) => {
const { user } = useUser();
return (
<Box className={clsx(classes.userControl, className)}>
{isAuthenticated ? (
<LoggedInControl user={user} logout={logout}>
{loggedInChildren}
</LoggedInControl>
{user.isAuthenticated ? (
<LoggedInControl>{loggedInChildren}</LoggedInControl>
) : (
<LoggedOutControl>{loggedOutChildren}</LoggedOutControl>
)}


@@ -0,0 +1,21 @@
import { Snackbar, Alert } from "@mui/material";
type ErrorSnackbarProps = {
open: boolean;
onClose: () => void;
message: string;
};
export const ErrorSnackbar = ({
open,
onClose,
message,
}: ErrorSnackbarProps) => {
if (!open) return null;
return (
<Snackbar open={open} autoHideDuration={6000} onClose={onClose}>
<Alert severity="error">{message}</Alert>
</Snackbar>
);
};
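A hypothetical usage sketch (the wrapping component and message are illustrative): the snackbar is fully controlled by its parent, which owns the open flag and clears it on close.

import { useState } from "react";
import { ErrorSnackbar } from "@/components/common/snackbars";

export const Example = () => {
  const [open, setOpen] = useState(false);

  return (
    <>
      <button onClick={() => setOpen(true)}>Trigger error</button>
      <ErrorSnackbar
        open={open}
        onClose={() => setOpen(false)}
        message="Failed to submit job"
      />
    </>
  );
};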


@@ -0,0 +1 @@
export * from "./error";


@@ -0,0 +1,2 @@
export * from "./job-notify";
export * from "./error";


@@ -0,0 +1 @@
export * from "./job-notify";


@@ -0,0 +1,34 @@
import { Snackbar, Alert, Button } from "@mui/material";
import router from "next/router";
type JobNotifySnackbarProps = {
open: boolean;
onClose: () => void;
message: string;
};
export const JobNotifySnackbar = ({
open,
onClose,
message,
}: JobNotifySnackbarProps) => {
if (!open) return null;
const goTo = () => {
router.push("/jobs");
};
const action = (
<Button color="inherit" size="small" onClick={goTo}>
Go To Job
</Button>
);
return (
<Snackbar open={open} autoHideDuration={6000} onClose={onClose}>
<Alert severity="info" action={action}>
{message}
</Alert>
</Snackbar>
);
};


@@ -0,0 +1,3 @@
export * from "./job-queue";
export * from "./favorites";
export * from "./job-table";


@@ -1,3 +0,0 @@
export * from "./JobQueue";
export * from "./Favorites";
export * from "./JobTable";


@@ -1,43 +1,32 @@
import React, { SetStateAction, useState } from "react";
import {
IconButton,
Box,
Typography,
Tooltip,
TextField,
FormControl,
InputLabel,
Select,
MenuItem,
SelectChangeEvent,
} from "@mui/material";
import { JobDownloadDialog } from "@/components/common/job-download-dialog";
import { ApiService } from "@/services";
import { COLOR_MAP, Job } from "@/types";
import DeleteIcon from "@mui/icons-material/Delete";
import SelectAllIcon from "@mui/icons-material/SelectAll";
import DownloadIcon from "@mui/icons-material/Download";
import SelectAllIcon from "@mui/icons-material/SelectAll";
import StarIcon from "@mui/icons-material/Star";
import { useRouter } from "next/router";
import { Favorites, JobQueue } from ".";
import { Job } from "../../types";
import Cookies from "js-cookie";
import {
Box,
FormControl,
IconButton,
InputLabel,
MenuItem,
Select,
SelectChangeEvent,
TextField,
Tooltip,
Typography,
} from "@mui/material";
import { useSearchParams } from "next/navigation";
import { JobDownloadDialog } from "../common/job-download-dialog";
import { useRouter } from "next/router";
import React, { SetStateAction, useState } from "react";
import { Favorites, JobQueue } from ".";
interface JobTableProps {
jobs: Job[];
setJobs: React.Dispatch<SetStateAction<Job[]>>;
}
interface ColorMap {
[key: string]: string;
}
const COLOR_MAP: ColorMap = {
Queued: "rgba(255,201,5,0.25)",
Scraping: "rgba(3,104,255,0.25)",
Completed: "rgba(5,255,51,0.25)",
Failed: "rgba(214,0,25,0.25)",
};
export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
const searchParams = useSearchParams();
const search = searchParams.get("search");
@@ -51,7 +40,6 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
const [jobDownloadDialogOpen, setJobDownloadDialogOpen] =
useState<boolean>(false);
const token = Cookies.get("token");
const router = useRouter();
const handleDownload = (ids: string[]) => {
@@ -78,6 +66,7 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
} else {
newSelected.add(id);
}
return newSelected;
});
};
@@ -89,14 +78,13 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
const allJobIds = new Set(jobs.map((job) => job.id));
setSelectedJobs(allJobIds);
}
setAllSelected(!allSelected);
};
const handleDeleteSelected = async () => {
const response = await fetch("/api/delete", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ data: { ids: Array.from(selectedJobs) } }),
const response = await ApiService.deleteJob({
ids: Array.from(selectedJobs),
});
if (response.ok) {
@@ -115,6 +103,7 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
} else if (searchMode === "status") {
return job.status.toLowerCase().includes(searchQuery.toLowerCase());
}
return true;
});
@@ -125,19 +114,10 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
)
);
const postBody = {
await ApiService.updateJob({
ids: ids,
field: field,
value: value,
};
await fetch("/api/update", {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${token}`,
},
body: JSON.stringify({ data: postBody }),
});
};
@@ -170,7 +150,12 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
</Typography>
<Tooltip title="Select All">
<span>
<IconButton color="primary" onClick={handleSelectAll}>
<IconButton
color="primary"
onClick={handleSelectAll}
data-testid="select-all"
aria-label="Select All"
>
<SelectAllIcon />
</IconButton>
</span>


@@ -1,35 +1,41 @@
import { validateURL } from "@/lib/helpers/validate-url";
import { ApiService } from "@/services";
import { AdvancedJobOptions } from "@/components/common/advanced-job-options";
import { Disabled } from "@/components/common/disabled/disabled";
import {
ErrorSnackbar,
JobNotifySnackbar,
} from "@/components/common/snackbars";
import {
Provider as JobSubmitterProvider,
useJobSubmitterProvider,
} from "@/components/submit/job-submitter/provider";
import { useAdvancedJobOptions } from "@/hooks/use-advanced-job-options";
import { useSubmitJob } from "@/hooks/use-submit-job";
import { checkAI } from "@/lib";
import { useUser } from "@/store/hooks";
import {
Box,
Button,
Divider,
Snackbar,
Alert,
TextField,
Typography,
useTheme,
} from "@mui/material";
import { useEffect, useState } from "react";
import { useRouter } from "next/router";
import { AdvancedJobOptions } from "@/components/common/advanced-job-options";
import { useAdvancedJobOptions } from "@/lib/hooks/use-advanced-job-options/use-advanced-job-options";
import { checkAI } from "@/lib";
import { Disabled } from "@/components/common/disabled/disabled";
import { useEffect, useState } from "react";
export const Agent = () => {
const router = useRouter();
const theme = useTheme();
const { user } = useUser();
const { jobOptions, setJobOptions } = useAdvancedJobOptions();
const { submitJob, error } = useSubmitJob();
const { snackbarOpen, snackbarMessage, snackbarSeverity, closeSnackbar } =
useJobSubmitterProvider();
const [url, setUrl] = useState("");
const [prompt, setPrompt] = useState("");
const [urlError, setUrlError] = useState<string | null>(null);
const [aiEnabled, setAiEnabled] = useState(false);
const [snackbarMessage, setSnackbarMessage] = useState("");
const [snackbarSeverity, setSnackbarSeverity] = useState<
"success" | "error" | "info" | "warning"
>("info");
const [snackbarOpen, setSnackbarOpen] = useState(false);
const router = useRouter();
const { jobOptions, setJobOptions } = useAdvancedJobOptions();
const theme = useTheme();
useEffect(() => {
if (router.query.url) {
@@ -45,91 +51,8 @@ export const Agent = () => {
checkAI(setAiEnabled);
}, []);
const handleCloseSnackbar = () => {
setSnackbarOpen(false);
};
const ErrorSnackbar = () => {
return (
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={handleCloseSnackbar}
>
<Alert onClose={handleCloseSnackbar} severity="error">
{snackbarMessage}
</Alert>
</Snackbar>
);
};
const NotifySnackbar = () => {
const goTo = () => {
router.push("/jobs");
};
const action = (
<Button color="inherit" size="small" onClick={goTo}>
Go To Job
</Button>
);
return (
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={handleCloseSnackbar}
>
<Alert onClose={handleCloseSnackbar} severity="info" action={action}>
{snackbarMessage}
</Alert>
</Snackbar>
);
};
const handleSubmit = async () => {
if (!validateURL(url)) {
setUrlError("Please enter a valid URL.");
return;
}
setUrlError(null);
await ApiService.submitJob(
url,
[],
"",
{
collect_media: jobOptions.collect_media,
multi_page_scrape: jobOptions.multi_page_scrape,
},
jobOptions.custom_headers,
jobOptions.custom_cookies,
null,
true,
prompt
)
.then(async (response) => {
if (!response.ok) {
return response.json().then((error) => {
throw new Error(error.error);
});
}
return response.json();
})
.then((data) => {
setSnackbarMessage(
`Agent job: ${data.id} submitted successfully.` ||
"Agent job submitted successfully."
);
setSnackbarSeverity("info");
setSnackbarOpen(true);
})
.catch((error) => {
setSnackbarMessage(error || "An error occurred.");
setSnackbarSeverity("error");
setSnackbarOpen(true);
});
await submitJob(url, [], user, jobOptions, null, true, prompt);
};
if (!aiEnabled) {
@@ -178,13 +101,14 @@ export const Agent = () => {
<TextField
value={url}
onChange={(e) => setUrl(e.target.value)}
error={!!urlError}
helperText={urlError}
error={!!error}
helperText={error}
autoComplete="agent-url"
fullWidth
placeholder="https://www.example.com"
variant="outlined"
size="small"
data-cy="url-input"
/>
<Typography variant="body1" sx={{ fontWeight: 500, marginBottom: 0 }}>
Prompt
@@ -197,6 +121,7 @@ export const Agent = () => {
placeholder="Collect all the links on the page"
variant="outlined"
size="small"
data-cy="prompt-input"
/>
<Box
sx={{
@@ -221,8 +146,28 @@ export const Agent = () => {
Submit
</Button>
</Box>
{snackbarSeverity === "info" ? <NotifySnackbar /> : <ErrorSnackbar />}
{snackbarSeverity === "info" ? (
<JobNotifySnackbar
open={snackbarOpen}
onClose={closeSnackbar}
message={snackbarMessage}
/>
) : (
<ErrorSnackbar
open={snackbarOpen}
onClose={closeSnackbar}
message={snackbarMessage}
/>
)}
</Box>
</Box>
);
};
export const AgentPage = () => {
return (
<JobSubmitterProvider>
<Agent />
</JobSubmitterProvider>
);
};
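Wrapping the export keeps useJobSubmitterProvider supplied with context wherever the page is mounted. A sketch of the page-level wiring (the route file and import path are assumptions, not shown in this diff):

// pages/agent.tsx (hypothetical route file)
import { AgentPage } from "@/components/agent";

export default AgentPage;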


@@ -1,31 +1,32 @@
import React, { useEffect, useRef, useState } from "react";
import { JobSelector } from "@/components/common/job-selector";
import { useGetCurrentJobs } from "@/hooks/use-get-current-jobs";
import { checkAI, fetchJob, updateJob } from "@/lib";
import { Job, Message } from "@/types";
import EditNoteIcon from "@mui/icons-material/EditNote";
import SendIcon from "@mui/icons-material/Send";
import {
Box,
TextField,
Typography,
Paper,
useTheme,
IconButton,
Paper,
TextField,
Tooltip,
Typography,
useTheme,
} from "@mui/material";
import { JobSelector } from "../../ai";
import { Job, Message } from "../../../types";
import { useSearchParams } from "next/navigation";
import { fetchJob, fetchJobs, updateJob, checkAI } from "../../../lib";
import SendIcon from "@mui/icons-material/Send";
import EditNoteIcon from "@mui/icons-material/EditNote";
import React, { useEffect, useState } from "react";
export const AI: React.FC = () => {
const theme = useTheme();
const searchParams = useSearchParams();
const { jobs, setJobs } = useGetCurrentJobs();
const [currentMessage, setCurrentMessage] = useState<string>("");
const [selectedJob, setSelectedJob] = useState<Job | null>(null);
const [messages, setMessages] = useState<Message[]>([]);
const [aiEnabled, setAiEnabled] = useState<boolean>(false);
const [jobs, setJobs] = useState<Job[]>([]);
const [thinking, setThinking] = useState<boolean>(false);
const searchParams = useSearchParams();
const getJobFromParam = async () => {
const jobId = searchParams.get("job");
@@ -134,10 +135,6 @@ export const AI: React.FC = () => {
setMessages([]);
};
useEffect(() => {
fetchJobs(setJobs);
}, []);
return (
<Box
sx={{
@@ -187,7 +184,6 @@ export const AI: React.FC = () => {
<JobSelector
selectedJob={selectedJob}
setSelectedJob={setSelectedJob}
setJobs={setJobs}
jobs={jobs}
sxProps={{
position: "absolute",
@@ -242,6 +238,9 @@ export const AI: React.FC = () => {
marginLeft: message.role === "user" ? "auto" : "",
maxWidth: "40%",
}}
data-cy={
message.role === "user" ? "user-message" : "ai-message"
}
>
<Typography variant="body1" sx={{ color: "white" }}>
{message.content}
@@ -309,6 +308,7 @@ export const AI: React.FC = () => {
}
}}
sx={{ borderRadius: "8px" }}
data-cy="message-input"
/>
<Tooltip title="Send" placement="top">
@@ -319,6 +319,7 @@ export const AI: React.FC = () => {
onClick={() => {
handleMessageSend(currentMessage);
}}
data-cy="send-message"
>
<SendIcon />
</IconButton>


@@ -0,0 +1 @@
export * from "./chat";


@@ -1,19 +1,18 @@
import { Job, CronJob } from "@/types/job";
import { useState, useEffect } from "react";
import { CreateCronJobs } from "./create-cron-jobs";
import { CronJob, Job } from "@/types/job";
import {
Box,
Button,
Table,
TableBody,
TableCell,
TableHead,
TableRow,
TableCell,
TableBody,
Button,
Box,
Typography,
useTheme,
} from "@mui/material";
import Cookies from "js-cookie";
import { useEffect, useState } from "react";
import { CreateCronJobs } from "./create-cron-jobs";
import { ApiService } from "@/services/api-service";
export type CronJobsProps = {
initialJobs: Job[];
initialCronJobs: CronJob[];
@@ -37,14 +36,9 @@ export const CronJobs = ({
}, [initialJobs, initialCronJobs, initialUser]);
const handleDeleteCronJob = async (id: string) => {
const token = Cookies.get("token");
const response = await fetch("/api/delete-cron-job", {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${token}`,
},
body: JSON.stringify({ data: { id, user_email: user.email } }),
const response = await ApiService.deleteCronJobs({
id,
user_email: user.email,
});
if (response.ok) {


@@ -1,11 +1,14 @@
"use client";
import React, { useState, useEffect, useRef } from "react";
import { Button, Container, Box, Snackbar, Alert } from "@mui/material";
import React, { useEffect, useRef } from "react";
import { Container, Box } from "@mui/material";
import { useRouter } from "next/router";
import { Element, Result } from "@/types";
import { ElementTable, JobSubmitter } from "@/components/submit/job-submitter";
import { useJobSubmitterProvider } from "@/components/submit/job-submitter/provider";
import {
ErrorSnackbar,
JobNotifySnackbar,
} from "@/components/common/snackbars";
export const Home = () => {
const {
@@ -15,9 +18,9 @@ export const Home = () => {
setRows,
results,
snackbarOpen,
setSnackbarOpen,
snackbarMessage,
snackbarSeverity,
closeSnackbar,
} = useJobSubmitterProvider();
const router = useRouter();
const { elements, url } = router.query;
@@ -28,6 +31,7 @@ export const Home = () => {
if (elements) {
setRows(JSON.parse(elements as string));
}
if (url) {
setSubmittedURL(url as string);
}
@@ -39,48 +43,6 @@ export const Home = () => {
}
}, [results]);
const handleCloseSnackbar = () => {
setSnackbarOpen(false);
};
const ErrorSnackbar = () => {
return (
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={handleCloseSnackbar}
>
<Alert onClose={handleCloseSnackbar} severity="error">
{snackbarMessage}
</Alert>
</Snackbar>
);
};
const NotifySnackbar = () => {
const goTo = () => {
router.push("/jobs");
};
const action = (
<Button color="inherit" size="small" onClick={goTo}>
Go To Job
</Button>
);
return (
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={handleCloseSnackbar}
>
<Alert onClose={handleCloseSnackbar} severity="info" action={action}>
{snackbarMessage}
</Alert>
</Snackbar>
);
};
return (
<Box
bgcolor="background.default"
@@ -93,7 +55,8 @@ export const Home = () => {
>
<Container maxWidth="lg" className="overflow-y-auto max-h-full">
<JobSubmitter />
{submittedURL.length ? (
{submittedURL.length > 0 ? (
<ElementTable
rows={rows}
setRows={setRows}
@@ -101,7 +64,20 @@ export const Home = () => {
/>
) : null}
</Container>
{snackbarSeverity === "info" ? <NotifySnackbar /> : <ErrorSnackbar />}
{snackbarSeverity === "info" ? (
<JobNotifySnackbar
open={snackbarOpen}
onClose={closeSnackbar}
message={snackbarMessage}
/>
) : (
<ErrorSnackbar
open={snackbarOpen}
onClose={closeSnackbar}
message={snackbarMessage}
/>
)}
</Box>
);
};


@@ -1,20 +1,20 @@
import { JobSelector } from "@/components/ai";
import { fetchJobs } from "@/lib";
import { JobSelector } from "@/components/common/job-selector";
import { MediaViewer } from "@/components/common/media-viewer";
import { TileGridView } from "@/components/common/media-viewer/tile-grid-view";
import { useGetCurrentJobs } from "@/hooks/use-get-current-jobs";
import { Job } from "@/types";
import {
Box,
useTheme,
Typography,
CircularProgress,
Alert,
Box,
CircularProgress,
Paper,
Tabs,
Tab,
Tabs,
Typography,
useTheme,
} from "@mui/material";
import { useRouter, useSearchParams } from "next/navigation";
import { useState, useEffect } from "react";
import { TileGridView } from "@/components/common/media-viewer/tile-grid-view";
import { MediaViewer } from "@/components/common/media-viewer";
import { useEffect, useState } from "react";
export interface MediaFiles {
audio: string[];
@@ -31,10 +31,10 @@ export const MediaId = () => {
const searchParams = useSearchParams();
const theme = useTheme();
const router = useRouter();
const { jobs } = useGetCurrentJobs();
const [error, setError] = useState<string | null>(null);
const [loading, setLoading] = useState(true);
const [jobs, setJobs] = useState<Job[]>([]);
const [selectedJob, setSelectedJob] = useState<Job | null>(null);
const [mediaFiles, setMediaFiles] = useState<MediaFiles | null>(null);
const [activeTab, setActiveTab] = useState<string>("images");
@@ -60,11 +60,6 @@ export const MediaId = () => {
router.push(`/media?id=${currentId}&type=${activeTab}&file=${fileName}`);
};
// Fetch jobs on mount
useEffect(() => {
fetchJobs(setJobs);
}, []);
// Set selected job when currentId changes
useEffect(() => {
if (!currentId) {
@@ -201,7 +196,6 @@ export const MediaId = () => {
<JobSelector
setSelectedJob={handleSelectJob}
selectedJob={selectedJob}
setJobs={setJobs}
jobs={jobs}
/>
</Box>

Some files were not shown because too many files have changed in this diff.