15 Commits

Author SHA1 Message Date
Jayden Pyles
3475d66995 Add cron jobs (#60)
* feat: finish up cron jobs

* feat: clean up
2025-04-24 22:03:28 -05:00
Jayden Pyles
186b4a0231 Merge branch 'master' of github.com:jaypyles/Scraperr 2025-04-24 22:02:06 -05:00
Jayden Pyles
0af0ebf5b5 feat: fix authentication 2025-04-24 18:24:19 -05:00
c3Nz
ef35db00d7 fix: Python handler Fixed (#51)
* Fix: Python handler Fixed

* fix: Python handler Fixed without comment
2024-11-26 10:05:43 -06:00
Jayden Pyles
d65e600ec3 Merge branch 'master' of github.com:jaypyles/Scraperr
2024-11-21 18:13:18 -06:00
Jayden Pyles
6fe145f649 chore: remove uneeded files [skip ci] 2024-11-21 18:12:46 -06:00
Jayden Pyles
563ca2245e Refactor: Drop MongoDB (#48)
* feat: replace mongodb with sqllite

* feat: update docker compose to drop mongo

* chore: drop logs

* chore: cleanup

* fix: unit tests

* fix: workflow

* fix: workflow run
2024-11-21 18:11:46 -06:00
Jayden Pyles
d54fdbd405 fix: workflow ruin [skip ci] 2024-11-21 18:11:31 -06:00
Jayden Pyles
7169755cd2 fix: workflow 2024-11-21 18:03:40 -06:00
Jayden Pyles
15b56b5704 fix: unit tests 2024-11-21 18:00:57 -06:00
Jayden Pyles
bf6b740005 chore: cleanup 2024-11-21 17:43:20 -06:00
Jayden Pyles
c339e75e06 chore: drop logs 2024-11-21 17:36:47 -06:00
Jayden Pyles
b6ed40e6cf feat: update docker compose to drop mongo 2024-11-21 17:36:22 -06:00
Jayden Pyles
3085f9d31a feat: replace mongodb with sqllite 2024-11-20 21:32:27 -06:00
Jayden Pyles
7d80ff5c7f Feat: Site Mapping (#46)
* wip: add site mapping

* chore: cleanup
2024-11-16 20:55:23 -06:00
79 changed files with 3705 additions and 1150 deletions

View File

@@ -7,7 +7,7 @@ on:
jobs:
build:
if: ${{ github.event.workflow_run.conclusion == 'success' && github.ref == 'refs/heads/master' && github.event_name != 'pull_request' }}
if: ${{ github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.head_branch == 'master' }}
runs-on: ubuntu-latest
steps:
- name: Checkout

View File

@@ -15,6 +15,9 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
- name: Set env
run: echo "ENV=test" >> $GITHUB_ENV
- name: Install pdm
run: pip install pdm

.gitignore vendored
View File

@@ -187,3 +187,4 @@ cython_debug/
postgres_data
.vscode
ollama
data

.python-version Normal file
View File

@@ -0,0 +1 @@
3.10.12

View File

@@ -1,3 +0,0 @@
github_repo: https://github.com/jaypyles/webapp-template.git
deploy_path: /home/admin/site-test6
deploy_command: make pull up-prd

View File

@@ -1,10 +0,0 @@
- name: Deploy site
hosts: all
become: true
vars_files:
- ./config.yaml
tasks:
- name: Deploy
command: "{{deploy_command}}"
args:
chdir: "{{deploy_path}}"

View File

@@ -1,6 +0,0 @@
all:
hosts:
host1:
ansible_host: 192.168.0.1
ansible_user: admin
ansible_ssh_private_key_file: private_key.pem

View File

@@ -1,54 +0,0 @@
- name: Install Docker and run make pull up
hosts: all
become: true
vars_files:
- ./config.yaml
tasks:
- name: Update apt cache
apt:
update_cache: yes
- name: Install required packages
apt:
name:
- apt-transport-https
- ca-certificates
- curl
- gnupg-agent
- software-properties-common
- rsync
- make
state: present
- name: Add Dockers official GPG key
apt_key:
url: https://download.docker.com/linux/ubuntu/gpg
state: present
- name: Add Docker APT repository
apt_repository:
repo: deb [arch=amd64] https://download.docker.com/linux/ubuntu focal stable
state: present
- name: Update apt cache again after adding Docker repo
apt:
update_cache: yes
- name: Install Docker
apt:
name: docker-ce
state: present
- name: Start and enable Docker service
systemd:
name: docker
enabled: yes
state: started
- name: Install Docker Compose
apt:
name: docker-compose-plugin
state: present
- name: Verify Docker is installed
command: docker --version
register: docker_version
- name: Display Docker version
debug:
msg: "Docker version: {{ docker_version.stdout }}"
- name: Clone repo
ansible.builtin.git:
repo: "{{github_repo}}"
dest: "{{deploy_path}}"

View File

@@ -1,9 +1,13 @@
# STL
import os
import logging
import apscheduler # type: ignore
# PDM
from fastapi import FastAPI
import apscheduler.schedulers
import apscheduler.schedulers.background
from fastapi import FastAPI, Request, status
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
# LOCAL
@@ -13,6 +17,11 @@ from api.backend.utils import get_log_level
from api.backend.routers.job_router import job_router
from api.backend.routers.log_router import log_router
from api.backend.routers.stats_router import stats_router
from api.backend.database.startup import init_database
from fastapi.responses import JSONResponse
from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler
from api.backend.scheduler import scheduler
log_level = os.getenv("LOG_LEVEL")
LOG_LEVEL = get_log_level(log_level)
@@ -41,3 +50,28 @@ app.include_router(ai_router)
app.include_router(job_router)
app.include_router(log_router)
app.include_router(stats_router)
@app.on_event("startup")
async def startup_event():
start_cron_scheduler(scheduler)
scheduler.start()
if os.getenv("ENV") != "test":
init_database()
LOG.info("Starting up...")
@app.on_event("shutdown")
def shutdown_scheduler():
scheduler.shutdown(wait=False) # Set wait=False to not block shutdown
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
exc_str = f"{exc}".replace("\n", " ").replace("   ", " ")
logging.error(f"{request}: {exc_str}")
content = {"status_code": 10422, "message": exc_str, "data": None}
return JSONResponse(
content=content, status_code=status.HTTP_422_UNPROCESSABLE_ENTITY
)
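
The handler above replaces FastAPI's default validation response with a custom envelope. A minimal sketch of what a client would see, assuming the app above is importable (module path and route name are illustrative):

# Hedged sketch: exercising the custom RequestValidationError handler.
from fastapi.testclient import TestClient

from api.backend.app import app  # assumed module path

client = TestClient(app)

# A body that fails model validation triggers the handler.
response = client.post("/update", json={"not": "a-valid-payload"})

assert response.status_code == 422
# The handler returns its own envelope instead of FastAPI's default:
# {"status_code": 10422, "message": "...", "data": None}
print(response.json()["message"])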

View File

@@ -7,7 +7,6 @@ from fastapi.security import OAuth2PasswordRequestForm
# LOCAL
from api.backend.schemas import User, Token, UserCreate
from api.backend.database import get_user_collection
from api.backend.auth.auth_utils import (
ACCESS_TOKEN_EXPIRE_MINUTES,
get_current_user,
@@ -15,9 +14,14 @@ from api.backend.auth.auth_utils import (
get_password_hash,
create_access_token,
)
import logging
from api.backend.database.common import update
auth_router = APIRouter()
LOG = logging.getLogger("auth_router")
@auth_router.post("/auth/token", response_model=Token)
async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
@@ -43,12 +47,14 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends(
@auth_router.post("/auth/signup", response_model=User)
async def create_user(user: UserCreate):
users_collection = get_user_collection()
hashed_password = get_password_hash(user.password)
user_dict = user.model_dump()
user_dict["hashed_password"] = hashed_password
del user_dict["password"]
_ = await users_collection.insert_one(user_dict)
query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)"
_ = update(query, (user_dict["email"], hashed_password, user_dict["full_name"]))
return user_dict

View File

@@ -1,7 +1,5 @@
# STL
import os
from gc import disable
from queue import Empty
from typing import Any, Optional
from datetime import datetime, timedelta
import logging
@@ -15,7 +13,8 @@ from fastapi.security import OAuth2PasswordBearer
# LOCAL
from api.backend.schemas import User, UserInDB, TokenData
from api.backend.database import get_user_collection
from api.backend.database.common import query
LOG = logging.getLogger(__name__)
@@ -40,8 +39,8 @@ def get_password_hash(password: str):
async def get_user(email: str):
user_collection = get_user_collection()
user = await user_collection.find_one({"email": email})
user_query = "SELECT * FROM users WHERE email = ?"
user = query(user_query, (email,))[0]
if not user:
return
@@ -77,27 +76,42 @@ def create_access_token(
async def get_current_user(token: str = Depends(oauth2_scheme)):
LOG.info(f"Getting current user with token: {token}")
LOG.debug(f"Getting current user with token: {token}")
if not token:
LOG.debug("No token provided")
return EMPTY_USER
if len(token.split(".")) != 3:
LOG.error(f"Malformed token: {token}")
return EMPTY_USER
try:
LOG.debug(
f"Decoding token: {token} with secret key: {SECRET_KEY} and algorithm: {ALGORITHM}"
)
if token.startswith("Bearer "):
token = token.split(" ")[1]
payload: Optional[dict[str, Any]] = jwt.decode(
token, SECRET_KEY, algorithms=[ALGORITHM]
)
if not payload:
LOG.error("No payload found in token")
return EMPTY_USER
email = payload.get("sub")
if email is None:
LOG.error("No email found in payload")
return EMPTY_USER
token_data = TokenData(email=email)
except JWTError:
except JWTError as e:
LOG.error(f"JWTError occurred: {e}")
return EMPTY_USER
except Exception as e:
@@ -105,7 +119,6 @@ async def get_current_user(token: str = Depends(oauth2_scheme)):
return EMPTY_USER
user = await get_user(email=token_data.email)
if user is None:
return EMPTY_USER
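
For reference, the decode path above follows the standard python-jose flow. A minimal round-trip sketch, with SECRET_KEY and ALGORITHM standing in for the module-level settings:

# Hedged sketch of the token round-trip behind get_current_user.
from datetime import datetime, timedelta, timezone

from jose import JWTError, jwt

SECRET_KEY = "change-me"  # placeholder for the configured secret
ALGORITHM = "HS256"

# Encode: "sub" carries the user's email, which the handler reads back.
token = jwt.encode(
    {
        "sub": "user@example.com",
        "exp": datetime.now(timezone.utc) + timedelta(minutes=600),
    },
    SECRET_KEY,
    algorithm=ALGORITHM,
)

# Decode: malformed or expired tokens raise JWTError, which the handler
# catches and maps to EMPTY_USER.
try:
    payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
    print(payload["sub"])  # user@example.com
except JWTError as e:
    print(f"rejected: {e}")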

api/backend/constants.py Normal file
View File

@@ -0,0 +1 @@
DATABASE_PATH = "data/database.db"

View File

@@ -1,23 +0,0 @@
# STL
import os
from typing import Any
# PDM
from dotenv import load_dotenv
from motor.motor_asyncio import AsyncIOMotorClient
_ = load_dotenv()
MONGODB_URI = os.getenv("MONGODB_URI")
def get_user_collection():
client: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(MONGODB_URI)
db = client["scrape"]
return db["users"]
def get_job_collection():
client: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(MONGODB_URI)
db = client["scrape"]
return db["jobs"]

View File

@@ -0,0 +1,3 @@
from .common import insert, QUERIES, update
__all__ = ["insert", "QUERIES", "update"]

View File

@@ -0,0 +1,92 @@
import sqlite3
from typing import Any, Optional
from api.backend.constants import DATABASE_PATH
from api.backend.utils import format_json, format_sql_row_to_python
from api.backend.database.schema import INIT_QUERY
from api.backend.database.queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
import logging
LOG = logging.getLogger(__name__)
def connect():
connection = sqlite3.connect(DATABASE_PATH)
connection.set_trace_callback(print)
cursor = connection.cursor()
return cursor
def insert(query: str, values: tuple[Any, ...]):
connection = sqlite3.connect(DATABASE_PATH)
cursor = connection.cursor()
copy = list(values)
format_json(copy)
try:
_ = cursor.execute(query, copy)
connection.commit()
except sqlite3.Error as e:
LOG.error(f"An error occurred: {e}")
finally:
cursor.close()
connection.close()
def query(query: str, values: Optional[tuple[Any, ...]] = None):
connection = sqlite3.connect(DATABASE_PATH)
connection.row_factory = sqlite3.Row
cursor = connection.cursor()
rows = []
try:
if values:
_ = cursor.execute(query, values)
else:
_ = cursor.execute(query)
rows = cursor.fetchall()
finally:
cursor.close()
connection.close()
formatted_rows: list[dict[str, Any]] = []
for row in rows:
row = dict(row)
formatted_row = format_sql_row_to_python(row)
formatted_rows.append(formatted_row)
return formatted_rows
def update(query: str, values: Optional[tuple[Any, ...]] = None):
connection = sqlite3.connect(DATABASE_PATH)
cursor = connection.cursor()
copy = None
if values:
copy = list(values)
format_json(copy)
try:
if copy:
res = cursor.execute(query, copy)
else:
res = cursor.execute(query)
connection.commit()
return res.rowcount
except sqlite3.Error as e:
LOG.error(f"An error occurred: {e}")
finally:
cursor.close()
connection.close()
return 0
QUERIES = {
"init": INIT_QUERY,
"insert_job": JOB_INSERT_QUERY,
"delete_job": DELETE_JOB_QUERY,
}
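
Taken together, these helpers form a thin parameterized-SQL layer over sqlite3. A usage sketch, assuming the schema from the INIT_QUERY below has been applied:

# Hedged usage sketch for the insert/query/update helpers above.
from api.backend.database.common import insert, query, update

# insert() JSON-encodes dict/list values via format_json before binding.
insert(
    "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)",
    ("user@example.com", "<bcrypt-hash>", "Test User"),
)

# query() returns rows as dicts, with JSON text decoded back into Python
# objects by format_sql_row_to_python.
rows = query("SELECT * FROM users WHERE email = ?", ("user@example.com",))
print(rows[0]["full_name"])  # Test User

# update() reports the affected row count (0 on error).
changed = update(
    "UPDATE users SET disabled = ? WHERE email = ?",
    (False, "user@example.com"),
)
print(f"{changed} row(s) updated")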

View File

@@ -0,0 +1,3 @@
from .queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
__all__ = ["JOB_INSERT_QUERY", "DELETE_JOB_QUERY"]

View File

@@ -0,0 +1,9 @@
JOB_INSERT_QUERY = """
INSERT INTO jobs
(id, url, elements, user, time_created, result, status, chat, job_options)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
"""
DELETE_JOB_QUERY = """
DELETE FROM jobs WHERE id IN ()
"""

View File

@@ -0,0 +1,3 @@
from .schema import INIT_QUERY
__all__ = ["INIT_QUERY"]

View File

@@ -0,0 +1,30 @@
INIT_QUERY = """
CREATE TABLE IF NOT EXISTS jobs (
id STRING PRIMARY KEY NOT NULL,
url STRING NOT NULL,
elements JSON NOT NULL,
user STRING,
time_created DATETIME NOT NULL,
result JSON NOT NULL,
status STRING NOT NULL,
chat JSON,
job_options JSON
);
CREATE TABLE IF NOT EXISTS users (
email STRING PRIMARY KEY NOT NULL,
hashed_password STRING NOT NULL,
full_name STRING,
disabled BOOLEAN
);
CREATE TABLE IF NOT EXISTS cron_jobs (
id STRING PRIMARY KEY NOT NULL,
user_email STRING NOT NULL,
job_id STRING NOT NULL,
cron_expression STRING NOT NULL,
time_created DATETIME NOT NULL,
time_updated DATETIME NOT NULL,
FOREIGN KEY (job_id) REFERENCES jobs(id)
);
"""

View File

@@ -0,0 +1,15 @@
from api.backend.database.common import connect, QUERIES
import logging
LOG = logging.getLogger(__name__)
def init_database():
cursor = connect()
for query in QUERIES["init"].strip().split(";"):
if query.strip():
LOG.info(f"Executing query: {query}")
_ = cursor.execute(query)
cursor.close()

View File

@@ -1,161 +0,0 @@
# STL
import logging
from typing import Any, Optional
# PDM
from pymongo import DESCENDING
# LOCAL
from api.backend.models import FetchOptions
from api.backend.database import get_job_collection
LOG = logging.getLogger(__name__)
async def insert(item: dict[str, Any]) -> None:
collection = get_job_collection()
i = await collection.insert_one(item)
LOG.info(f"Inserted item: {i}")
async def get_queued_job():
collection = get_job_collection()
return await collection.find_one(
{"status": "Queued"}, sort=[("created_at", DESCENDING)]
)
async def query(
filter: dict[str, Any], fetch_options: Optional[FetchOptions] = None
) -> list[dict[str, Any]]:
collection = get_job_collection()
cursor = collection.find(filter)
results: list[dict[str, Any]] = []
async for document in cursor:
del document["_id"]
if fetch_options and not fetch_options.chat and document.get("chat"):
del document["chat"]
results.append(document)
return results
async def update_job(ids: list[str], field: str, value: Any):
collection = get_job_collection()
for id in ids:
_ = await collection.update_one(
{"id": id},
{"$set": {field: value}},
)
async def delete_jobs(jobs: list[str]):
collection = get_job_collection()
result = await collection.delete_many({"id": {"$in": jobs}})
LOG.info(f"{result.deleted_count} documents deleted")
return True if result.deleted_count > 0 else False
async def average_elements_per_link(user: str):
collection = get_job_collection()
pipeline = [
{"$match": {"status": "Completed", "user": user}},
{
"$addFields": {
"time_created_date": {
"$cond": {
"if": {"$eq": [{"$type": "$time_created"}, "date"]},
"then": "$time_created",
"else": {
"$convert": {
"input": "$time_created",
"to": "date",
"onError": None,
"onNull": None,
}
},
}
}
}
},
{
"$project": {
"date": {
"$dateToString": {
"format": "%Y-%m-%d",
"date": "$time_created_date",
}
},
"num_elements": {"$size": "$elements"},
}
},
{
"$group": {
"_id": "$date",
"average_elements": {"$avg": "$num_elements"},
"count": {"$sum": 1},
}
},
{"$sort": {"_id": 1}},
]
cursor = collection.aggregate(pipeline)
results: list[dict[str, Any]] = []
async for document in cursor:
results.append(
{
"date": document["_id"],
"average_elements": document["average_elements"],
"count": document["count"],
}
)
return results
async def get_jobs_per_day(user: str):
collection = get_job_collection()
pipeline = [
{"$match": {"status": "Completed", "user": user}},
{
"$addFields": {
"time_created_date": {
"$cond": {
"if": {"$eq": [{"$type": "$time_created"}, "date"]},
"then": "$time_created",
"else": {
"$convert": {
"input": "$time_created",
"to": "date",
"onError": None,
"onNull": None,
}
},
}
}
}
},
{
"$project": {
"date": {
"$dateToString": {
"format": "%Y-%m-%d",
"date": "$time_created_date",
}
}
}
},
{"$group": {"_id": "$date", "job_count": {"$sum": 1}}},
{"$sort": {"_id": 1}},
]
cursor = collection.aggregate(pipeline)
results: list[dict[str, Any]] = []
async for document in cursor:
results.append({"date": document["_id"], "job_count": document["job_count"]})
return results

View File

@@ -0,0 +1,17 @@
from .job import (
insert,
update_job,
delete_jobs,
get_jobs_per_day,
get_queued_job,
average_elements_per_link,
)
__all__ = [
"insert",
"update_job",
"delete_jobs",
"get_jobs_per_day",
"get_queued_job",
"average_elements_per_link",
]

View File

@@ -0,0 +1,100 @@
import datetime
from typing import Any
import uuid
from api.backend.database.common import insert, query
from api.backend.models import CronJob
from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
from apscheduler.triggers.cron import CronTrigger # type: ignore
from api.backend.job import insert as insert_job
import logging
LOG = logging.getLogger("Cron Scheduler")
def insert_cron_job(cron_job: CronJob):
query = """
INSERT INTO cron_jobs (id, user_email, job_id, cron_expression, time_created, time_updated)
VALUES (?, ?, ?, ?, ?, ?)
"""
values = (
cron_job.id,
cron_job.user_email,
cron_job.job_id,
cron_job.cron_expression,
cron_job.time_created,
cron_job.time_updated,
)
insert(query, values)
return True
def delete_cron_job(id: str, user_email: str):
query = """
DELETE FROM cron_jobs
WHERE id = ? AND user_email = ?
"""
values = (id, user_email)
insert(query, values)
return True
def get_cron_jobs(user_email: str):
cron_jobs = query("SELECT * FROM cron_jobs WHERE user_email = ?", (user_email,))
return cron_jobs
def get_all_cron_jobs():
cron_jobs = query("SELECT * FROM cron_jobs")
return cron_jobs
def insert_job_from_cron_job(job: dict[str, Any]):
insert_job(
{
**job,
"id": uuid.uuid4().hex,
"status": "Queued",
"result": "",
"chat": None,
"time_created": datetime.datetime.now(),
"time_updated": datetime.datetime.now(),
}
)
def get_cron_job_trigger(cron_expression: str):
expression_parts = cron_expression.split()
if len(expression_parts) != 5:
print(f"Invalid cron expression: {cron_expression}")
return None
minute, hour, day, month, day_of_week = expression_parts
return CronTrigger(
minute=minute, hour=hour, day=day, month=month, day_of_week=day_of_week
)
def start_cron_scheduler(scheduler: BackgroundScheduler):
cron_jobs = get_all_cron_jobs()
LOG.info(f"Cron jobs: {cron_jobs}")
for job in cron_jobs:
queried_job = query("SELECT * FROM jobs WHERE id = ?", (job["job_id"],))
LOG.info(f"Adding job: {queried_job}")
scheduler.add_job(
insert_job_from_cron_job,
get_cron_job_trigger(job["cron_expression"]),
id=job["id"],
args=[queried_job[0]],
)
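
A sketch of how these pieces fit together at runtime, assuming a job row already exists; the CronJob fields follow the model added to api/backend/models.py:

# Hedged sketch: registering a cron job against the background scheduler.
import datetime
import uuid

from api.backend.models import CronJob
from api.backend.scheduler import scheduler
from api.backend.job.cron_scheduling.cron_scheduling import (
    get_cron_job_trigger,
    insert_cron_job,
)

cron_job = CronJob(
    id=uuid.uuid4().hex,
    user_email="user@example.com",
    job_id="<existing-job-id>",   # assumed to reference a row in jobs
    cron_expression="0 * * * *",  # top of every hour
    time_created=datetime.datetime.now(),
    time_updated=datetime.datetime.now(),
)
insert_cron_job(cron_job)

# get_cron_job_trigger returns None for malformed expressions, so guard it.
trigger = get_cron_job_trigger(cron_job.cron_expression)
if trigger is not None:
    scheduler.add_job(lambda: print("fire"), trigger, id=cron_job.id)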

api/backend/job/job.py Normal file
View File

@@ -0,0 +1,97 @@
# STL
import logging
from typing import Any
# LOCAL
from api.backend.utils import format_list_for_query
from api.backend.database.common import (
insert as common_insert,
query as common_query,
QUERIES,
update as common_update,
)
LOG = logging.getLogger(__name__)
def insert(item: dict[str, Any]) -> None:
common_insert(
QUERIES["insert_job"],
(
item["id"],
item["url"],
item["elements"],
item["user"],
item["time_created"],
item["result"],
item["status"],
item["chat"],
item["job_options"],
),
)
LOG.info(f"Inserted item: {item}")
async def get_queued_job():
query = (
"SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
)
res = common_query(query)
LOG.info(f"Got queued job: {res}")
return res[0] if res else None
async def update_job(ids: list[str], field: str, value: Any):
query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
res = common_update(query, tuple([value] + ids))
LOG.info(f"Updated job: {res}")
async def delete_jobs(jobs: list[str]):
if not jobs:
LOG.info("No jobs to delete.")
return False
query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}"
res = common_update(query, tuple(jobs))
return res > 0
async def average_elements_per_link(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
AVG(json_array_length(elements)) AS average_elements,
COUNT(*) AS count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = common_query(job_query, (user,))
return results
async def get_jobs_per_day(user: str):
job_query = """
SELECT
DATE(time_created) AS date,
COUNT(*) AS job_count
FROM
jobs
WHERE
status = 'Completed' AND user = ?
GROUP BY
DATE(time_created)
ORDER BY
date ASC;
"""
results = common_query(job_query, (user,))
return results
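
The IN clauses above come from format_list_for_query, which emits only placeholders; the ids are still bound as parameters. A sketch of what update_job builds:

# Hedged sketch: how update_job expands into parameterized SQL.
ids = ["a1", "b2", "c3"]
placeholders = f"({','.join(['?' for _ in ids])})"  # "(?,?,?)"

sql = f"UPDATE jobs SET status = ? WHERE id IN {placeholders}"
params = tuple(["Completed"] + ids)

print(sql)     # UPDATE jobs SET status = ? WHERE id IN (?,?,?)
print(params)  # ('Completed', 'a1', 'b2', 'c3')

Note that the column name (field) is interpolated directly into the SQL string, so it has to come from trusted code; only the values travel through placeholders.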

View File

@@ -0,0 +1,3 @@
from .job_options import JobOptions
__all__ = ["JobOptions"]

View File

@@ -0,0 +1,14 @@
from pydantic import BaseModel
from typing import Any, Optional
from api.backend.job.models.site_map import SiteMap
class FetchOptions(BaseModel):
chat: Optional[bool] = None
class JobOptions(BaseModel):
multi_page_scrape: bool = False
custom_headers: dict[str, Any] = {}
proxies: list[str] = []
site_map: Optional[SiteMap] = None

View File

@@ -0,0 +1,14 @@
from pydantic import BaseModel
from typing import Literal
class Action(BaseModel):
type: Literal["click", "input"]
xpath: str
name: str
input: str = ""
do_once: bool = True
class SiteMap(BaseModel):
actions: list[Action]
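
A concrete payload the models above would accept (values illustrative): an input action that runs once, followed by a click action that repeats on every pass because do_once is False:

# Hedged example: constructing a SiteMap by hand.
from api.backend.job.models.site_map import Action, SiteMap

site_map = SiteMap(
    actions=[
        Action(
            type="input",
            xpath="//input[@name='q']",
            name="search-box",
            input="laptops",
            do_once=True,
        ),
        Action(
            type="click",
            xpath="//a[@rel='next']",
            name="next-page",
            do_once=False,
        ),
    ]
)
print(site_map.model_dump())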

View File

@@ -0,0 +1,30 @@
import time
from typing import cast
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def scrape_content(driver: webdriver.Chrome, pages: set[tuple[str, str]]):
_ = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "body"))
)
last_height = cast(str, driver.execute_script("return document.body.scrollHeight"))
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(3) # Wait for the page to load
new_height = cast(
str, driver.execute_script("return document.body.scrollHeight")
)
if new_height == last_height:
break
last_height = new_height
pages.add((driver.page_source, driver.current_url))
return driver.page_source

View File

@@ -0,0 +1,93 @@
from api.backend.job.models.site_map import Action, SiteMap
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from typing import Any
import logging
import time
from copy import deepcopy
from api.backend.job.scraping.scraping_utils import scrape_content
from selenium.webdriver.support.ui import WebDriverWait
from seleniumwire.inspect import TimeoutException
from seleniumwire.webdriver import Chrome
from selenium.webdriver.support import expected_conditions as EC
LOG = logging.getLogger(__name__)
def clear_done_actions(site_map: dict[str, Any]):
"""Clear all actions that have been clicked."""
cleared_site_map = deepcopy(site_map)
cleared_site_map["actions"] = [
action for action in cleared_site_map["actions"] if not action["do_once"]
]
return cleared_site_map
def handle_input(action: Action, driver: webdriver.Chrome):
try:
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.XPATH, action.xpath))
)
LOG.info(f"Sending keys: {action.input} to element: {element}")
element.send_keys(action.input)
except NoSuchElementException:
LOG.info(f"Element not found: {action.xpath}")
return False
except TimeoutException:
LOG.info(f"Timeout waiting for element: {action.xpath}")
return False
except Exception as e:
LOG.info(f"Error handling input: {e}")
return False
return True
def handle_click(action: Action, driver: webdriver.Chrome):
try:
element = driver.find_element(By.XPATH, action.xpath)
LOG.info(f"Clicking element: {element}")
element.click()
except NoSuchElementException:
LOG.info(f"Element not found: {action.xpath}")
return False
return True
ACTION_MAP = {
"click": handle_click,
"input": handle_input,
}
async def handle_site_mapping(
site_map_dict: dict[str, Any],
driver: Chrome,
pages: set[tuple[str, str]],
):
site_map = SiteMap(**site_map_dict)
for action in site_map.actions:
action_handler = ACTION_MAP[action.type]
if not action_handler(action, driver):
return
time.sleep(2)
_ = scrape_content(driver, pages)
cleared_site_map_dict = clear_done_actions(site_map_dict)
if cleared_site_map_dict["actions"]:
await handle_site_mapping(cleared_site_map_dict, driver, pages)

View File

@@ -2,14 +2,13 @@
from typing import Any, Optional, Union
from datetime import datetime
# LOCAL
from api.backend.job.models.job_options import JobOptions
# PDM
import pydantic
class FetchOptions(pydantic.BaseModel):
chat: Optional[bool] = None
class Element(pydantic.BaseModel):
name: str
xpath: str
@@ -22,12 +21,6 @@ class CapturedElement(pydantic.BaseModel):
name: str
class JobOptions(pydantic.BaseModel):
multi_page_scrape: bool = False
custom_headers: Optional[dict[str, Any]] = {}
proxies: Optional[list[str]] = []
class RetrieveScrapeJobs(pydantic.BaseModel):
user: str
@@ -64,3 +57,17 @@ class Job(pydantic.BaseModel):
job_options: JobOptions
status: str = "Queued"
chat: Optional[str] = None
class CronJob(pydantic.BaseModel):
id: Optional[str] = None
user_email: str
job_id: str
cron_expression: str
time_created: Optional[Union[datetime, str]] = None
time_updated: Optional[Union[datetime, str]] = None
class DeleteCronJob(pydantic.BaseModel):
id: str
user_email: str

View File

@@ -1,4 +1,5 @@
# STL
import datetime
import uuid
import traceback
from io import StringIO
@@ -10,24 +11,33 @@ import random
from fastapi import Depends, APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse, StreamingResponse
from api.backend.scheduler import scheduler
from apscheduler.triggers.cron import CronTrigger # type: ignore
# LOCAL
from api.backend.job import (
query,
insert,
update_job,
delete_jobs,
)
from api.backend.job import insert, update_job, delete_jobs
from api.backend.models import (
DeleteCronJob,
UpdateJobs,
DownloadJob,
FetchOptions,
DeleteScrapeJobs,
Job,
CronJob,
)
from api.backend.schemas import User
from api.backend.auth.auth_utils import get_current_user
from api.backend.utils import clean_text
from api.backend.utils import clean_text, format_list_for_query
from api.backend.job.models.job_options import FetchOptions
from api.backend.database.common import query
from api.backend.job.cron_scheduling.cron_scheduling import (
delete_cron_job,
get_cron_job_trigger,
insert_cron_job,
get_cron_jobs,
insert_job_from_cron_job,
)
LOG = logging.getLogger(__name__)
@@ -47,10 +57,11 @@ async def submit_scrape_job(job: Job):
job.id = uuid.uuid4().hex
job_dict = job.model_dump()
await insert(job_dict)
insert(job_dict)
return JSONResponse(content={"id": job.id})
except Exception as e:
LOG.error(f"Exception occurred: {traceback.format_exc()}")
return JSONResponse(content={"error": str(e)}, status_code=500)
@@ -59,8 +70,11 @@ async def retrieve_scrape_jobs(
fetch_options: FetchOptions, user: User = Depends(get_current_user)
):
LOG.info(f"Retrieving jobs for account: {user.email}")
ATTRIBUTES = "chat" if fetch_options.chat else "*"
try:
results = await query({"user": user.email}, fetch_options=fetch_options)
job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?"
results = query(job_query, (user.email,))
return JSONResponse(content=jsonable_encoder(results[::-1]))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
@@ -72,8 +86,8 @@ async def job(id: str, user: User = Depends(get_current_user)):
LOG.info(f"Retrieving jobs for account: {user.email}")
try:
filter = {"user": user.email, "id": id}
results = await query(filter)
job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?"
results = query(job_query, (user.email, id))
return JSONResponse(content=jsonable_encoder(results))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
@@ -85,7 +99,10 @@ async def download(download_job: DownloadJob):
LOG.info(f"Downloading job with ids: {download_job.ids}")
try:
results = await query({"id": {"$in": download_job.ids}})
job_query = (
f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}"
)
results = query(job_query, tuple(download_job.ids))
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
@@ -136,3 +153,47 @@ async def delete(delete_scrape_jobs: DeleteScrapeJobs):
if result
else JSONResponse({"error": "Jobs not deleted."})
)
@job_router.post("/schedule-cron-job")
async def schedule_cron_job(cron_job: CronJob):
if not cron_job.id:
cron_job.id = uuid.uuid4().hex
if not cron_job.time_created:
cron_job.time_created = datetime.datetime.now()
if not cron_job.time_updated:
cron_job.time_updated = datetime.datetime.now()
insert_cron_job(cron_job)
queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,))
scheduler.add_job(
insert_job_from_cron_job,
get_cron_job_trigger(cron_job.cron_expression),
id=cron_job.id,
args=[queried_job[0]],
)
return JSONResponse(content={"message": "Cron job scheduled successfully."})
@job_router.post("/delete-cron-job")
async def delete_cron_job_request(request: DeleteCronJob):
if not request.id:
return JSONResponse(
content={"error": "Cron job id is required."}, status_code=400
)
delete_cron_job(request.id, request.user_email)
scheduler.remove_job(request.id)
return JSONResponse(content={"message": "Cron job deleted successfully."})
@job_router.get("/cron-jobs")
async def get_cron_jobs_request(user: User = Depends(get_current_user)):
cron_jobs = get_cron_jobs(user.email)
return JSONResponse(content=jsonable_encoder(cron_jobs))
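
A client-side sketch of the new endpoints, assuming the API is reachable at localhost:8000 under an /api prefix and that a bearer token has been obtained from /auth/token:

# Hedged sketch: driving the cron-job endpoints with requests.
import requests

BASE = "http://localhost:8000/api"             # assumed mount point
HEADERS = {"Authorization": "Bearer <token>"}  # token from /auth/token

# id, time_created, and time_updated are filled in server-side if omitted.
resp = requests.post(
    f"{BASE}/schedule-cron-job",
    json={
        "user_email": "user@example.com",
        "job_id": "<existing-job-id>",
        "cron_expression": "*/15 * * * *",
    },
    headers=HEADERS,
)
print(resp.json())  # {"message": "Cron job scheduled successfully."}

# List the current user's cron jobs.
print(requests.get(f"{BASE}/cron-jobs", headers=HEADERS).json())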

api/backend/scheduler.py Normal file
View File

@@ -0,0 +1,3 @@
from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
scheduler = BackgroundScheduler()

View File

@@ -1,19 +1,20 @@
import logging
from typing import Any, Optional
import time
import random
from bs4 import BeautifulSoup
from lxml import etree
from seleniumwire import webdriver
from lxml.etree import _Element # type: ignore [reportPrivateImport]
from lxml.etree import _Element # pyright: ignore [reportPrivateUsage]
from fake_useragent import UserAgent
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options as ChromeOptions
from urllib.parse import urlparse, urljoin
from api.backend.models import Element, CapturedElement
from api.backend.job.site_mapping.site_mapping import (
handle_site_mapping,
)
from api.backend.job.scraping.scraping_utils import scrape_content
from api.backend.job.models.site_map import SiteMap
LOG = logging.getLogger(__name__)
@@ -95,6 +96,7 @@ async def make_site_request(
pages: set[tuple[str, str]] = set(),
original_url: str = "",
proxies: Optional[list[str]] = [],
site_map: Optional[dict[str, Any]] = None,
) -> None:
"""Make basic `GET` request to site using Selenium."""
# Check if URL has already been visited
@@ -114,27 +116,16 @@ async def make_site_request(
final_url = driver.current_url
visited_urls.add(url)
visited_urls.add(final_url)
_ = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "body"))
)
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
page_source = scrape_content(driver, pages)
time.sleep(3) # Wait for the page to load
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
final_height = driver.execute_script("return document.body.scrollHeight")
page_source = driver.page_source
LOG.debug(f"Page source for url: {url}\n{page_source}")
pages.add((page_source, final_url))
if site_map:
LOG.info("Site map: %s", site_map)
_ = await handle_site_mapping(
site_map,
driver,
pages,
)
finally:
driver.quit()
@@ -172,7 +163,10 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
el = sxpath(root, elem.xpath)
for e in el:
text = "\t".join(str(t) for t in e.itertext())
if isinstance(e, etree._Element):
text = "\t".join(str(t) for t in e.itertext())
else:
text = str(e)
captured_element = CapturedElement(
xpath=elem.xpath, text=text, name=elem.name
)
@@ -192,6 +186,7 @@ async def scrape(
headers: Optional[dict[str, Any]],
multi_page_scrape: bool = False,
proxies: Optional[list[str]] = [],
site_map: Optional[SiteMap] = None,
):
visited_urls: set[str] = set()
pages: set[tuple[str, str]] = set()
@@ -204,6 +199,7 @@ async def scrape(
pages=pages,
original_url=url,
proxies=proxies,
site_map=site_map,
)
elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()

View File

@@ -1,15 +1,10 @@
import pytest
import logging
from unittest.mock import AsyncMock, patch, MagicMock
from api.backend.tests.factories.job_factory import create_job
from api.backend.models import JobOptions
from api.backend.scraping import create_driver
mocked_job = create_job(
job_options=JobOptions(
multi_page_scrape=False, custom_headers={}, proxies=["127.0.0.1:8080"]
)
).model_dump()
logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger(__name__)
@pytest.mark.asyncio
@@ -26,8 +21,7 @@ async def test_proxy(mock_get: AsyncMock):
driver.get("http://example.com")
response = driver.last_request
# Check if the proxy header is set correctly
if response:
assert response.headers["Proxy"] == "127.0.0.1:8080"
assert response.headers["Proxy-Connection"] == "keep-alive"
driver.quit()

View File

@@ -1,5 +1,8 @@
from typing import Optional
from typing import Any, Optional
import logging
import json
LOG = logging.getLogger(__name__)
def clean_text(text: str):
@@ -17,3 +20,30 @@ def get_log_level(level_name: Optional[str]) -> int:
level = getattr(logging, level_name, logging.INFO)
return level
def format_list_for_query(ids: list[str]):
return (
f"({','.join(['?' for _ in ids])})" # Returns placeholders, e.g., "(?, ?, ?)"
)
def format_sql_row_to_python(row: dict[str, Any]):
new_row: dict[str, Any] = {}
for key, value in row.items():
if isinstance(value, str):
try:
new_row[key] = json.loads(value)
except json.JSONDecodeError:
new_row[key] = value
else:
new_row[key] = value
return new_row
def format_json(items: list[Any]):
for idx, item in enumerate(items):
if isinstance(item, (dict, list)):
formatted_item = json.dumps(item)
items[idx] = formatted_item
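
These two helpers are inverses across the SQLite boundary: format_json serializes Python structures before binding, and format_sql_row_to_python revives them on read. A quick round-trip sketch:

# Hedged sketch: round-tripping a JSON column through the helpers above.
from api.backend.utils import format_json, format_sql_row_to_python

values = ["job-id", [{"name": "title", "xpath": "//h1"}]]
format_json(values)  # mutates the list in place
print(values[1])     # '[{"name": "title", "xpath": "//h1"}]' -- now a str

row = {"id": "job-id", "elements": values[1], "status": "Queued"}
revived = format_sql_row_to_python(row)
print(type(revived["elements"]))  # <class 'list'> -- decoded back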

View File

@@ -24,6 +24,7 @@ async def process_job():
job["job_options"]["custom_headers"],
job["job_options"]["multi_page_scrape"],
job["job_options"]["proxies"],
job["job_options"]["site_map"],
)
LOG.info(
f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"

View File

@@ -10,5 +10,8 @@ services:
- "$PWD/package-lock.json:/app/package-lock.json"
- "$PWD/tsconfig.json:/app/tsconfig.json"
scraperr_api:
environment:
- LOG_LEVEL=INFO
volumes:
- "$PWD/api:/project/api"
- "$PWD/scraping:/project/scraping"

View File

@@ -23,25 +23,16 @@ services:
dockerfile: docker/api/Dockerfile
environment:
- LOG_LEVEL=INFO
- MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB
- SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string)
- SECRET_KEY=MRo9PfasPibnqFeK4Oswb6Z+PhFmjzdvxZzwdAkbf/Y= # used to encode authentication tokens (can be a random string)
- ALGORITHM=HS256 # authentication encoding algorithm
- ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token expire minutes
container_name: scraperr_api
ports:
- 8000:8000
volumes:
- "$PWD/data:/project/data"
- /var/run/docker.sock:/var/run/docker.sock
networks:
- web
mongo:
container_name: webscrape-mongo
image: mongo
restart: always
environment:
MONGO_INITDB_ROOT_USERNAME: root
MONGO_INITDB_ROOT_PASSWORD: example
networks:
- web
networks:
web:
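
The SECRET_KEY shown here is only a sample; since it signs authentication tokens, each deployment should generate its own random value. One way to do that (a sketch):

# Hedged sketch: generating a fresh SECRET_KEY for docker-compose.yml.
import secrets

# 32 random bytes, URL-safe base64 -- comparable to the sample value above.
print(secrets.token_urlsafe(32))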

View File

@@ -1,5 +1,5 @@
# Build next dependencies
FROM node:latest
FROM node:23.1
WORKDIR /app
COPY package*.json ./
@@ -15,6 +15,4 @@ COPY src /app/src
RUN npm run build
EXPOSE 3000
# CMD [ "npm", "run" ]
EXPOSE 3000

View File

@@ -1,4 +0,0 @@
tls:
certificates:
- certFile: /etc/certs/ssl-cert.pem
keyFile: /etc/certs/ssl-cert.key

View File

@@ -1,37 +0,0 @@
# STL
import os
# PDM
import boto3
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
def test_insert_and_delete():
# Get environment variables
region_name = os.getenv("AWS_REGION")
# Initialize DynamoDB resource
dynamodb = boto3.resource("dynamodb", region_name=region_name)
table = dynamodb.Table("scrape")
# Item to insert
item = {
"id": "123", # Replace with the appropriate id value
"attribute1": "value1",
"attribute2": "value2",
# Add more attributes as needed
}
# Insert the item
table.put_item(Item=item)
print(f"Inserted item: {item}")
# Delete the item
table.delete_item(Key={"id": "123"}) # Replace with the appropriate id value
print(f"Deleted item with id: {item['id']}")
if __name__ == "__main__":
test_insert_and_delete()

pdm.lock generated

File diff suppressed because it is too large.

View File

@@ -2,9 +2,7 @@
name = "web-scrape"
version = "0.1.0"
description = ""
authors = [
{name = "Jayden Pyles", email = "jpylesbuisness@gmail.com"},
]
authors = [{ name = "Jayden Pyles", email = "jpylesbuisness@gmail.com" }]
dependencies = [
"uvicorn>=0.30.1",
"fastapi>=0.111.0",
@@ -39,20 +37,19 @@ dependencies = [
"exceptiongroup>=1.2.2",
"Faker>=30.6.0",
"pytest-asyncio>=0.24.0",
"python-multipart>=0.0.12",
"python-multipart>=0.0.1",
"bcrypt==4.0.1",
"apscheduler>=3.11.0",
]
requires-python = ">=3.10"
readme = "README.md"
license = {text = "MIT"}
license = { text = "MIT" }
[tool.pdm]
distribution = true
[tool.pdm.dev-dependencies]
dev = [
"ipython>=8.26.0",
"pytest>=8.3.3",
]
dev = ["ipython>=8.26.0", "pytest>=8.3.3"]
[tool.pyright]
include = ["./api/backend/"]
exclude = ["**/node_modules", "**/__pycache__"]
@@ -60,14 +57,42 @@ ignore = []
defineConstant = { DEBUG = true }
stubPath = ""
reportUnknownMemberType= false
reportMissingImports = true
reportMissingTypeStubs = false
reportAny = false
reportCallInDefaultInitializer = false
# Type checking strictness
typeCheckingMode = "strict" # Enables strict type checking mode
reportPrivateUsage = "error"
reportMissingTypeStubs = "error"
reportUntypedFunctionDecorator = "error"
reportUntypedClassDecorator = "error"
reportUntypedBaseClass = "error"
reportInvalidTypeVarUse = "error"
reportUnnecessaryTypeIgnoreComment = "information"
reportUnknownVariableType = "none"
reportUnknownMemberType = "none"
reportUnknownParameterType = "none"
pythonVersion = "3.9"
pythonPlatform = "Linux"
# Additional checks
reportImplicitStringConcatenation = "error"
reportInvalidStringEscapeSequence = "error"
reportMissingImports = "error"
reportMissingModuleSource = "error"
reportOptionalCall = "error"
reportOptionalIterable = "error"
reportOptionalMemberAccess = "error"
reportOptionalOperand = "error"
reportOptionalSubscript = "error"
reportTypedDictNotRequiredAccess = "error"
# Function return type checking
reportIncompleteStub = "error"
reportIncompatibleMethodOverride = "error"
reportInvalidStubStatement = "error"
reportInconsistentOverload = "error"
# Misc settings
pythonVersion = "3.10" # Matches your Python version from pyproject.toml
strictListInference = true
strictDictionaryInference = true
strictSetInference = true
[tool.isort]

View File

@@ -2,7 +2,7 @@
import React from "react";
import { useAuth } from "../../../contexts/AuthContext";
import { Box, Drawer, Divider } from "@mui/material";
import { Box, Drawer } from "@mui/material";
import { QuickSettings } from "../../nav/quick-settings";
import { NavItems } from "./nav-items/nav-items";

View File

@@ -7,6 +7,7 @@ import TerminalIcon from "@mui/icons-material/Terminal";
import BarChart from "@mui/icons-material/BarChart";
import AutoAwesomeIcon from "@mui/icons-material/AutoAwesome";
import { List } from "@mui/material";
import { Schedule } from "@mui/icons-material";
const items = [
{
@@ -34,6 +35,11 @@ const items = [
text: "View App Logs",
href: "/logs",
},
{
icon: <Schedule />,
text: "Cron Jobs",
href: "/cron-jobs",
},
];
export const NavItems = () => {

View File

@@ -15,6 +15,7 @@ import {
Button,
Tooltip,
IconButton,
TableContainer,
} from "@mui/material";
import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
import StarIcon from "@mui/icons-material/Star";
@@ -52,145 +53,155 @@ export const JobQueue = ({
const router = useRouter();
return (
<Table sx={{ tableLayout: "fixed", width: "100%" }}>
<TableHead>
<TableRow>
<TableCell>Select</TableCell>
<TableCell>Id</TableCell>
<TableCell>Url</TableCell>
<TableCell>Elements</TableCell>
<TableCell>Result</TableCell>
<TableCell>Time Created</TableCell>
<TableCell>Status</TableCell>
<TableCell>Actions</TableCell>
</TableRow>
</TableHead>
<TableBody>
{filteredJobs.map((row, index) => (
<TableRow key={index}>
<TableCell padding="checkbox">
<Checkbox
checked={selectedJobs.has(row.id)}
onChange={() => onSelectJob(row.id)}
/>
<Tooltip title="Chat with AI">
<span>
<IconButton
onClick={() => {
router.push({
pathname: "/chat",
query: {
job: row.id,
},
});
}}
>
<AutoAwesome />
</IconButton>
</span>
</Tooltip>
<Tooltip title="Favorite Job">
<span>
<IconButton
color={row.favorite ? "warning" : "default"}
onClick={() => {
onFavorite([row.id], "favorite", !row.favorite);
row.favorite = !row.favorite;
}}
>
<StarIcon />
</IconButton>
</span>
</Tooltip>
</TableCell>
<TableCell sx={{ maxWidth: 100, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.id}</Box>
</TableCell>
<TableCell sx={{ maxWidth: 200, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.url}</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
{JSON.stringify(row.elements)}
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto", padding: 0 }}>
<Accordion sx={{ margin: 0, padding: 0.5 }}>
<AccordionSummary
expandIcon={<ExpandMoreIcon />}
aria-controls="panel1a-content"
id="panel1a-header"
sx={{
minHeight: 0,
"&.Mui-expanded": { minHeight: 0 },
}}
>
<Box
sx={{
maxHeight: 150,
overflow: "auto",
width: "100%",
}}
>
<Typography sx={{ fontSize: "0.875rem" }}>
Show Result
</Typography>
</Box>
</AccordionSummary>
<AccordionDetails sx={{ padding: 1 }}>
<Box sx={{ maxHeight: 200, overflow: "auto" }}>
<Typography
sx={{
fontSize: "0.875rem",
whiteSpace: "pre-wrap",
<TableContainer component={Box} sx={{ maxHeight: "90dvh" }}>
<Table sx={{ tableLayout: "fixed", width: "100%" }}>
<TableHead>
<TableRow>
<TableCell>Select</TableCell>
<TableCell>Id</TableCell>
<TableCell>Url</TableCell>
<TableCell>Elements</TableCell>
<TableCell>Result</TableCell>
<TableCell>Time Created</TableCell>
<TableCell>Status</TableCell>
<TableCell>Actions</TableCell>
</TableRow>
</TableHead>
<TableBody sx={{ overflow: "auto" }}>
{filteredJobs.map((row, index) => (
<TableRow key={index}>
<TableCell padding="checkbox">
<Checkbox
checked={selectedJobs.has(row.id)}
onChange={() => onSelectJob(row.id)}
/>
<Tooltip title="Chat with AI">
<span>
<IconButton
onClick={() => {
router.push({
pathname: "/chat",
query: {
job: row.id,
},
});
}}
>
{JSON.stringify(row.result, null, 2)}
</Typography>
</Box>
</AccordionDetails>
</Accordion>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
{new Date(row.time_created).toLocaleString()}
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 50, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
<Box
className="rounded-md p-2 text-center"
sx={{ bgcolor: colors[row.status] }}
>
{row.status}
<AutoAwesome />
</IconButton>
</span>
</Tooltip>
<Tooltip title="Favorite Job">
<span>
<IconButton
color={row.favorite ? "warning" : "default"}
onClick={() => {
onFavorite([row.id], "favorite", !row.favorite);
row.favorite = !row.favorite;
}}
>
<StarIcon />
</IconButton>
</span>
</Tooltip>
</TableCell>
<TableCell sx={{ maxWidth: 100, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.id}</Box>
</TableCell>
<TableCell sx={{ maxWidth: 200, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.url}</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
{JSON.stringify(row.elements)}
</Box>
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ display: "flex", gap: 1 }}>
<Button
onClick={() => {
onDownload([row.id]);
}}
size="small"
sx={{ minWidth: 0, padding: "4px 8px" }}
>
Download
</Button>
<Button
onClick={() =>
onNavigate(row.elements, row.url, row.job_options)
}
size="small"
sx={{ minWidth: 0, padding: "4px 8px" }}
>
Rerun
</Button>
</Box>
</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto", padding: 0 }}>
<Accordion sx={{ margin: 0, padding: 0.5 }}>
<AccordionSummary
expandIcon={<ExpandMoreIcon />}
aria-controls="panel1a-content"
id="panel1a-header"
sx={{
minHeight: 0,
"&.Mui-expanded": { minHeight: 0 },
}}
>
<Box
sx={{
maxHeight: 150,
overflow: "auto",
width: "100%",
}}
>
<Typography sx={{ fontSize: "0.875rem" }}>
Show Result
</Typography>
</Box>
</AccordionSummary>
<AccordionDetails sx={{ padding: 1 }}>
<Box sx={{ maxHeight: 200, overflow: "auto" }}>
<Typography
sx={{
fontSize: "0.875rem",
whiteSpace: "pre-wrap",
}}
>
{JSON.stringify(row.result, null, 2)}
</Typography>
</Box>
</AccordionDetails>
</Accordion>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
{new Date(row.time_created).toLocaleString()}
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 50, overflow: "auto" }}>
<Box sx={{ maxHeight: 100, overflow: "auto" }}>
<Box
className="rounded-md p-2 text-center"
sx={{ bgcolor: colors[row.status] }}
>
{row.status}
</Box>
</Box>
</TableCell>
<TableCell sx={{ maxWidth: 150, overflow: "auto" }}>
<Box sx={{ display: "flex", gap: 1 }}>
<Button
onClick={() => {
onDownload([row.id]);
}}
size="small"
sx={{
minWidth: 0,
padding: "4px 8px",
fontSize: "0.625rem",
}}
>
Download
</Button>
<Button
onClick={() =>
onNavigate(row.elements, row.url, row.job_options)
}
size="small"
sx={{
minWidth: 0,
padding: "4px 8px",
fontSize: "0.625rem",
}}
>
Rerun
</Button>
</Box>
</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</TableContainer>
);
};

View File

@@ -0,0 +1,182 @@
import { Job } from "@/types";
import {
Button,
Dialog,
DialogTitle,
DialogContent,
TextField,
Snackbar,
Alert,
} from "@mui/material";
import Cookies from "js-cookie";
import { useState } from "react";
export type CreateCronJobsProps = {
availableJobs: Job[];
user: any;
};
export const CreateCronJobs = ({
availableJobs,
user,
}: CreateCronJobsProps) => {
const [open, setOpen] = useState(false);
return (
<>
<Button
variant="contained"
color="primary"
onClick={() => setOpen(true)}
sx={{ borderRadius: 2 }}
>
Create Cron Job
</Button>
<CreateCronJobDialog
open={open}
onClose={() => setOpen(false)}
availableJobs={availableJobs}
user={user}
/>
</>
);
};
const CreateCronJobDialog = ({
open,
onClose,
availableJobs,
user,
}: {
open: boolean;
onClose: () => void;
availableJobs: Job[];
user: any;
}) => {
const [cronExpression, setCronExpression] = useState("");
const [jobId, setJobId] = useState("");
const [successOpen, setSuccessOpen] = useState(false);
const [isSubmitting, setIsSubmitting] = useState(false);
const [error, setError] = useState("");
const handleSubmit = async () => {
if (!cronExpression || !jobId) {
setError("Please fill in all fields");
return;
}
setIsSubmitting(true);
const token = Cookies.get("token");
try {
const response = await fetch("/api/schedule-cron-job", {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${token}`,
},
body: JSON.stringify({
data: {
cron_expression: cronExpression,
job_id: jobId,
user_email: user.email,
},
}),
});
if (!response.ok) {
throw new Error("Failed to schedule job");
}
setSuccessOpen(true);
setCronExpression("");
setJobId("");
setTimeout(() => {
onClose();
}, 1500);
window.location.reload();
} catch (error) {
console.error(error);
setError("Failed to create cron job");
} finally {
setIsSubmitting(false);
}
};
const handleClose = () => {
setSuccessOpen(false);
};
return (
<>
<Dialog
open={open}
onClose={onClose}
PaperProps={{
sx: { borderRadius: 2, minWidth: "400px" },
}}
>
<DialogTitle sx={{ fontWeight: 500 }}>Create Cron Job</DialogTitle>
<DialogContent>
<div className="flex flex-col gap-1 mt0">
<TextField
label="Cron Expression"
fullWidth
value={cronExpression}
onChange={(e) => setCronExpression(e.target.value)}
variant="outlined"
placeholder="* * * * *"
margin="normal"
helperText="Format: minute hour day month day-of-week"
/>
<TextField
label="Job ID"
fullWidth
value={jobId}
onChange={(e) => setJobId(e.target.value)}
variant="outlined"
margin="normal"
/>
{error && (
<Alert severity="error" sx={{ mt: 2 }}>
{error}
</Alert>
)}
<div className="flex justify-end gap-2 mt-4">
<Button
variant="outlined"
onClick={onClose}
sx={{ borderRadius: 2 }}
>
Cancel
</Button>
<Button
variant="contained"
color="primary"
onClick={handleSubmit}
disabled={isSubmitting}
sx={{ borderRadius: 2 }}
>
{isSubmitting ? "Submitting..." : "Create Job"}
</Button>
</div>
</div>
</DialogContent>
</Dialog>
<Snackbar
open={successOpen}
autoHideDuration={4000}
onClose={handleClose}
anchorOrigin={{ vertical: "bottom", horizontal: "right" }}
>
<Alert onClose={handleClose} severity="success" sx={{ width: "100%" }}>
Cron job created successfully!
</Alert>
</Snackbar>
</>
);
};

View File

@@ -0,0 +1 @@
export * from "./create-cron-jobs";

View File

@@ -0,0 +1,92 @@
import { Job, CronJob } from "@/types/job";
import { useState, useEffect } from "react";
import { CreateCronJobs } from "./create-cron-jobs";
import {
Table,
TableHead,
TableRow,
TableCell,
TableBody,
Button,
} from "@mui/material";
import Cookies from "js-cookie";
export type CronJobsProps = {
initialJobs: Job[];
initialCronJobs: CronJob[];
initialUser: any;
};
export const CronJobs = ({
initialJobs,
initialCronJobs,
initialUser,
}: CronJobsProps) => {
const [jobs, setJobs] = useState<Job[]>(initialJobs);
const [cronJobs, setCronJobs] = useState<CronJob[]>(initialCronJobs);
const [user, setUser] = useState<any>(initialUser);
useEffect(() => {
setJobs(initialJobs);
setCronJobs(initialCronJobs);
setUser(initialUser);
}, [initialJobs, initialCronJobs, initialUser]);
const handleDeleteCronJob = async (id: string) => {
const token = Cookies.get("token");
const response = await fetch("/api/delete-cron-job", {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${token}`,
},
body: JSON.stringify({ data: { id, user_email: user.email } }),
});
if (response.ok) {
console.log("Cron job deleted successfully");
setCronJobs(cronJobs.filter((cronJob) => cronJob.id !== id));
} else {
console.error("Failed to delete cron job");
}
};
return (
<div>
<CreateCronJobs availableJobs={jobs} user={user} />
<Table>
<TableHead>
<TableRow>
<TableCell>Cron Expression</TableCell>
<TableCell>Job ID</TableCell>
<TableCell>User Email</TableCell>
<TableCell>Created At</TableCell>
<TableCell>Updated At</TableCell>
<TableCell>Actions</TableCell>
</TableRow>
</TableHead>
<TableBody>
{cronJobs.map((cronJob) => (
<TableRow key={cronJob.id}>
<TableCell>{cronJob.cron_expression}</TableCell>
<TableCell>{cronJob.job_id}</TableCell>
<TableCell>{cronJob.user_email}</TableCell>
<TableCell>
{new Date(cronJob.time_created).toLocaleString()}
</TableCell>
<TableCell>
{new Date(cronJob.time_updated).toLocaleString()}
</TableCell>
<TableCell>
<Button onClick={() => handleDeleteCronJob(cronJob.id)}>
Delete
</Button>
</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</div>
);
};

View File

@@ -0,0 +1,62 @@
import axios from "axios";
import { GetServerSideProps } from "next";
import { parseCookies } from "nookies";
import { CronJob, Job } from "../../../types";
export const getServerSideProps: GetServerSideProps = async (context) => {
const { req } = context;
const cookies = parseCookies({ req });
const token = cookies.token;
let user = null;
let initialJobs: Job[] = [];
let initialCronJobs: CronJob[] = [];
if (token) {
try {
const userResponse = await axios.get(
`${process.env.NEXT_PUBLIC_API_URL}/api/auth/users/me`,
{
headers: { Authorization: `Bearer ${token}` },
}
);
user = userResponse.data;
const jobsResponse = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/api/retrieve-scrape-jobs`,
{
method: "POST",
body: JSON.stringify({ user: user.email }),
headers: {
"content-type": "application/json",
Authorization: `Bearer ${token}`,
},
}
);
initialJobs = await jobsResponse.json();
console.log(initialJobs);
const cronJobsResponse = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/api/cron-jobs`,
{
headers: {
"content-type": "application/json",
Authorization: `Bearer ${token}`,
},
}
);
initialCronJobs = await cronJobsResponse.json();
} catch (error) {
console.error("Error fetching user or jobs:", error);
}
}
return {
props: {
initialJobs,
initialUser: user,
initialCronJobs,
},
};
};

View File

@@ -0,0 +1 @@
export { CronJobs } from "./cron-jobs";

View File

@@ -0,0 +1,107 @@
"use client";
import React, { useState, useEffect, useRef } from "react";
import { Button, Container, Box, Snackbar, Alert } from "@mui/material";
import { useRouter } from "next/router";
import { Element, Result } from "@/types";
import { ElementTable, JobSubmitter } from "@/components/submit/job-submitter";
import { useJobSubmitterProvider } from "@/components/submit/job-submitter/provider";
export const Home = () => {
const {
submittedURL,
setSubmittedURL,
rows,
setRows,
results,
snackbarOpen,
setSnackbarOpen,
snackbarMessage,
snackbarSeverity,
} = useJobSubmitterProvider();
const router = useRouter();
const { elements, url } = router.query;
const resultsRef = useRef<HTMLTableElement | null>(null);
useEffect(() => {
if (elements) {
setRows(JSON.parse(elements as string));
}
if (url) {
setSubmittedURL(url as string);
}
}, [elements, url]);
useEffect(() => {
if (results && resultsRef.current) {
resultsRef.current.scrollIntoView({ behavior: "smooth" });
}
}, [results]);
const handleCloseSnackbar = () => {
setSnackbarOpen(false);
};
const ErrorSnackbar = () => {
return (
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={handleCloseSnackbar}
>
<Alert onClose={handleCloseSnackbar} severity="error">
{snackbarMessage}
</Alert>
</Snackbar>
);
};
const NotifySnackbar = () => {
const goTo = () => {
router.push("/jobs");
};
const action = (
<Button color="inherit" size="small" onClick={goTo}>
Go To Job
</Button>
);
return (
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={handleCloseSnackbar}
>
<Alert onClose={handleCloseSnackbar} severity="info" action={action}>
{snackbarMessage}
</Alert>
</Snackbar>
);
};
return (
<Box
bgcolor="background.default"
display="flex"
flexDirection="column"
justifyContent="center"
alignItems="center"
height="100%"
py={4}
>
<Container maxWidth="lg" className="overflow-y-auto max-h-full">
<JobSubmitter />
{submittedURL.length ? (
<ElementTable
rows={rows}
setRows={setRows}
submittedURL={submittedURL}
/>
) : null}
</Container>
{snackbarSeverity === "info" ? <NotifySnackbar /> : <ErrorSnackbar />}
</Box>
);
};

View File

@@ -0,0 +1 @@
export * from "./home";

View File

@@ -1,2 +1 @@
export * from "./ElementTable";
export * from "./job-submitter";

View File

@@ -15,9 +15,11 @@ import {
IconButton,
Tooltip,
useTheme,
Divider,
} from "@mui/material";
import AddIcon from "@mui/icons-material/Add";
import { Element } from "../../types";
import { Element } from "@/types";
import { SiteMap } from "../site-map";
interface Props {
rows: Element[];
@@ -169,6 +171,13 @@ export const ElementTable = ({ rows, setRows, submittedURL }: Props) => {
</div>
</TableContainer>
</Box>
<Divider
sx={{
borderColor: theme.palette.mode === "dark" ? "#ffffff" : "0000000",
marginBottom: 2,
}}
/>
<SiteMap />
</Box>
);
};

View File

@@ -0,0 +1 @@
export { ElementTable } from "./element-table";

View File

@@ -1 +1,2 @@
export { JobSubmitter } from "./job-submitter";
export { ElementTable } from "./element-table";

View File

@@ -1,26 +1,20 @@
import React, { Dispatch } from "react";
import React from "react";
import { TextField, Button, CircularProgress } from "@mui/material";
import { Element } from "@/types";
import { useJobSubmitterProvider } from "../provider";
export type JobSubmitterInputProps = {
submittedURL: string;
setSubmittedURL: Dispatch<React.SetStateAction<string>>;
isValidURL: boolean;
urlError: string | null;
handleSubmit: () => void;
loading: boolean;
rows: Element[];
};
export const JobSubmitterInput = ({
submittedURL,
setSubmittedURL,
isValidURL,
urlError,
handleSubmit,
loading,
rows,
urlError,
}: JobSubmitterInputProps) => {
const { submittedURL, setSubmittedURL, isValidURL, rows } =
useJobSubmitterProvider();
return (
<div className="flex flex-row space-x-4 items-center mb-2">
<TextField

View File

@@ -1,6 +1,7 @@
import { RawJobOptions } from "@/types/job";
import { Box, FormControlLabel, Checkbox, TextField } from "@mui/material";
import { Dispatch, SetStateAction } from "react";
import { useJobSubmitterProvider } from "../provider";
export type JobSubmitterOptionsProps = {
jobOptions: RawJobOptions;
@@ -14,9 +15,9 @@ export type JobSubmitterOptionsProps = {
export const JobSubmitterOptions = ({
jobOptions,
setJobOptions,
handleSelectProxies,
customJSONSelected,
setCustomJSONSelected,
handleSelectProxies,
proxiesSelected,
}: JobSubmitterOptionsProps) => {
const handleMultiPageScrapeChange = () => {

View File

@@ -1,7 +1,6 @@
"use client";
import React, { useEffect, useState, Dispatch } from "react";
import { Element } from "@/types";
import React, { useEffect, useState } from "react";
import { useAuth } from "@/contexts/AuthContext";
import { useRouter } from "next/router";
import { RawJobOptions } from "@/types/job";
@@ -10,21 +9,7 @@ import { JobSubmitterHeader } from "./job-submitter-header";
import { JobSubmitterInput } from "./job-submitter-input";
import { JobSubmitterOptions } from "./job-submitter-options";
import { ApiService } from "@/services";
interface StateProps {
submittedURL: string;
setSubmittedURL: Dispatch<React.SetStateAction<string>>;
rows: Element[];
isValidURL: boolean;
setIsValidUrl: Dispatch<React.SetStateAction<boolean>>;
setSnackbarMessage: Dispatch<React.SetStateAction<string>>;
setSnackbarOpen: Dispatch<React.SetStateAction<boolean>>;
setSnackbarSeverity: Dispatch<React.SetStateAction<string>>;
}
interface Props {
stateProps: StateProps;
}
import { useJobSubmitterProvider } from "./provider";
const initialJobOptions: RawJobOptions = {
multi_page_scrape: false,
@@ -32,7 +17,7 @@ const initialJobOptions: RawJobOptions = {
proxies: null,
};
export const JobSubmitter = ({ stateProps }: Props) => {
export const JobSubmitter = () => {
const { user } = useAuth();
const router = useRouter();
const { job_options } = router.query;
@@ -40,11 +25,13 @@ export const JobSubmitter = ({ stateProps }: Props) => {
const {
submittedURL,
rows,
siteMap,
setIsValidUrl,
setSnackbarMessage,
setSnackbarOpen,
setSnackbarSeverity,
} = stateProps;
setSiteMap,
} = useJobSubmitterProvider();
const [urlError, setUrlError] = useState<string | null>(null);
const [loading, setLoading] = useState<boolean>(false);
@@ -87,7 +74,8 @@ export const JobSubmitter = ({ stateProps }: Props) => {
rows,
user,
jobOptions,
customHeaders
customHeaders,
siteMap
)
.then(async (response) => {
if (!response.ok) {
@@ -120,31 +108,28 @@ export const JobSubmitter = ({ stateProps }: Props) => {
job_options as string,
setCustomJSONSelected,
setProxiesSelected,
setJobOptions
setJobOptions,
setSiteMap
);
}
}, [job_options]);
return (
<>
<div>
<JobSubmitterHeader />
<JobSubmitterInput
{...stateProps}
urlError={urlError}
handleSubmit={handleSubmit}
loading={loading}
/>
<JobSubmitterOptions
{...stateProps}
jobOptions={jobOptions}
setJobOptions={setJobOptions}
customJSONSelected={customJSONSelected}
setCustomJSONSelected={setCustomJSONSelected}
handleSelectProxies={handleSelectProxies}
proxiesSelected={proxiesSelected}
/>
</div>
</>
<div>
<JobSubmitterHeader />
<JobSubmitterInput
urlError={urlError}
handleSubmit={handleSubmit}
loading={loading}
/>
<JobSubmitterOptions
jobOptions={jobOptions}
setJobOptions={setJobOptions}
customJSONSelected={customJSONSelected}
setCustomJSONSelected={setCustomJSONSelected}
handleSelectProxies={handleSelectProxies}
proxiesSelected={proxiesSelected}
/>
</div>
);
};

View File

@@ -0,0 +1,84 @@
import React, {
createContext,
PropsWithChildren,
useContext,
useState,
Dispatch,
useMemo,
} from "react";
import { Element, Result, SiteMap } from "@/types";
type JobSubmitterProviderType = {
submittedURL: string;
setSubmittedURL: Dispatch<React.SetStateAction<string>>;
rows: Element[];
setRows: Dispatch<React.SetStateAction<Element[]>>;
results: Result;
setResults: Dispatch<React.SetStateAction<Result>>;
snackbarOpen: boolean;
setSnackbarOpen: Dispatch<React.SetStateAction<boolean>>;
snackbarMessage: string;
setSnackbarMessage: Dispatch<React.SetStateAction<string>>;
snackbarSeverity: string;
setSnackbarSeverity: Dispatch<React.SetStateAction<string>>;
isValidURL: boolean;
setIsValidUrl: Dispatch<React.SetStateAction<boolean>>;
siteMap: SiteMap | null;
setSiteMap: Dispatch<React.SetStateAction<SiteMap | null>>;
};
const JobSubmitterProvider = createContext<JobSubmitterProviderType>(
{} as JobSubmitterProviderType
);
export const Provider = ({ children }: PropsWithChildren) => {
const [submittedURL, setSubmittedURL] = useState<string>("");
const [rows, setRows] = useState<Element[]>([]);
const [results, setResults] = useState<Result>({});
const [snackbarOpen, setSnackbarOpen] = useState<boolean>(false);
const [snackbarMessage, setSnackbarMessage] = useState<string>("");
const [snackbarSeverity, setSnackbarSeverity] = useState<string>("error");
const [isValidURL, setIsValidUrl] = useState<boolean>(true);
const [siteMap, setSiteMap] = useState<SiteMap | null>(null);
const value: JobSubmitterProviderType = useMemo(
() => ({
submittedURL,
setSubmittedURL,
rows,
setRows,
results,
setResults,
snackbarOpen,
setSnackbarOpen,
snackbarMessage,
setSnackbarMessage,
snackbarSeverity,
setSnackbarSeverity,
isValidURL,
setIsValidUrl,
siteMap,
setSiteMap,
}),
[
submittedURL,
rows,
results,
snackbarOpen,
snackbarMessage,
snackbarSeverity,
isValidURL,
siteMap,
]
);
return (
<JobSubmitterProvider.Provider value={value}>
{children}
</JobSubmitterProvider.Provider>
);
};
export const useJobSubmitterProvider = () => {
return useContext(JobSubmitterProvider);
};
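
For orientation, a minimal sketch of consuming this context; only Provider and useJobSubmitterProvider come from the module above, while UrlBadge and Page are hypothetical:

import {
  Provider,
  useJobSubmitterProvider,
} from "@/components/submit/job-submitter/provider";

// Hypothetical consumer: any component under <Provider> shares one state tree.
const UrlBadge = () => {
  const { submittedURL, setSubmittedURL } = useJobSubmitterProvider();
  return (
    <button onClick={() => setSubmittedURL("https://example.com")}>
      {submittedURL || "no URL yet"}
    </button>
  );
};

export const Page = () => (
  <Provider>
    <UrlBadge />
  </Provider>
);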

View File

@@ -0,0 +1 @@
export * from "./site-map";

View File

@@ -0,0 +1 @@
export * from "./site-map-input";

View File

@@ -0,0 +1,22 @@
.button {
height: 3rem;
width: 2rem;
color: #ffffff;
font-weight: 600;
border-radius: 0.375rem;
transition: transform 0.2s ease-in-out;
transform: scale(1);
}
.button:hover {
transform: scale(1.05);
}
.remove {
background-color: var(--delete-red) !important;
}
.remove:hover {
background-color: var(--delete-red-hover) !important;
}

View File

@@ -0,0 +1,135 @@
import { useState } from "react";
import { useJobSubmitterProvider } from "../../provider";
import {
MenuItem,
Select,
TextField,
FormControl,
Button,
Checkbox,
FormControlLabel,
} from "@mui/material";
import { ActionOption } from "@/types/job";
import classes from "./site-map-input.module.css";
import { clsx } from "clsx";
export type SiteMapInputProps = {
disabled?: boolean;
xpath?: string;
option?: ActionOption;
clickOnce?: boolean;
input?: string;
};
export const SiteMapInput = ({
disabled,
xpath,
option,
clickOnce,
input,
}: SiteMapInputProps) => {
const [optionState, setOptionState] = useState<ActionOption>(
option || "click"
);
const [xpathState, setXpathState] = useState<string>(xpath || "");
const [clickOnceState, setClickOnceState] = useState<boolean>(
clickOnce || false
);
const [inputState, setInputState] = useState<string>(input || "");
const { siteMap, setSiteMap } = useJobSubmitterProvider();
const handleAdd = () => {
if (!siteMap) return;
setSiteMap((prevSiteMap) => ({
...prevSiteMap,
actions: [
{
type: optionState,
xpath: xpathState,
name: "",
do_once: clickOnceState,
input: inputState,
},
...(prevSiteMap?.actions || []),
],
}));
setXpathState("");
};
const handleRemove = () => {
if (!siteMap) return;
setSiteMap((prevSiteMap) => ({
...prevSiteMap,
actions: (prevSiteMap?.actions || []).slice(0, -1),
}));
};
return (
<div className="flex flex-col gap-2 w-full">
<div className="flex gap-2 items-center">
<FormControl className="w-1/4">
<Select
disabled={disabled}
displayEmpty
value={optionState}
onChange={(e) => setOptionState(e.target.value as ActionOption)}
>
<MenuItem value="click">Click</MenuItem>
<MenuItem value="input">Input</MenuItem>
</Select>
</FormControl>
{optionState === "input" && (
<TextField
label="Input Text"
fullWidth
value={inputState}
onChange={(e) => setInputState(e.target.value)}
disabled={disabled}
/>
)}
<TextField
label="XPath Selector"
fullWidth
value={xpathState}
onChange={(e) => setXpathState(e.target.value)}
disabled={disabled}
/>
{disabled ? (
<Button
onClick={handleRemove}
className={clsx(classes.button, classes.remove)}
>
Delete
</Button>
) : (
<Button
onClick={handleAdd}
disabled={!xpathState}
className={clsx(classes.button, classes.add)}
>
Add
</Button>
)}
</div>
{!disabled && (
<FormControlLabel
label="Do Once"
control={
<Checkbox
checked={clickOnceState}
disabled={disabled}
onChange={() => setClickOnceState(!clickOnceState)}
/>
}
/>
)}
</div>
);
};

View File

@@ -0,0 +1,70 @@
import { useEffect, useState } from "react";
import { useJobSubmitterProvider } from "../provider";
import { Button, Divider, Typography, useTheme } from "@mui/material";
import { SiteMapInput } from "./site-map-input";
export const SiteMap = () => {
const { siteMap, setSiteMap } = useJobSubmitterProvider();
const [showSiteMap, setShowSiteMap] = useState<boolean>(false);
const theme = useTheme();
const handleCreateSiteMap = () => {
setSiteMap({ actions: [] });
setShowSiteMap(true);
};
const handleClearSiteMap = () => {
setSiteMap(null);
setShowSiteMap(false);
};
useEffect(() => {
if (siteMap) {
setShowSiteMap(true);
}
}, [siteMap]);
return (
<div className="flex flex-col gap-4">
{siteMap ? (
<Button onClick={handleClearSiteMap}>Clear Site Map</Button>
) : (
<Button onClick={handleCreateSiteMap}>Create Site Map</Button>
)}
{showSiteMap && (
<div className="flex flex-col gap-4">
<SiteMapInput />
{siteMap?.actions && siteMap?.actions.length > 0 && (
<>
<Divider
sx={{
borderColor:
theme.palette.mode === "dark" ? "#ffffff" : "#000000",
}}
/>
<Typography className="w-full text-center" variant="h5">
Site Map Actions
</Typography>
</>
)}
<ul className="flex flex-col gap-4">
{/* copy before reversing so render does not mutate provider state */}
{[...(siteMap?.actions ?? [])].reverse().map((action, index) => (
<li key={action.xpath} className="flex w-full items-center">
<Typography variant="h6" className="w-[10%] mr-2">
Action {index + 1}:
</Typography>
<SiteMapInput
disabled={Boolean(siteMap)}
xpath={action.xpath}
option={action.type}
clickOnce={action.do_once}
input={action.input}
/>
</li>
))}
</ul>
</div>
)}
</div>
);
};

View File

@@ -1,6 +1,5 @@
import React, { createContext, useContext, useState, useEffect } from "react";
import axios from "axios";
import { Constants } from "../lib";
import Cookies from "js-cookie";
interface AuthContextProps {
@@ -55,6 +54,7 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
const userResponse = await axios.get(`/api/me`, {
headers: { Authorization: `Bearer ${response.data.access_token}` },
});
setUser(userResponse.data);
setIsAuthenticated(true);
};

View File

@@ -1,15 +1,17 @@
import { Dispatch, SetStateAction } from "react";
import { RawJobOptions } from "@/types";
import { RawJobOptions, SiteMap } from "@/types";
export const parseJobOptions = (
job_options: string,
setCustomJSONSelected: Dispatch<SetStateAction<boolean>>,
setProxiesSelected: Dispatch<SetStateAction<boolean>>,
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>
setJobOptions: Dispatch<SetStateAction<RawJobOptions>>,
setSiteMap: Dispatch<SetStateAction<SiteMap | null>>
) => {
if (job_options) {
const jsonOptions = JSON.parse(job_options as string);
const newJobOptions: RawJobOptions = {
multi_page_scrape: false,
custom_headers: null,
@@ -31,6 +33,10 @@ export const parseJobOptions = (
newJobOptions.proxies = jsonOptions.proxies.join(",");
}
if (jsonOptions.site_map) {
setSiteMap(jsonOptions.site_map);
}
setJobOptions(newJobOptions);
}
};
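
For reference, a sketch of a job_options query value this parser accepts; the field names come from the branches above and from RawJobOptions, the values are illustrative:

// Illustrative only; proxies is joined back into a comma-separated string,
// and site_map is handed to setSiteMap as-is.
const job_options = JSON.stringify({
  multi_page_scrape: true,
  custom_headers: { "User-Agent": "scraperr" },
  proxies: ["http://proxy-1:8080", "http://proxy-2:8080"],
  site_map: {
    actions: [{ type: "click", xpath: "//button", name: "", do_once: true }],
  },
});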

View File

@@ -0,0 +1,39 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
if (req.method === "POST") {
const { data } = req.body;
console.log("Data", data);
const headers = new Headers();
headers.set("content-type", "application/json");
try {
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/delete-cron-job`,
{
method: "POST",
headers,
body: JSON.stringify(data),
}
);
if (!response.ok) {
console.error(response);
throw new Error(`Error: ${response.statusText}`);
}
const result = await response.json();
res.status(200).json(result);
} catch (error) {
console.error("Error deleting cron job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
} else {
res.setHeader("Allow", ["POST"]);
res.status(405).end(`Method ${req.method} Not Allowed`);
}
}

View File

@@ -0,0 +1,39 @@
import { NextApiRequest, NextApiResponse } from "next";
export default async function handler(
req: NextApiRequest,
res: NextApiResponse
) {
if (req.method === "POST") {
const { data } = req.body;
console.log("Data", data);
const headers = new Headers();
headers.set("content-type", "application/json");
try {
const response = await fetch(
`${global.process.env.NEXT_PUBLIC_API_URL}/api/schedule-cron-job`,
{
method: "POST",
headers,
body: JSON.stringify(data),
}
);
if (!response.ok) {
console.error(response);
throw new Error(`Error: ${response.statusText}`);
}
const result = await response.json();
res.status(200).json(result);
} catch (error) {
console.error("Error scheduling cron job:", error);
res.status(500).json({ error: "Internal Server Error" });
}
} else {
res.setHeader("Allow", ["POST"]);
res.status(405).end(`Method ${req.method} Not Allowed`);
}
}
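
Both cron-job routes expect a POST body with a data envelope, which they forward verbatim to the backend. A hedged sketch of a client call; the inner fields are assumptions based on the CronJob type later in this diff, not confirmed by the handlers:

// Only the { data } envelope is confirmed by the handler above.
await fetch("/api/schedule-cron-job", {
  method: "POST",
  headers: { "content-type": "application/json" },
  body: JSON.stringify({
    data: {
      job_id: "abc123", // hypothetical
      user_email: "user@example.com",
      cron_expression: "0 * * * *", // hourly
    },
  }),
});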

src/pages/cron-jobs.tsx Normal file
View File

@@ -0,0 +1,4 @@
import { CronJobs } from "../components/pages/cron-jobs";
import { getServerSideProps } from "../components/pages/cron-jobs/get-server-side-props";
export { getServerSideProps };
export default CronJobs;

View File

@@ -1,117 +1,10 @@
"use client";
import React, { useState, useEffect, useRef } from "react";
import { Button, Container, Box, Snackbar, Alert } from "@mui/material";
import { useRouter } from "next/router";
import { Element, Result } from "@/types";
import { ElementTable } from "@/components/submit";
import { JobSubmitter } from "@/components/submit/job-submitter";
const Home = () => {
const router = useRouter();
const { elements, url } = router.query;
const [submittedURL, setSubmittedURL] = useState<string>("");
const [rows, setRows] = useState<Element[]>([]);
const [results, setResults] = useState<Result>({});
const [snackbarOpen, setSnackbarOpen] = useState<boolean>(false);
const [snackbarMessage, setSnackbarMessage] = useState<string>("");
const [snackbarSeverity, setSnackbarSeverity] = useState<string>("error");
const [isValidURL, setIsValidUrl] = useState<boolean>(true);
const resultsRef = useRef<HTMLTableElement | null>(null);
useEffect(() => {
if (elements) {
setRows(JSON.parse(elements as string));
}
if (url) {
setSubmittedURL(url as string);
}
}, [elements, url]);
useEffect(() => {
if (results && resultsRef.current) {
resultsRef.current.scrollIntoView({ behavior: "smooth" });
}
}, [results]);
const handleCloseSnackbar = () => {
setSnackbarOpen(false);
};
const ErrorSnackbar = () => {
return (
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={handleCloseSnackbar}
>
<Alert onClose={handleCloseSnackbar} severity="error">
{snackbarMessage}
</Alert>
</Snackbar>
);
};
const NotifySnackbar = () => {
const goTo = () => {
router.push("/jobs");
};
const action = (
<Button color="inherit" size="small" onClick={goTo}>
Go To Job
</Button>
);
return (
<Snackbar
open={snackbarOpen}
autoHideDuration={6000}
onClose={handleCloseSnackbar}
>
<Alert onClose={handleCloseSnackbar} severity="info" action={action}>
{snackbarMessage}
</Alert>
</Snackbar>
);
};
import { Provider as JobSubmitterProvider } from "@/components/submit/job-submitter/provider";
import { Home } from "@/components/pages/home/home";
export default function Main() {
return (
<Box
bgcolor="background.default"
display="flex"
flexDirection="column"
justifyContent="center"
alignItems="center"
height="100%"
py={4}
>
<Container maxWidth="lg">
<JobSubmitter
stateProps={{
submittedURL,
setSubmittedURL,
rows,
isValidURL,
setIsValidUrl,
setSnackbarMessage,
setSnackbarOpen,
setSnackbarSeverity,
}}
/>
{submittedURL.length ? (
<ElementTable
rows={rows}
setRows={setRows}
submittedURL={submittedURL}
/>
) : null}
</Container>
{snackbarSeverity === "info" ? <NotifySnackbar /> : <ErrorSnackbar />}
</Box>
<JobSubmitterProvider>
<Home />
</JobSubmitterProvider>
);
};
export default Home;
}

View File

@@ -1,9 +1,12 @@
import { SiteMap } from "@/types/job";
export const submitJob = async (
submittedURL: string,
rows: any[],
user: any,
jobOptions: any,
customHeaders: any
customHeaders: any,
siteMap: SiteMap | null
) => {
return await fetch(`/api/submit-scrape-job`, {
method: "POST",
@@ -18,6 +21,7 @@ export const submitJob = async (
...jobOptions,
custom_headers: customHeaders || {},
proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
site_map: siteMap,
},
},
}),
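
A sketch of calling the updated service with the new trailing siteMap argument; the user and row shapes are assumptions:

// Illustrative call; passing null for siteMap keeps the old behavior.
const response = await submitJob(
  "https://example.com",
  [{ name: "title", xpath: "//h1" }], // rows (assumed shape)
  { email: "user@example.com" }, // user (assumed shape)
  { multi_page_scrape: false, custom_headers: null, proxies: null },
  null, // customHeaders
  { actions: [{ type: "click", xpath: "//a[@rel='next']", name: "" }] }
);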

View File

@@ -2,6 +2,11 @@
@tailwind components;
@tailwind utilities;
:root {
--delete-red: #ef4444;
--delete-red-hover: #ff6969;
}
#__next {
height: 100%;
}

View File

@@ -34,6 +34,12 @@ const commonThemeOptions = {
h4: {
fontWeight: 500,
},
h5: {
fontWeight: 500,
},
h6: {
fontWeight: 500,
},
body1: {
fontFamily: '"Schibsted Grotesk", sans-serif',
},
@@ -175,6 +181,9 @@ const darkTheme = createTheme({
h5: {
color: "#ffffff",
},
h6: {
color: "#ffffff",
},
body1: {
...commonThemeOptions.typography.body1,
color: "#ffffff",

View File

@@ -16,6 +16,7 @@ export type JobOptions = {
multi_page_scrape: boolean;
custom_headers: null | string;
proxies: string[];
site_map?: SiteMap;
};
export type RawJobOptions = {
@@ -23,3 +24,26 @@ export type RawJobOptions = {
custom_headers: string | null;
proxies: string | null;
};
export type ActionOption = "click" | "input";
export type Action = {
type: ActionOption;
xpath: string;
name: string;
do_once?: boolean;
input?: string;
};
export type SiteMap = {
actions: Action[];
};
export type CronJob = {
id: string;
user_email: string;
job_id: string;
cron_expression: string;
time_created: Date;
time_updated: Date;
};
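
For illustration, a SiteMap literal that type-checks against the definitions above; the xpaths are placeholders:

// One "input" action followed by a "click" that may repeat (do_once: false).
const siteMap: SiteMap = {
  actions: [
    { type: "input", xpath: "//input[@name='q']", name: "", input: "laptops" },
    { type: "click", xpath: "//button[@type='submit']", name: "", do_once: false },
  ],
};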

View File

@@ -109,5 +109,5 @@
"isolatedModules": true
},
"include": ["src", "src/declaration.d.ts", "src/next-auth.d.ts"],
"exclude": ["node_modules"]
"exclude": ["node_modules", "src-tauri"]
}