Mirror of https://github.com/jaypyles/Scraperr.git (synced 2025-11-26 19:13:42 +00:00)

Commit: fix: broken dependencies
@@ -1,52 +1,18 @@
# STL
import os
import uuid
import logging
import traceback
from io import StringIO
from typing import Optional
import csv

# PDM
from fastapi import Depends, FastAPI, HTTPException
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse, StreamingResponse
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

# LOCAL
import docker
from api.backend.job import (
    query,
    insert,
    update_job,
    delete_jobs,
    get_jobs_per_day,
    average_elements_per_link,
)
from api.backend.models import (
    UpdateJobs,
    DownloadJob,
    FetchOptions,
    DeleteScrapeJobs,
    Job,
)
from api.backend.schemas import User
from api.backend.ai.ai_router import ai_router
from api.backend.auth.auth_utils import get_current_user
from api.backend.auth.auth_router import auth_router

client = docker.from_env()


def get_log_level(level_name: Optional[str]) -> int:
    level = logging.INFO

    if level_name:
        level_name = level_name.upper()
        level = getattr(logging, level_name, logging.INFO)

    return level

from api.backend.utils import get_log_level
from api.backend.routers.job_router import job_router
from api.backend.routers.log_router import log_router
from api.backend.routers.stats_router import stats_router

log_level = os.getenv("LOG_LEVEL")
LOG_LEVEL = get_log_level(log_level)
@@ -60,8 +26,6 @@ logging.basicConfig(
LOG = logging.getLogger(__name__)

app = FastAPI(title="api")
app.include_router(auth_router)
app.include_router(ai_router)

app.add_middleware(
    CORSMiddleware,
@@ -72,151 +36,8 @@ app.add_middleware(
)


@app.post("/update")
async def update(update_jobs: UpdateJobs, user: User = Depends(get_current_user)):
    """Used to update jobs"""
    await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)


@app.post("/submit-scrape-job")
async def submit_scrape_job(job: Job):
    LOG.info(f"Received job: {job}")
    try:
        job.id = uuid.uuid4().hex

        job_dict = job.model_dump()
        await insert(job_dict)

        return JSONResponse(content=f"Job queued for scraping: {job.id}")
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=500)


@app.post("/retrieve-scrape-jobs")
async def retrieve_scrape_jobs(
    fetch_options: FetchOptions, user: User = Depends(get_current_user)
):
    LOG.info(f"Retrieving jobs for account: {user.email}")
    try:
        results = await query({"user": user.email}, fetch_options=fetch_options)
        return JSONResponse(content=jsonable_encoder(results[::-1]))
    except Exception as e:
        LOG.error(f"Exception occurred: {e}")
        return JSONResponse(content=[], status_code=500)


@app.get("/job/{id}")
async def job(id: str, user: User = Depends(get_current_user)):
    LOG.info(f"Retrieving jobs for account: {user.email}")
    try:
        filter = {"user": user.email, "id": id}
        results = await query(filter)
        return JSONResponse(content=jsonable_encoder(results))
    except Exception as e:
        LOG.error(f"Exception occurred: {e}")
        return JSONResponse(content={"error": str(e)}, status_code=500)


def clean_text(text: str):
    text = text.replace("\r\n", "\n")  # Normalize newlines
    text = text.replace("\n", "\\n")  # Escape newlines
    text = text.replace('"', '\\"')  # Escape double quotes
    return text


@app.post("/download")
async def download(download_job: DownloadJob):
    LOG.info(f"Downloading job with ids: {download_job.ids}")

    try:
        results = await query({"id": {"$in": download_job.ids}})

        csv_buffer = StringIO()
        csv_writer = csv.writer(csv_buffer)

        headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]
        csv_writer.writerow(headers)

        for result in results:
            for res in result["result"]:
                for url, elements in res.items():
                    for element_name, values in elements.items():
                        for value in values:
                            text = clean_text(value.get("text", ""))
                            csv_writer.writerow(
                                [
                                    result.get("id", ""),
                                    url,
                                    element_name,
                                    value.get("xpath", ""),
                                    text,
                                    result.get("user", ""),
                                    result.get("time_created", ""),
                                ]
                            )

        _ = csv_buffer.seek(0)
        response = StreamingResponse(
            csv_buffer,
            media_type="text/csv",
        )
        response.headers["Content-Disposition"] = "attachment; filename=export.csv"
        return response

    except Exception as e:
        LOG.error(f"Exception occurred: {e}")
        traceback.print_exc()
        return {"error": str(e)}


@app.post("/delete-scrape-jobs")
async def delete(delete_scrape_jobs: DeleteScrapeJobs):
    result = await delete_jobs(delete_scrape_jobs.ids)
    return (
        JSONResponse(content={"message": "Jobs successfully deleted."})
        if result
        else JSONResponse({"error": "Jobs not deleted."})
    )


@app.get("/initial_logs")
async def get_initial_logs():
    container_id = "scraperr_api"

    try:
        container = client.containers.get(container_id)
        log_stream = container.logs(stream=False).decode("utf-8")
        return JSONResponse(content={"logs": log_stream})
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")


@app.get("/logs")
async def get_own_logs():
    container_id = "scraperr_api"

    try:
        container = client.containers.get(container_id)
        log_stream = container.logs(stream=True, follow=True)

        def log_generator():
            try:
                for log in log_stream:
                    yield f"data: {log.decode('utf-8')}\n\n"
            except Exception as e:
                yield f"data: {str(e)}\n\n"

        return StreamingResponse(log_generator(), media_type="text/event-stream")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/statistics/get-average-element-per-link")
async def get_average_element_per_link(user: User = Depends(get_current_user)):
    return await average_elements_per_link(user.email)


@app.get("/statistics/get-average-jobs-per-day")
async def average_jobs_per_day(user: User = Depends(get_current_user)):
    data = await get_jobs_per_day(user.email)
    return data
app.include_router(auth_router)
app.include_router(ai_router)
app.include_router(job_router)
app.include_router(log_router)
app.include_router(stats_router)
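The hunks above strip the inline route handlers out of the application module and register the new routers instead. For context, the general FastAPI pattern being adopted looks roughly like the sketch below; the names are illustrative only and not taken from the repository.

# Minimal sketch of the APIRouter pattern (illustrative names, not project code).
from fastapi import APIRouter, FastAPI

example_router = APIRouter()


@example_router.get("/ping")
async def ping():
    # Routes are declared on the router, not on the app itself.
    return {"status": "ok"}


app = FastAPI(title="api")
app.include_router(example_router)  # mounts every route declared on the router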
api/backend/routers/job_router.py (new file, 133 lines)
@@ -0,0 +1,133 @@
# STL
import uuid
import traceback
from io import StringIO
import csv
import logging

# PDM
from fastapi import Depends, APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse, StreamingResponse

# LOCAL
from api.backend.job import (
    query,
    insert,
    update_job,
    delete_jobs,
)
from api.backend.models import (
    UpdateJobs,
    DownloadJob,
    FetchOptions,
    DeleteScrapeJobs,
    Job,
)
from api.backend.schemas import User
from api.backend.auth.auth_utils import get_current_user
from api.backend.utils import clean_text

LOG = logging.getLogger(__name__)

job_router = APIRouter()


@job_router.post("/update")
async def update(update_jobs: UpdateJobs, _: User = Depends(get_current_user)):
    """Used to update jobs"""
    await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)


@job_router.post("/submit-scrape-job")
async def submit_scrape_job(job: Job):
    LOG.info(f"Received job: {job}")
    try:
        job.id = uuid.uuid4().hex

        job_dict = job.model_dump()
        await insert(job_dict)

        return JSONResponse(content=f"Job queued for scraping: {job.id}")
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=500)


@job_router.post("/retrieve-scrape-jobs")
async def retrieve_scrape_jobs(
    fetch_options: FetchOptions, user: User = Depends(get_current_user)
):
    LOG.info(f"Retrieving jobs for account: {user.email}")
    try:
        results = await query({"user": user.email}, fetch_options=fetch_options)
        return JSONResponse(content=jsonable_encoder(results[::-1]))
    except Exception as e:
        LOG.error(f"Exception occurred: {e}")
        return JSONResponse(content=[], status_code=500)


@job_router.get("/job/{id}")
async def job(id: str, user: User = Depends(get_current_user)):
    LOG.info(f"Retrieving jobs for account: {user.email}")
    try:
        filter = {"user": user.email, "id": id}
        results = await query(filter)
        return JSONResponse(content=jsonable_encoder(results))
    except Exception as e:
        LOG.error(f"Exception occurred: {e}")
        return JSONResponse(content={"error": str(e)}, status_code=500)


@job_router.post("/download")
async def download(download_job: DownloadJob):
    LOG.info(f"Downloading job with ids: {download_job.ids}")

    try:
        results = await query({"id": {"$in": download_job.ids}})

        csv_buffer = StringIO()
        csv_writer = csv.writer(csv_buffer)

        headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]
        csv_writer.writerow(headers)

        for result in results:
            for res in result["result"]:
                for url, elements in res.items():
                    for element_name, values in elements.items():
                        for value in values:
                            text = clean_text(value.get("text", ""))
                            csv_writer.writerow(
                                [
                                    result.get("id", ""),
                                    url,
                                    element_name,
                                    value.get("xpath", ""),
                                    text,
                                    result.get("user", ""),
                                    result.get("time_created", ""),
                                ]
                            )

        _ = csv_buffer.seek(0)
        response = StreamingResponse(
            csv_buffer,
            media_type="text/csv",
        )
        response.headers["Content-Disposition"] = "attachment; filename=export.csv"
        return response

    except Exception as e:
        LOG.error(f"Exception occurred: {e}")
        traceback.print_exc()
        return {"error": str(e)}


@job_router.post("/delete-scrape-jobs")
async def delete(delete_scrape_jobs: DeleteScrapeJobs):
    result = await delete_jobs(delete_scrape_jobs.ids)
    return (
        JSONResponse(content={"message": "Jobs successfully deleted."})
        if result
        else JSONResponse({"error": "Jobs not deleted."})
    )
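The nested loops in the /download handler above imply that each stored job document carries a "result" field shaped as a list of url -> element_name -> matches mappings. A hypothetical document of that shape, with every value invented for illustration, would look like:

# Hypothetical job document illustrating the shape the CSV export loops expect;
# all values here are made up.
example_job = {
    "id": "8f14e45fceea167a5a36dedd4bea2543",
    "user": "user@example.com",
    "time_created": "2024-01-01T00:00:00Z",
    "result": [
        {
            "https://example.com": {
                "title": [
                    {"xpath": "//h1", "text": "Example Domain"},
                ],
            },
        },
    ],
}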
api/backend/routers/log_router.py (new file, 46 lines)
@@ -0,0 +1,46 @@
# STL
import logging
import docker

# PDM
from fastapi import APIRouter, HTTPException
from fastapi.responses import JSONResponse, StreamingResponse


LOG = logging.getLogger(__name__)

log_router = APIRouter()

client = docker.from_env()


@log_router.get("/initial_logs")
async def get_initial_logs():
    container_id = "scraperr_api"

    try:
        container = client.containers.get(container_id)
        log_stream = container.logs(stream=False).decode("utf-8")
        return JSONResponse(content={"logs": log_stream})
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")


@log_router.get("/logs")
async def get_own_logs():
    container_id = "scraperr_api"

    try:
        container = client.containers.get(container_id)
        log_stream = container.logs(stream=True, follow=True)

        def log_generator():
            try:
                for log in log_stream:
                    yield f"data: {log.decode('utf-8')}\n\n"
            except Exception as e:
                yield f"data: {str(e)}\n\n"

        return StreamingResponse(log_generator(), media_type="text/event-stream")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
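The /logs endpoint above streams container output as server-sent events. A minimal client sketch, assuming the API is reachable at http://localhost:8000 and that the requests package is available:

# Minimal SSE consumer for the /logs endpoint (host/port are assumptions).
import requests

with requests.get("http://localhost:8000/logs", stream=True) as response:
    response.raise_for_status()
    for line in response.iter_lines(decode_unicode=True):
        # Each event arrives as a "data: <payload>" line followed by a blank line.
        if line.startswith("data: "):
            print(line[len("data: "):])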
api/backend/routers/stats_router.py (new file, 29 lines)
@@ -0,0 +1,29 @@
# STL
import logging

# PDM
from fastapi import APIRouter, Depends

# LOCAL
from api.backend.job import (
    get_jobs_per_day,
    average_elements_per_link,
)
from api.backend.auth.auth_utils import get_current_user
from api.backend.schemas import User


LOG = logging.getLogger(__name__)

stats_router = APIRouter()


@stats_router.get("/statistics/get-average-element-per-link")
async def get_average_element_per_link(user: User = Depends(get_current_user)):
    return await average_elements_per_link(user.email)


@stats_router.get("/statistics/get-average-jobs-per-day")
async def average_jobs_per_day(user: User = Depends(get_current_user)):
    data = await get_jobs_per_day(user.email)
    return data
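Both statistics routes take the current user via Depends(get_current_user); in a test, that dependency can be overridden rather than going through real authentication. A rough sketch, assuming the User schema can be built from an email alone and that the underlying job queries are reachable:

# Rough test sketch; the stub User construction is an assumption about the schema.
from fastapi import FastAPI
from fastapi.testclient import TestClient

from api.backend.auth.auth_utils import get_current_user
from api.backend.routers.stats_router import stats_router
from api.backend.schemas import User

app = FastAPI()
app.include_router(stats_router)
app.dependency_overrides[get_current_user] = lambda: User(email="test@example.com")

client = TestClient(app)
response = client.get("/statistics/get-average-jobs-per-day")
print(response.status_code, response.json())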
api/backend/utils.py (new file, 19 lines)
@@ -0,0 +1,19 @@
from typing import Optional
import logging


def clean_text(text: str):
    text = text.replace("\r\n", "\n")  # Normalize newlines
    text = text.replace("\n", "\\n")  # Escape newlines
    text = text.replace('"', '\\"')  # Escape double quotes
    return text


def get_log_level(level_name: Optional[str]) -> int:
    level = logging.INFO

    if level_name:
        level_name = level_name.upper()
        level = getattr(logging, level_name, logging.INFO)

    return level
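A quick illustration of how these helpers behave; the calls below are examples, not taken from the repository, and assume api.backend.utils is importable.

# Example behavior of the helpers above.
import logging

from api.backend.utils import clean_text, get_log_level

assert get_log_level("debug") == logging.DEBUG        # case-insensitive lookup
assert get_log_level("not-a-level") == logging.INFO   # unknown names fall back to INFO
assert get_log_level(None) == logging.INFO            # unset LOG_LEVEL falls back to INFO

# Newlines and double quotes are escaped so each CSV cell stays on one line.
assert clean_text('line one\r\nline "two"') == 'line one\\nline \\"two\\"'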
pdm.lock (generated, 41 changed lines)
@@ -5,7 +5,7 @@
groups = ["default", "dev"]
strategy = []
lock_version = "4.5.0"
content_hash = "sha256:50e60db0b9c7d55c330310b64871570dd5494665b67a06138d8a78c2df377932"
content_hash = "sha256:8773474a2ac9829b371599e669215d10a50c1b1ce880e9de518c91019fabaadf"

[[metadata.targets]]
requires_python = "==3.10.12"
@@ -445,6 +445,20 @@ files = [
    {file = "ecdsa-0.19.0.tar.gz", hash = "sha256:60eaad1199659900dd0af521ed462b793bbdf867432b3948e87416ae4caf6bf8"},
]

[[package]]
name = "email-validator"
version = "2.2.0"
requires_python = ">=3.8"
summary = "A robust email address syntax and deliverability validation library."
dependencies = [
    "dnspython>=2.0.0",
    "idna>=2.0.0",
]
files = [
    {file = "email_validator-2.2.0-py3-none-any.whl", hash = "sha256:561977c2d73ce3611850a06fa56b414621e0c8faa9d66f2611407d87465da631"},
    {file = "email_validator-2.2.0.tar.gz", hash = "sha256:cb690f344c617a714f22e66ae771445a1ceb46821152df8e165c5f9a364582b7"},
]

[[package]]
name = "et-xmlfile"
version = "1.1.0"
@@ -1198,6 +1212,21 @@ files = [
    {file = "pydantic_core-2.23.4.tar.gz", hash = "sha256:2584f7cf844ac4d970fba483a717dbe10c1c1c96a969bf65d61ffe94df1b2863"},
]

[[package]]
name = "pydantic"
version = "2.9.2"
extras = ["email"]
requires_python = ">=3.8"
summary = "Data validation using Python type hints"
dependencies = [
    "email-validator>=2.0.0",
    "pydantic==2.9.2",
]
files = [
    {file = "pydantic-2.9.2-py3-none-any.whl", hash = "sha256:f048cec7b26778210e28a0459867920654d48e5e62db0958433636cde4254f12"},
    {file = "pydantic-2.9.2.tar.gz", hash = "sha256:d155cef71265d1e9807ed1c32b4c8deec042a44a50a4188b25ac67ecd81a9c0f"},
]

[[package]]
name = "pyee"
version = "12.0.0"
@@ -1396,6 +1425,16 @@ files = [
    {file = "python_keycloak-4.6.2.tar.gz", hash = "sha256:184462526fc512546b9b5d2c584b0cdf7539f9ba2854a5b08b7727458c2569f9"},
]

[[package]]
name = "python-multipart"
version = "0.0.12"
requires_python = ">=3.8"
summary = "A streaming multipart parser for Python"
files = [
    {file = "python_multipart-0.0.12-py3-none-any.whl", hash = "sha256:43dcf96cf65888a9cd3423544dd0d75ac10f7aa0c3c28a175bbcd00c9ce1aebf"},
    {file = "python_multipart-0.0.12.tar.gz", hash = "sha256:045e1f98d719c1ce085ed7f7e1ef9d8ccc8c02ba02b5566d5f7521410ced58cb"},
]

[[package]]
name = "pytz"
version = "2024.2"
@@ -20,7 +20,7 @@ dependencies = [
    "requests-html>=0.10.0",
    "selenium>=4.22.0",
    "webdriver-manager>=4.0.1",
    "pydantic>=2.8.2",
    "pydantic[email]>=2.9.2",
    "pandas>=2.2.2",
    "openpyxl>=3.1.5",
    "xlsxwriter>=3.2.0",
@@ -39,6 +39,7 @@ dependencies = [
    "exceptiongroup>=1.2.2",
    "Faker>=30.6.0",
    "pytest-asyncio>=0.24.0",
    "python-multipart>=0.0.12",
]
requires-python = ">=3.10"
readme = "README.md"
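The last two hunks (apparently the project's dependency tables, though the file header was lost in this view) swap plain pydantic for pydantic[email] and add python-multipart. The email extra pulls in email-validator, which is what lets models declare EmailStr fields; a minimal sketch, not the project's actual User schema:

# Minimal sketch of what the pydantic[email] extra enables; not the project's schema.
from pydantic import BaseModel, EmailStr


class Account(BaseModel):
    email: EmailStr  # validated by the email-validator package


Account(email="alice@example.com")   # passes validation
# Account(email="not-an-email")      # would raise a ValidationError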