fix: broken dependencies

Jayden Pyles
2024-10-21 18:08:15 -05:00
parent b254d10c5d
commit db774e9892
7 changed files with 279 additions and 191 deletions


@@ -1,52 +1,18 @@
# STL
import os
import uuid
import logging
import traceback
from io import StringIO
from typing import Optional
import csv
# PDM
from fastapi import Depends, FastAPI, HTTPException
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse, StreamingResponse
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
# LOCAL
import docker
from api.backend.job import (
    query,
    insert,
    update_job,
    delete_jobs,
    get_jobs_per_day,
    average_elements_per_link,
)
from api.backend.models import (
    UpdateJobs,
    DownloadJob,
    FetchOptions,
    DeleteScrapeJobs,
    Job,
)
from api.backend.schemas import User
from api.backend.ai.ai_router import ai_router
from api.backend.auth.auth_utils import get_current_user
from api.backend.auth.auth_router import auth_router
client = docker.from_env()
def get_log_level(level_name: Optional[str]) -> int:
    level = logging.INFO
    if level_name:
        level_name = level_name.upper()
        level = getattr(logging, level_name, logging.INFO)
    return level
from api.backend.utils import get_log_level
from api.backend.routers.job_router import job_router
from api.backend.routers.log_router import log_router
from api.backend.routers.stats_router import stats_router
log_level = os.getenv("LOG_LEVEL")
LOG_LEVEL = get_log_level(log_level)
@@ -60,8 +26,6 @@ logging.basicConfig(
LOG = logging.getLogger(__name__)
app = FastAPI(title="api")
app.include_router(auth_router)
app.include_router(ai_router)
app.add_middleware(
    CORSMiddleware,
@@ -72,151 +36,8 @@ app.add_middleware(
)
@app.post("/update")
async def update(update_jobs: UpdateJobs, user: User = Depends(get_current_user)):
"""Used to update jobs"""
await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)
@app.post("/submit-scrape-job")
async def submit_scrape_job(job: Job):
LOG.info(f"Recieved job: {job}")
try:
job.id = uuid.uuid4().hex
job_dict = job.model_dump()
await insert(job_dict)
return JSONResponse(content=f"Job queued for scraping: {job.id}")
except Exception as e:
return JSONResponse(content={"error": str(e)}, status_code=500)
@app.post("/retrieve-scrape-jobs")
async def retrieve_scrape_jobs(
fetch_options: FetchOptions, user: User = Depends(get_current_user)
):
LOG.info(f"Retrieving jobs for account: {user.email}")
try:
results = await query({"user": user.email}, fetch_options=fetch_options)
return JSONResponse(content=jsonable_encoder(results[::-1]))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
return JSONResponse(content=[], status_code=500)
@app.get("/job/{id}")
async def job(id: str, user: User = Depends(get_current_user)):
LOG.info(f"Retrieving jobs for account: {user.email}")
try:
filter = {"user": user.email, "id": id}
results = await query(filter)
return JSONResponse(content=jsonable_encoder(results))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
return JSONResponse(content={"error": str(e)}, status_code=500)
def clean_text(text: str):
text = text.replace("\r\n", "\n") # Normalize newlines
text = text.replace("\n", "\\n") # Escape newlines
text = text.replace('"', '\\"') # Escape double quotes
return text
@app.post("/download")
async def download(download_job: DownloadJob):
LOG.info(f"Downloading job with ids: {download_job.ids}")
try:
results = await query({"id": {"$in": download_job.ids}})
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer)
headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]
csv_writer.writerow(headers)
for result in results:
for res in result["result"]:
for url, elements in res.items():
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", ""))
csv_writer.writerow(
[
result.get("id", ""),
url,
element_name,
value.get("xpath", ""),
text,
result.get("user", ""),
result.get("time_created", ""),
]
)
_ = csv_buffer.seek(0)
response = StreamingResponse(
csv_buffer,
media_type="text/csv",
)
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response
except Exception as e:
LOG.error(f"Exception occurred: {e}")
traceback.print_exc()
return {"error": str(e)}
@app.post("/delete-scrape-jobs")
async def delete(delete_scrape_jobs: DeleteScrapeJobs):
result = await delete_jobs(delete_scrape_jobs.ids)
return (
JSONResponse(content={"message": "Jobs successfully deleted."})
if result
else JSONResponse({"error": "Jobs not deleted."})
)
@app.get("/initial_logs")
async def get_initial_logs():
container_id = "scraperr_api"
try:
container = client.containers.get(container_id)
log_stream = container.logs(stream=False).decode("utf-8")
return JSONResponse(content={"logs": log_stream})
except Exception as e:
raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
@app.get("/logs")
async def get_own_logs():
container_id = "scraperr_api"
try:
container = client.containers.get(container_id)
log_stream = container.logs(stream=True, follow=True)
def log_generator():
try:
for log in log_stream:
yield f"data: {log.decode('utf-8')}\n\n"
except Exception as e:
yield f"data: {str(e)}\n\n"
return StreamingResponse(log_generator(), media_type="text/event-stream")
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.get("/statistics/get-average-element-per-link")
async def get_average_element_per_link(user: User = Depends(get_current_user)):
return await average_elements_per_link(user.email)
@app.get("/statistics/get-average-jobs-per-day")
async def average_jobs_per_day(user: User = Depends(get_current_user)):
data = await get_jobs_per_day(user.email)
return data
app.include_router(auth_router)
app.include_router(ai_router)
app.include_router(job_router)
app.include_router(log_router)
app.include_router(stats_router)
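
After this refactor the application module is reduced to wiring. A rough reconstruction of the resulting file, pieced together from the retained lines above, is shown below; the logging.basicConfig arguments and the CORS settings are elided by the hunks, so those values, like the exact module path, are assumptions.

# STL
import os
import logging

# PDM
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

# LOCAL
from api.backend.ai.ai_router import ai_router
from api.backend.auth.auth_router import auth_router
from api.backend.utils import get_log_level
from api.backend.routers.job_router import job_router
from api.backend.routers.log_router import log_router
from api.backend.routers.stats_router import stats_router

log_level = os.getenv("LOG_LEVEL")
LOG_LEVEL = get_log_level(log_level)

logging.basicConfig(level=LOG_LEVEL)  # actual format/handler arguments are elided by the hunk
LOG = logging.getLogger(__name__)

app = FastAPI(title="api")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # actual CORS settings are elided by the hunk
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

app.include_router(auth_router)
app.include_router(ai_router)
app.include_router(job_router)
app.include_router(log_router)
app.include_router(stats_router)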

api/backend/routers/job_router.py Normal file

@@ -0,0 +1,133 @@
# STL
import uuid
import traceback
from io import StringIO
import csv
import logging
# PDM
from fastapi import Depends, APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse, StreamingResponse
# LOCAL
from api.backend.job import (
    query,
    insert,
    update_job,
    delete_jobs,
)
from api.backend.models import (
    UpdateJobs,
    DownloadJob,
    FetchOptions,
    DeleteScrapeJobs,
    Job,
)
from api.backend.schemas import User
from api.backend.auth.auth_utils import get_current_user
from api.backend.utils import clean_text
LOG = logging.getLogger(__name__)
job_router = APIRouter()
@job_router.post("/update")
async def update(update_jobs: UpdateJobs, _: User = Depends(get_current_user)):
"""Used to update jobs"""
await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)
@job_router.post("/submit-scrape-job")
async def submit_scrape_job(job: Job):
LOG.info(f"Recieved job: {job}")
try:
job.id = uuid.uuid4().hex
job_dict = job.model_dump()
await insert(job_dict)
return JSONResponse(content=f"Job queued for scraping: {job.id}")
except Exception as e:
return JSONResponse(content={"error": str(e)}, status_code=500)
@job_router.post("/retrieve-scrape-jobs")
async def retrieve_scrape_jobs(
fetch_options: FetchOptions, user: User = Depends(get_current_user)
):
LOG.info(f"Retrieving jobs for account: {user.email}")
try:
results = await query({"user": user.email}, fetch_options=fetch_options)
return JSONResponse(content=jsonable_encoder(results[::-1]))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
return JSONResponse(content=[], status_code=500)
@job_router.get("/job/{id}")
async def job(id: str, user: User = Depends(get_current_user)):
LOG.info(f"Retrieving jobs for account: {user.email}")
try:
filter = {"user": user.email, "id": id}
results = await query(filter)
return JSONResponse(content=jsonable_encoder(results))
except Exception as e:
LOG.error(f"Exception occurred: {e}")
return JSONResponse(content={"error": str(e)}, status_code=500)
@job_router.post("/download")
async def download(download_job: DownloadJob):
LOG.info(f"Downloading job with ids: {download_job.ids}")
try:
results = await query({"id": {"$in": download_job.ids}})
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer)
headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]
csv_writer.writerow(headers)
for result in results:
for res in result["result"]:
for url, elements in res.items():
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", ""))
csv_writer.writerow(
[
result.get("id", ""),
url,
element_name,
value.get("xpath", ""),
text,
result.get("user", ""),
result.get("time_created", ""),
]
)
_ = csv_buffer.seek(0)
response = StreamingResponse(
csv_buffer,
media_type="text/csv",
)
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response
except Exception as e:
LOG.error(f"Exception occurred: {e}")
traceback.print_exc()
return {"error": str(e)}
@job_router.post("/delete-scrape-jobs")
async def delete(delete_scrape_jobs: DeleteScrapeJobs):
result = await delete_jobs(delete_scrape_jobs.ids)
return (
JSONResponse(content={"message": "Jobs successfully deleted."})
if result
else JSONResponse({"error": "Jobs not deleted."})
)
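
The /download handler flattens a deeply nested result document into one CSV row per matched element. Below is a standalone sketch of that flattening; the shape of result["result"] is inferred from the loop above, and the sample values are invented.

# Standalone sketch of the flattening performed by the /download handler.
# The nested shape is inferred from the loop; the sample data is made up.
import csv
from io import StringIO

results = [
    {
        "id": "abc123",
        "user": "user@example.com",
        "time_created": "2024-10-21T18:08:15",
        "result": [
            {
                "https://example.com": {
                    "title": [{"xpath": "//h1", "text": "Example Domain"}],
                    "links": [
                        {"xpath": "//a[1]", "text": "More information"},
                        {"xpath": "//a[2]", "text": "Contact"},
                    ],
                }
            }
        ],
    }
]

buffer = StringIO()
writer = csv.writer(buffer)
writer.writerow(["id", "url", "element_name", "xpath", "text", "user", "time_created"])

for result in results:
    for res in result["result"]:  # one mapping of url -> elements per scrape pass
        for url, elements in res.items():
            for element_name, values in elements.items():  # element_name -> list of matches
                for value in values:
                    writer.writerow(
                        [
                            result["id"],
                            url,
                            element_name,
                            value.get("xpath", ""),
                            value.get("text", ""),
                            result["user"],
                            result["time_created"],
                        ]
                    )

print(buffer.getvalue())
# abc123,https://example.com,title,//h1,Example Domain,user@example.com,2024-10-21T18:08:15
# ... one row per matched element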

api/backend/routers/log_router.py Normal file

@@ -0,0 +1,46 @@
# STL
import logging

# PDM
import docker
from fastapi import APIRouter, HTTPException
from fastapi.responses import JSONResponse, StreamingResponse
LOG = logging.getLogger(__name__)
log_router = APIRouter()
client = docker.from_env()
@log_router.get("/initial_logs")
async def get_initial_logs():
container_id = "scraperr_api"
try:
container = client.containers.get(container_id)
log_stream = container.logs(stream=False).decode("utf-8")
return JSONResponse(content={"logs": log_stream})
except Exception as e:
raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
@log_router.get("/logs")
async def get_own_logs():
container_id = "scraperr_api"
try:
container = client.containers.get(container_id)
log_stream = container.logs(stream=True, follow=True)
def log_generator():
try:
for log in log_stream:
yield f"data: {log.decode('utf-8')}\n\n"
except Exception as e:
yield f"data: {str(e)}\n\n"
return StreamingResponse(log_generator(), media_type="text/event-stream")
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
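
Because /logs returns a text/event-stream that keeps following the container logs, a client has to read it incrementally rather than wait for the response body to finish. A minimal sketch of tailing it with requests; the host and port are assumptions, not part of this diff.

# Sketch: tail the /logs SSE endpoint from a client (host/port assumed).
import requests

with requests.get("http://localhost:8000/logs", stream=True, timeout=None) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if line and line.startswith("data: "):
            print(line[len("data: "):])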

api/backend/routers/stats_router.py Normal file

@@ -0,0 +1,29 @@
# STL
import logging
# PDM
from fastapi import APIRouter, Depends
# LOCAL
from api.backend.job import (
    get_jobs_per_day,
    average_elements_per_link,
)
from api.backend.auth.auth_utils import get_current_user
from api.backend.schemas import User
LOG = logging.getLogger(__name__)
stats_router = APIRouter()
@stats_router.get("/statistics/get-average-element-per-link")
async def get_average_element_per_link(user: User = Depends(get_current_user)):
return await average_elements_per_link(user.email)
@stats_router.get("/statistics/get-average-jobs-per-day")
async def average_jobs_per_day(user: User = Depends(get_current_user)):
data = await get_jobs_per_day(user.email)
return data
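
Both endpoints rely on Depends(get_current_user), so the usual way to exercise them outside the UI is to override that dependency in a test client. A sketch follows; the app import path and the User constructor fields are assumptions, and a reachable database is assumed for the underlying queries.

# Sketch: call the stats endpoints with the auth dependency overridden.
from fastapi.testclient import TestClient

from api.backend.app import app  # assumed module path
from api.backend.auth.auth_utils import get_current_user
from api.backend.schemas import User

app.dependency_overrides[get_current_user] = lambda: User(email="test@example.com")

client = TestClient(app)
print(client.get("/statistics/get-average-jobs-per-day").json())
print(client.get("/statistics/get-average-element-per-link").json())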

api/backend/utils.py Normal file

@@ -0,0 +1,19 @@
from typing import Optional
import logging
def clean_text(text: str):
    text = text.replace("\r\n", "\n")  # Normalize newlines
    text = text.replace("\n", "\\n")  # Escape newlines
    text = text.replace('"', '\\"')  # Escape double quotes
    return text

def get_log_level(level_name: Optional[str]) -> int:
    level = logging.INFO
    if level_name:
        level_name = level_name.upper()
        level = getattr(logging, level_name, logging.INFO)
    return level
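
A few quick examples of what these helpers return (a sketch; the imports use the new module path above):

import logging

from api.backend.utils import clean_text, get_log_level

# clean_text escapes newlines and double quotes so a value stays on one CSV line
assert clean_text('say "hi"\r\nbye') == 'say \\"hi\\"\\nbye'

# get_log_level is case-insensitive and falls back to INFO
assert get_log_level("debug") == logging.DEBUG
assert get_log_level(None) == logging.INFO
assert get_log_level("not-a-level") == logging.INFO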

pdm.lock generated

@@ -5,7 +5,7 @@
groups = ["default", "dev"]
strategy = []
lock_version = "4.5.0"
content_hash = "sha256:50e60db0b9c7d55c330310b64871570dd5494665b67a06138d8a78c2df377932"
content_hash = "sha256:8773474a2ac9829b371599e669215d10a50c1b1ce880e9de518c91019fabaadf"
[[metadata.targets]]
requires_python = "==3.10.12"
@@ -445,6 +445,20 @@ files = [
{file = "ecdsa-0.19.0.tar.gz", hash = "sha256:60eaad1199659900dd0af521ed462b793bbdf867432b3948e87416ae4caf6bf8"},
]
[[package]]
name = "email-validator"
version = "2.2.0"
requires_python = ">=3.8"
summary = "A robust email address syntax and deliverability validation library."
dependencies = [
"dnspython>=2.0.0",
"idna>=2.0.0",
]
files = [
{file = "email_validator-2.2.0-py3-none-any.whl", hash = "sha256:561977c2d73ce3611850a06fa56b414621e0c8faa9d66f2611407d87465da631"},
{file = "email_validator-2.2.0.tar.gz", hash = "sha256:cb690f344c617a714f22e66ae771445a1ceb46821152df8e165c5f9a364582b7"},
]
[[package]]
name = "et-xmlfile"
version = "1.1.0"
@@ -1198,6 +1212,21 @@ files = [
{file = "pydantic_core-2.23.4.tar.gz", hash = "sha256:2584f7cf844ac4d970fba483a717dbe10c1c1c96a969bf65d61ffe94df1b2863"},
]
[[package]]
name = "pydantic"
version = "2.9.2"
extras = ["email"]
requires_python = ">=3.8"
summary = "Data validation using Python type hints"
dependencies = [
"email-validator>=2.0.0",
"pydantic==2.9.2",
]
files = [
{file = "pydantic-2.9.2-py3-none-any.whl", hash = "sha256:f048cec7b26778210e28a0459867920654d48e5e62db0958433636cde4254f12"},
{file = "pydantic-2.9.2.tar.gz", hash = "sha256:d155cef71265d1e9807ed1c32b4c8deec042a44a50a4188b25ac67ecd81a9c0f"},
]
[[package]]
name = "pyee"
version = "12.0.0"
@@ -1396,6 +1425,16 @@ files = [
{file = "python_keycloak-4.6.2.tar.gz", hash = "sha256:184462526fc512546b9b5d2c584b0cdf7539f9ba2854a5b08b7727458c2569f9"},
]
[[package]]
name = "python-multipart"
version = "0.0.12"
requires_python = ">=3.8"
summary = "A streaming multipart parser for Python"
files = [
{file = "python_multipart-0.0.12-py3-none-any.whl", hash = "sha256:43dcf96cf65888a9cd3423544dd0d75ac10f7aa0c3c28a175bbcd00c9ce1aebf"},
{file = "python_multipart-0.0.12.tar.gz", hash = "sha256:045e1f98d719c1ce085ed7f7e1ef9d8ccc8c02ba02b5566d5f7521410ced58cb"},
]
[[package]]
name = "pytz"
version = "2024.2"

pyproject.toml

@@ -20,7 +20,7 @@ dependencies = [
"requests-html>=0.10.0",
"selenium>=4.22.0",
"webdriver-manager>=4.0.1",
"pydantic>=2.8.2",
"pydantic[email]>=2.9.2",
"pandas>=2.2.2",
"openpyxl>=3.1.5",
"xlsxwriter>=3.2.0",
@@ -39,6 +39,7 @@ dependencies = [
"exceptiongroup>=1.2.2",
"Faker>=30.6.0",
"pytest-asyncio>=0.24.0",
"python-multipart>=0.0.12",
]
requires-python = ">=3.10"
readme = "README.md"
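
These last two dependency changes are what the commit title points at: pydantic only validates EmailStr fields when the optional email-validator package is installed, which the pydantic[email] extra pulls in, and FastAPI needs python-multipart to parse form submissions such as OAuth2 password login forms. The User schema itself is not part of this diff, so the snippet below is an assumption about how the extra gets exercised.

# Sketch: a model with an EmailStr field, as the User schema presumably declares.
# Without email-validator (installed via pydantic[email]), defining or validating
# this model raises an ImportError telling you to install pydantic[email].
from pydantic import BaseModel, EmailStr

class User(BaseModel):
    email: EmailStr

print(User(email="jane@example.com"))  # email='jane@example.com'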