# STL
import uuid
import logging
from io import BytesIO
from openpyxl import Workbook
from typing import Any
from datetime import datetime
from bson import ObjectId

# PDM
from fastapi import BackgroundTasks, FastAPI, HTTPException
from fastapi.encoders import jsonable_encoder
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
import docker

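# Docker client used by the /api/logs endpoint to stream this container's own logs.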
client = docker.from_env()

# LOCAL
from api.backend.job import (
    average_elements_per_link,
    get_jobs_per_day,
    query,
    insert,
    delete_jobs,
)
from api.backend.models import (
    DownloadJob,
    GetStatistics,
    SubmitScrapeJob,
    DeleteScrapeJobs,
    RetrieveScrapeJobs,
)
from api.backend.auth.auth_router import auth_router
import traceback

logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
    handlers=[logging.StreamHandler()],
)

LOG = logging.getLogger(__name__)

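# FastAPI app; auth routes are provided by api.backend.auth.auth_router.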
app = FastAPI(title="api")
app.include_router(auth_router)

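# Wide-open CORS so the bundled frontend (or any client) can reach the API.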
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

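# Serve the frontend's Next.js static assets from the built ./dist bundle.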
app.mount("/_next/static", StaticFiles(directory="./dist/_next/static"), name="static")


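# Frontend entry point and favicon, also served from ./dist.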
@app.get("/")
|
|
def read_root():
|
|
return FileResponse("./dist/index.html")
|
|
|
|
|
|
@app.get("/favicon.ico")
|
|
def read_favicon():
|
|
return FileResponse("dist/favicon.ico")
|
|
|
|
|
|
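# Accept a scrape job, assign it an id, and persist it when a user is attached.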
@app.post("/api/submit-scrape-job")
|
|
async def submit_scrape_job(job: SubmitScrapeJob, background_tasks: BackgroundTasks):
|
|
LOG.info(f"Recieved job: {job}")
|
|
try:
|
|
job.id = uuid.uuid4().hex
|
|
|
|
if job.user:
|
|
job_dict = job.model_dump()
|
|
await insert(job_dict)
|
|
|
|
return JSONResponse(content=f"Job queued for scraping: {job.id}")
|
|
except Exception as e:
|
|
return JSONResponse(content={"error": str(e)}, status_code=500)
|
|
|
|
|
|
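# Fetch all jobs for the given user and return them in reverse query order.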
@app.post("/api/retrieve-scrape-jobs")
|
|
async def retrieve_scrape_jobs(retrieve: RetrieveScrapeJobs):
|
|
LOG.info(f"Retrieving jobs for account: {retrieve.user}")
|
|
try:
|
|
results = await query({"user": retrieve.user})
|
|
return JSONResponse(content=jsonable_encoder(results[::-1]))
|
|
except Exception as e:
|
|
LOG.error(f"Exception occurred: {e}")
|
|
return JSONResponse(content={"error": str(e)}, status_code=500)
|
|
|
|
|
|
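# Escape newlines and double quotes in scraped text before writing it to the spreadsheet.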
def clean_text(text: str):
    text = text.replace("\r\n", "\n")  # Normalize newlines
    text = text.replace("\n", "\\n")  # Escape newlines
    text = text.replace('"', '\\"')  # Escape double quotes
    return text


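# Flatten the selected jobs' results and return them as an Excel (.xlsx) download.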
@app.post("/api/download")
|
|
async def download(download_job: DownloadJob):
|
|
LOG.info(f"Downloading job with ids: {download_job.ids}")
|
|
try:
|
|
results = await query({"id": {"$in": download_job.ids}})
|
|
|
|
flattened_results = []
|
|
for result in results:
|
|
for res in result["result"]:
|
|
for url, elements in res.items():
|
|
for element_name, values in elements.items():
|
|
for value in values:
|
|
text = clean_text(value.get("text", ""))
|
|
flattened_results.append(
|
|
{
|
|
"id": result.get("id", None),
|
|
"url": url,
|
|
"element_name": element_name,
|
|
"xpath": value.get("xpath", ""),
|
|
"text": text,
|
|
"user": result.get("user", ""),
|
|
"time_created": result.get("time_created", ""),
|
|
}
|
|
)
|
|
|
|
# Create an Excel workbook and sheet
|
|
workbook = Workbook()
|
|
sheet = workbook.active
|
|
assert sheet
|
|
sheet.title = "Results"
|
|
|
|
# Write the header
|
|
headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]
|
|
sheet.append(headers)
|
|
|
|
# Write the rows
|
|
for row in flattened_results:
|
|
sheet.append(
|
|
[
|
|
row["id"],
|
|
row["url"],
|
|
row["element_name"],
|
|
row["xpath"],
|
|
row["text"],
|
|
row["user"],
|
|
row["time_created"],
|
|
]
|
|
)
|
|
|
|
# Save the workbook to a BytesIO buffer
|
|
excel_buffer = BytesIO()
|
|
workbook.save(excel_buffer)
|
|
_ = excel_buffer.seek(0)
|
|
|
|
# Create the response
|
|
response = StreamingResponse(
|
|
excel_buffer,
|
|
media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
)
|
|
response.headers["Content-Disposition"] = "attachment; filename=export.xlsx"
|
|
return response
|
|
|
|
except Exception as e:
|
|
LOG.error(f"Exception occurred: {e}")
|
|
traceback.print_exc()
|
|
return {"error": str(e)}
|
|
|
|
|
|
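# Delete the given jobs by id.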
@app.post("/api/delete-scrape-jobs")
|
|
async def delete(delete_scrape_jobs: DeleteScrapeJobs):
|
|
result = await delete_jobs(delete_scrape_jobs.ids)
|
|
return (
|
|
JSONResponse(content={"message": "Jobs successfully deleted."})
|
|
if result
|
|
else JSONResponse({"error": "Jobs not deleted."})
|
|
)
|
|
|
|
|
|
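# Stream logs from the "scraperr" container back to the client as server-sent events.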
@app.get("/api/logs")
|
|
async def get_own_logs():
|
|
container_id = "scraperr"
|
|
try:
|
|
container = client.containers.get(container_id)
|
|
log_stream = container.logs(stream=True, follow=True)
|
|
|
|
def log_generator():
|
|
try:
|
|
for log in log_stream:
|
|
yield f"data: {log.decode('utf-8')}\n\n"
|
|
except Exception as e:
|
|
yield f"data: {str(e)}\n\n"
|
|
|
|
return StreamingResponse(log_generator(), media_type="text/event-stream")
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
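# Statistics: average number of scraped elements per link for the given user.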
@app.post("/api/statistics/get-average-element-per-link")
|
|
async def get_average_element_per_link(get_statistics: GetStatistics):
|
|
return await average_elements_per_link(get_statistics.user)
|
|
|
|
|
|
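# Statistics: number of jobs submitted per day for the given user.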
@app.post("/api/statistics/get-average-jobs-per-day")
|
|
async def average_jobs_per_day(get_statistics: GetStatistics):
|
|
data = await get_jobs_per_day(get_statistics.user)
|
|
return data
|