# STL
import uuid
import logging
from io import StringIO

# PDM
import pandas as pd
from fastapi import FastAPI
from fastapi.encoders import jsonable_encoder
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware

# LOCAL
from api.backend.job import query, insert
from api.backend.models import DownloadJob, SubmitScrapeJob, RetrieveScrapeJobs
from api.backend.scraping import scrape
from api.backend.auth.auth_router import auth_router

# Verbose console logging; handy in development, noisy in production.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
    handlers=[logging.StreamHandler()],
)

LOG = logging.getLogger(__name__)

app = FastAPI(title="api")
app.include_router(auth_router)

# Wide-open CORS so the frontend can reach the API from any origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serve the exported frontend (Next.js build output) from ./dist.
app.mount("/_next/static", StaticFiles(directory="./dist/_next/static"), name="static")


@app.get("/")
def read_root():
    return FileResponse("./dist/index.html")


@app.post("/api/submit-scrape-job")
|
|
async def submit_scrape_job(job: SubmitScrapeJob):
|
|
LOG.info(f"Recieved job: {job}")
|
|
try:
|
|
scraped = await scrape(job.url, job.elements)
|
|
|
|
LOG.info(
|
|
f"Scraped result for url: {job.url}, with elements: {job.elements}\n{scraped}"
|
|
)
|
|
|
|
json_scraped = jsonable_encoder(scraped)
|
|
job.result = json_scraped
|
|
job.id = uuid.uuid4().hex
|
|
|
|
if job.user:
|
|
await insert(jsonable_encoder(job))
|
|
|
|
return JSONResponse(content=json_scraped)
|
|
except Exception as e:
|
|
return JSONResponse(content={"error": str(e)}, status_code=500)
|
|
|
|
|
|
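# Illustrative call to the endpoint above (a sketch, not part of the app).
# The exact payload shape is defined by SubmitScrapeJob in api.backend.models;
# the fields below are inferred from how the handler uses them:
#
#   curl -X POST http://localhost:8000/api/submit-scrape-job \
#     -H "Content-Type: application/json" \
#     -d '{"url": "https://example.com", "elements": [...], "user": "demo"}'

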
@app.post("/api/retrieve-scrape-jobs")
|
|
async def retrieve_scrape_jobs(retrieve: RetrieveScrapeJobs):
|
|
LOG.info(f"Retrieving jobs for account: {retrieve.user}")
|
|
try:
|
|
results = await query({"user": retrieve.user})
|
|
return JSONResponse(content=results[::-1])
|
|
except Exception as e:
|
|
LOG.error(f"Exception occurred: {e}")
|
|
return JSONResponse(content={"error": str(e)}, status_code=500)
|
|
|
|
|
|
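# Example of the stored document shape the download endpoint below expects.
# This is inferred from the flattening loops in the handler, not from a
# schema, so treat it as illustrative:
#
#   {
#       "id": "...", "url": "https://example.com", "user": "demo",
#       "time_created": "...",
#       "result": {"title": [{"xpath": "//h1", "text": "Example Domain"}]},
#   }

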
@app.post("/api/download")
|
|
async def download(download_job: DownloadJob):
|
|
LOG.info(f"Downloading job with id: {download_job.id}")
|
|
try:
|
|
results = await query({"id": download_job.id})
|
|
|
|
flattened_results = []
|
|
for result in results:
|
|
for key, values in result["result"].items():
|
|
for value in values:
|
|
flattened_results.append(
|
|
{
|
|
"id": result["id"],
|
|
"url": result["url"],
|
|
"element_name": key,
|
|
"xpath": value["xpath"],
|
|
"text": value["text"],
|
|
"user": result["user"],
|
|
"time_created": result["time_created"],
|
|
}
|
|
)
|
|
|
|
df = pd.DataFrame(flattened_results)
|
|
|
|
csv_buffer = StringIO()
|
|
df.to_csv(csv_buffer, index=False)
|
|
_ = csv_buffer.seek(0)
|
|
response = StreamingResponse(csv_buffer, media_type="text/csv")
|
|
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
|
|
return response
|
|
|
|
except Exception as e:
|
|
LOG.error(f"Exception occurred: {e}")
|
|
return {"error": str(e)}
|
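
# The CSV above contains the columns taken from the dict keys in the
# flattening loop: id, url, element_name, xpath, text, user, time_created.

# Local entrypoint sketch: runs the API with uvicorn when this module is
# executed directly. Assumes uvicorn is installed; the project may instead be
# started through Docker or another entrypoint.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)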